In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [13]:
# Load the tab-separated sequence table from disk and preview its first rows
# (the preview shows the 'gene', 'sequence' and 'class' columns).
Cov19_dna = pd.read_csv("Cov19test_dna.txt", sep="\t")
Cov19_dna.head(5)

Unnamed: 0,gene,sequence,class
0,MT276598.1_ORF3a,ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGA...,1
1,MT276597.1_ORF3a,ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGA...,1
2,MT276331.1_ORF3a,ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGA...,1
3,MT276330.1_ORF3a,ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGA...,1
4,MT276329.1_ORF3a,ATGGATTTGTTTATGAGAATCTTCACAATTGGAACTGTAACTTTGA...,1


In [14]:
# Convert a sequence of characters into overlapping k-mer words; default size = 6 (hexamers).
def Kmers_funct(seq, size=6):
    """Split ``seq`` into its overlapping, lower-cased k-mers.

    A window of ``size`` characters slides over ``seq`` one position at a
    time, so a sequence of length ``n`` yields ``n - size + 1`` words.

    Args:
        seq: The sequence to split (e.g. a DNA string).
        size: Length of each k-mer. Must be at least 1.

    Returns:
        A list of lower-cased substrings of length ``size``, ordered by start
        position. The list is empty when ``seq`` is shorter than ``size``.

    Raises:
        ValueError: If ``size`` is smaller than 1. Without this check a zero
            or negative size would silently produce empty or wrong-length
            fragments instead of k-mers.
    """
    if size < 1:
        raise ValueError("size must be a positive integer, got %r" % (size,))
    return [seq[start:start + size].lower() for start in range(len(seq) - size + 1)]

In [15]:
# Convert every sequence into its list of short overlapping k-mers (Kmers_funct's
# default size of 6). The function is applied to the 'sequence' column directly,
# which avoids building a full row object per record as a row-wise
# DataFrame.apply(axis=1) does.
Cov19_dna['words'] = Cov19_dna['sequence'].apply(Kmers_funct)
# The raw sequence is no longer needed once the k-mers exist.
# NOTE: this cell consumes the 'sequence' column, so re-running it without
# reloading the data raises KeyError.
Cov19_dna = Cov19_dna.drop(columns='sequence')


In [16]:
# Preview the frame after the k-mer conversion: 'sequence' has been dropped and
# 'words' holds each record's list of k-mers.
Cov19_dna.head()

Unnamed: 0,gene,class,words
0,MT276598.1_ORF3a,1,"[atggat, tggatt, ggattt, gatttg, atttgt, tttgt..."
1,MT276597.1_ORF3a,1,"[atggat, tggatt, ggattt, gatttg, atttgt, tttgt..."
2,MT276331.1_ORF3a,1,"[atggat, tggatt, ggattt, gatttg, atttgt, tttgt..."
3,MT276330.1_ORF3a,1,"[atggat, tggatt, ggattt, gatttg, atttgt, tttgt..."
4,MT276329.1_ORF3a,1,"[atggat, tggatt, ggattt, gatttg, atttgt, tttgt..."


In [18]:

# Join each record's k-mer list into a single space-separated "sentence" so it
# can be fed to a text vectorizer.
cov19_texts = [' '.join(kmers) for kmers in Cov19_dna['words']]
# Separate the labels. The column is selected by name rather than by position,
# so the result does not depend on the current column order of the frame.
y_cov19 = Cov19_dna['class'].values

In [19]:
# Inspect the label vector (NumPy abbreviates long arrays in the display).
y_cov19

array([1, 1, 1, ..., 3, 3, 3])

In [20]:
# Convert the k-mer sentences into a count matrix: one row per sequence and one
# column per n-gram in the fitted vocabulary.
from sklearn.feature_extraction.text import CountVectorizer
# ngram_range=(4,4) makes every feature a run of four consecutive k-mer words.
# Per the original author, the n-gram size of 4 was determined by earlier testing
# (that testing is not shown in this notebook).
# NOTE(review): the vocabulary is fitted on all sequences, i.e. before the
# train/test split performed in a later cell.
cv = CountVectorizer(ngram_range=(4,4))
X = cv.fit_transform(cov19_texts)

In [21]:
# Report the count-matrix dimensions: (number of sequences, number of n-gram features).
print(X.shape)

(1125, 2102)


In [22]:
from sklearn.model_selection import train_test_split
# Hold out part of the data for evaluation. random_state=3 fixes the shuffle so
# the same split is produced on every run.
# No test_size is passed, so scikit-learn's default split proportion applies.
# NOTE(review): stratify is not set, so class proportions are not guaranteed to
# match between the training and test parts.
X_train, X_test, y_train, y_test = train_test_split(X, y_cov19, random_state=3) 

In [23]:
### Multinomial Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
# alpha is the additive smoothing parameter of MultinomialNB.
# NOTE(review): how alpha=0.1 was chosen is not shown in this notebook -- confirm.
classifier = MultinomialNB(alpha=0.1)
# Fit on the training split only, then predict labels for the held-out features.
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

In [24]:
# Print the predicted class label for every sample in the test split.
print(y_pred)

[3 2 2 3 1 3 2 2 2 2 2 2 1 1 3 1 1 2 2 1 2 3 3 2 1 2 3 1 1 3 2 1 2 1 3 1 3
 2 2 2 1 1 3 3 2 3 2 2 3 1 3 3 2 1 1 1 3 2 1 3 1 1 2 1 1 1 3 2 3 3 2 1 2 2
 1 2 3 1 3 1 2 1 3 1 1 1 3 1 2 2 1 1 1 3 2 2 2 2 1 3 2 1 1 3 2 3 2 1 2 3 3
 3 1 3 2 1 1 1 3 3 1 3 2 3 1 1 2 1 3 2 2 2 1 2 3 3 2 3 2 2 1 1 3 3 2 1 2 3
 1 3 1 1 1 1 2 1 3 3 2 3 3 2 1 2 2 2 1 2 1 3 1 1 1 3 2 3 3 2 1 3 1 3 1 3 2
 3 3 2 1 1 1 3 1 2 3 2 3 1 1 3 3 2 1 1 3 2 3 2 3 2 3 3 1 2 2 3 1 2 1 3 3 3
 3 2 1 2 2 3 1 2 1 1 1 3 1 3 2 1 1 2 2 1 3 3 1 2 3 2 2 1 2 3 1 3 1 3 3 2 1
 2 3 2 2 1 3 2 2 1 2 1 3 2 3 1 3 1 1 2 1 2 2 3]


In [None]:
##confusion matrix