**Gene function prediction. Model based on k-mers. Dataset: 15,000 DNA sequences.**

In [34]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RepeatedKFold

In [4]:
# Read the whole labelled-sequence file. The context manager closes the handle
# even if the read raises.
with open('doubled_dna.txt', "r") as f:
    data = f.read()

# Each record is a run of uppercase letters, one or more tabs, then an integer
# class label. The pattern is a raw string so that \t and \d are interpreted by
# the regex engine rather than relying on Python passing an invalid "\d" escape
# through unchanged (which triggers a warning on recent Python versions).
elements = re.findall(r"([A-Z]+)\t+(\d+)", data)

# Lower-case the sequences and convert the labels to int in a single pass each.
sequences = [s.lower() for s, _ in elements]
classes = [int(n) for _, n in elements]

In [18]:
def getKmers(sequence, k=6):
    """Return every overlapping substring of length ``k`` in ``sequence``.

    Windows are taken left to right with a stride of one character. If
    ``sequence`` is shorter than ``k`` the result is an empty list.
    """
    window_count = len(sequence) - k + 1
    return [sequence[start:start + k] for start in range(window_count)]

In [19]:
# One list of overlapping 6-mers per input sequence, in the same order as `sequences`.
kmers_list = [getKmers(dna, k=6) for dna in sequences]

In [20]:
# Join each sequence's k-mers into one space-separated string ("sentence" of k-mer "words").
kmers_strings = list(map(" ".join, kmers_list))

In [23]:
# Class labels as a 64-bit integer array, so they can be indexed with the
# train/test index arrays produced by the splitter.
Y = np.asarray(classes, dtype=np.int64)

In [22]:
# CountVectorizer is imported in the notebook's first cell; no second import is
# needed here.
# ngram_range=(4, 4) keeps only n-grams made of four consecutive tokens, so each
# feature counts a run of four adjacent k-mers from the space-separated strings.
vectorizer = CountVectorizer(ngram_range=(4,4))
X = vectorizer.fit_transform(kmers_strings)

In [27]:
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)

# Materialise every (train_index, test_index) pair once so that all four splits
# remain available for evaluation instead of being lost as the loop advances.
cv_splits = list(rkf.split(X))

for train_index, test_index in cv_splits:
    print("TRAIN:", train_index, "TEST:", test_index)

# Only the final split is carried forward to the cells that train and score the
# model. Slicing once here (rather than on every pass of the loop) makes that
# explicit and avoids building three sets of matrices that were immediately
# overwritten.
X_train, X_test = X[train_index], X[test_index]
Y_train, Y_test = Y[train_index], Y[test_index]

# NOTE(review): the scores below therefore describe a single 50/50 split, not an
# average over the repeated folds; iterate over `cv_splits` to cross-validate.
# NOTE(review): the input file is named 'doubled_dna.txt' - if it contains
# duplicated records, a random split can place copies of the same sequence in
# both train and test and inflate the scores. TODO confirm; if so, split by
# group (e.g. GroupKFold keyed on the sequence).

TRAIN: [    1     2     4 ... 13869 13874 13875] TEST: [    0     3     8 ... 13873 13876 13877]
TRAIN: [    0     3     8 ... 13873 13876 13877] TEST: [    1     2     4 ... 13869 13874 13875]
TRAIN: [    0     1     4 ... 13874 13875 13876] TEST: [    2     3     6 ... 13866 13869 13877]
TRAIN: [    2     3     6 ... 13866 13869 13877] TEST: [    0     1     4 ... 13874 13875 13876]


In [29]:
from sklearn.naive_bayes import MultinomialNB 
# Multinomial naive Bayes with smoothing parameter alpha=0.1, fitted on the
# training split. The fit call is the cell's last expression, so the notebook
# echoes the fitted estimator as the cell output.
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, Y_train)

MultinomialNB(alpha=0.1)

In [30]:
# Predict a class label for every row of the held-out split.
Y_pred = classifier.predict(X_test)

In [33]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
# Cross-tabulate true labels (rows) against predicted labels (columns).
print("Confusion matrix\n")
print(
    pd.crosstab(
        index=pd.Series(Y_test, name='Actual'),
        columns=pd.Series(Y_pred, name='Predicted'),
    )
)
def get_metrics(Y_test, y_predicted):
    """Score predictions against the true labels.

    Returns a 4-tuple ``(accuracy, precision, recall, f1)``. Precision, recall
    and F1 are computed with ``average='weighted'``.
    """
    weighted = {"average": "weighted"}
    return (
        accuracy_score(Y_test, y_predicted),
        precision_score(Y_test, y_predicted, **weighted),
        recall_score(Y_test, y_predicted, **weighted),
        f1_score(Y_test, y_predicted, **weighted),
    )
# Score the held-out predictions and print each metric to three decimals.
accuracy, precision, recall, f1 = get_metrics(Y_test, Y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

Confusion matrix

Predicted     0    1     2    3    4    5    6
Actual                                        
0          1206    0     0    0    0    0    0
1             0  919     0    0    0    0    0
2             0    0  1164    0    0    0    0
3             0    0     0  920    0    0    0
4             0    0     0    0  836    0    0
5             0    0     0    0    0  997    0
6             0    0     0    0    0    0  897
accuracy = 1.000 
precision = 1.000 
recall = 1.000 
f1 = 1.000
