**Genes function prediction. Model based on k-mers. Dataset - 15 000 sequences of amino acids.**

In [34]:
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import RepeatedKFold

In [35]:
f = open('doubled_proteins.txt', "r")
data = f.read()
f.close()

elements = re.findall("([A-Z]+)\t+(\d+)", data)

sequences = [s.lower() for s, n in elements]
classes = [n for s, n in elements]
classes = [int(i) for i in classes]

In [36]:
def getKmers(sequence, k=6):
    kmers = []
    for i in range(len(sequence) - k + 1):
        kmers.append(sequence[i:i+k])
    return kmers

In [37]:
kmers_list = [getKmers(sequence, k=6) for sequence in sequences]

In [38]:
kmers_strings = [" ".join(kmers) for kmers in kmers_list]

In [39]:
Y = np.array(classes, dtype=np.int64)

In [40]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(ngram_range=(4,4))
X = vectorizer.fit_transform(kmers_strings)

In [41]:
rkf = RepeatedKFold(n_splits=2, n_repeats=2, random_state=42)

for train_index, test_index in rkf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    Y_train, Y_test = Y[train_index], Y[test_index]

TRAIN: [    1     2     4 ... 15071 15076 15077] TEST: [    0     3     8 ... 15075 15078 15079]
TRAIN: [    0     3     8 ... 15075 15078 15079] TEST: [    1     2     4 ... 15071 15076 15077]
TRAIN: [    0     1     4 ... 15073 15074 15075] TEST: [    2     3     6 ... 15077 15078 15079]
TRAIN: [    2     3     6 ... 15077 15078 15079] TEST: [    0     1     4 ... 15073 15074 15075]


In [42]:
from sklearn.naive_bayes import MultinomialNB 
classifier = MultinomialNB(alpha=0.1)
classifier.fit(X_train, Y_train)

MultinomialNB(alpha=0.1)

In [43]:
Y_pred = classifier.predict(X_test)

In [44]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
print("Confusion matrix\n")
print(pd.crosstab(pd.Series(Y_test, name='Actual'), pd.Series(Y_pred, name='Predicted')))
def get_metrics(Y_test, y_predicted):
    accuracy = accuracy_score(Y_test, y_predicted)
    precision = precision_score(Y_test, y_predicted, average='weighted')
    recall = recall_score(Y_test, y_predicted, average='weighted')
    f1 = f1_score(Y_test, y_predicted, average='weighted')
    return accuracy, precision, recall, f1
accuracy, precision, recall, f1 = get_metrics(Y_test, Y_pred)
print("accuracy = %.3f \nprecision = %.3f \nrecall = %.3f \nf1 = %.3f" % (accuracy, precision, recall, f1))

Confusion matrix

Predicted     0     1     2    3    4     5    6
Actual                                          
0          1235     0     0    0    0     0    0
1             0  1000     0    0    0     0    0
2             0     0  1270    0    0     0    0
3             0     0     0  975    0     0    0
4             0     0     0    0  977     0    0
5             0     0     0    0    0  1118    0
6             0     0     0    0    0     0  965
accuracy = 1.000 
precision = 1.000 
recall = 1.000 
f1 = 1.000
