In [10]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import numpy as np

def KMer(text, k):
    """Return the relative frequency of every overlapping k-mer in ``text``.

    A window of width ``k`` slides over ``text`` one position at a time.
    Each distinct substring is mapped to its occurrence count divided by the
    total number of windows. Keys appear in order of first occurrence.

    Args:
        text: Sequence to scan (sliced with ``text[i:i + k]``).
        k: Window width.

    Returns:
        dict mapping k-mer -> fraction of windows; empty when no window fits.
    """
    tallies = {}
    window_total = 0
    for start in range(len(text) - k + 1):
        piece = text[start:start + k]
        tallies[piece] = tallies.get(piece, 0) + 1
        window_total += 1

    # Normalise raw counts to frequencies; skipped entirely when nothing was counted.
    return {piece: hits / window_total for piece, hits in tallies.items()}

def codon(text, k):
    """Return relative frequencies of in-frame words made of ``k`` codons.

    Windows of ``3 * k`` characters are read starting at offsets 0, 3, 6, ...
    so every window begins on a codon boundary. Trailing characters that do
    not complete a full window are ignored. Each distinct window is mapped to
    its count divided by the number of windows read.

    Args:
        text: Sequence to scan.
        k: Number of consecutive codons per window.

    Returns:
        dict mapping window -> fraction of windows; empty when no window fits.
    """
    width = k * 3
    tallies = {}
    window_total = 0
    for start in range(0, len(text) - width + 1, 3):
        word = text[start:start + width]
        tallies[word] = tallies.get(word, 0) + 1
        window_total += 1

    # Convert counts to frequencies (no division happens when nothing was read).
    return {word: hits / window_total for word, hits in tallies.items()}

def read_fasta_file(file_path):
    """Parse a FASTA file into a list of records.

    A line starting with ``>`` opens a new record; the text after ``>`` is
    stored as the record id. All following non-header lines are stripped of
    surrounding whitespace and concatenated into the record's sequence.
    Blank lines are ignored wherever they occur, including before the first
    header.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        list of dicts ``{'id': str, 'sequence': str}`` in file order.

    Raises:
        ValueError: if non-blank sequence data appears before any header.
    """
    sequences = []
    current_id = None
    chunks = []  # sequence lines of the record being read; joined once per record
    with open(file_path, 'r') as file:
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no data; skipping them also avoids touching
                # a record that has not been opened yet.
                continue
            if line.startswith('>'):
                if current_id is not None:
                    sequences.append({'id': current_id, 'sequence': ''.join(chunks)})
                current_id = line[1:]
                chunks = []
            elif current_id is None:
                raise ValueError(
                    "sequence data found before the first '>' header in "
                    + repr(file_path)
                )
            else:
                chunks.append(line)
    if current_id is not None:
        sequences.append({'id': current_id, 'sequence': ''.join(chunks)})
    return sequences

# --- Load both gene families --------------------------------------------------
# Each entry is (FASTA path, class label): bHLH records get label 1, CYP records 0.
FAMILY_FILES = [
    ('Arabidopsis_thaliana_BHLH_gene_Family.fasta', 1),
    ('Arabidopsis_thaliana_CYP_gene_Family.fa', 0),
]

ids = []
sequences = []
y = []
for file_path, family_label in FAMILY_FILES:
    fasta = read_fasta_file(file_path)
    for record in fasta:
        ids.append(record['id'])
        sequences.append(record['sequence'])
        y.append(family_label)

# --- Codon-frequency features -------------------------------------------------
# One dict per sequence holding both di-codon (6-character) and single-codon
# (3-character) frequencies. The key lengths differ, so merging the two dicts
# cannot overwrite an entry.
kmer_result = []
for sequence in sequences:
    sequence_features = codon(sequence, 2)
    sequence_features.update(codon(sequence, 1))
    kmer_result.append(sequence_features)

# Vectorise once; keys absent from a sequence become 0.0 in the dense matrix.
v = DictVectorizer(sparse=False)
features = v.fit_transform(kmer_result)
feature_names = v.get_feature_names_out()

# Rows are labelled with the FASTA header; the CSV is written without the index.
x = pd.DataFrame(features, columns=feature_names, index=ids)
x.to_csv('kmer_features.csv', index=False)

# --- Encode labels and split --------------------------------------------------
# The feature matrix is entirely numeric, so only the targets are encoded.
# `label_encoder` stays fitted on y so predictions can be mapped back later.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# NOTE(review): X_test becomes a plain ndarray while X_train stays a DataFrame;
# scikit-learn may warn about missing feature names at predict time. Kept as-is
# so the variable's type is unchanged for any later cell — confirm before changing.
X_test = np.array(X_test)


In [12]:
# Inspect the training split: rows are indexed by FASTA header, columns are codon-window frequencies.
X_train

Unnamed: 0,AAA,AAAAAA,AAAAAC,AAAAAG,AAAAAT,AAAACA,AAAACC,AAAACG,AAAACT,AAAAGA,...,TTTTCG,TTTTCT,TTTTGA,TTTTGC,TTTTGG,TTTTGT,TTTTTA,TTTTTC,TTTTTG,TTTTTT
AT3G61950.1 | Symbols: | basic helix-loop-helix (bHLH) DNA-binding superfamily protein | chr3:22939546-22941451 FORWARD LENGTH=1906,0.053543,0.001577,0.001577,0.000000,0.001577,0.001577,0.003155,0.000000,0.000000,0.001577,...,0.001577,0.000000,0.000000,0.000000,0.000000,0.001577,0.000000,0.004732,0.003155,0.007886
"AT2G45560.1 | Symbols: CYP76C1 | cytochrome P450, family 76, subfamily C, polypeptide 1 | chr2:18776052-18778510 REVERSE LENGTH=2459",0.023199,0.000000,0.000000,0.000000,0.002445,0.000000,0.001222,0.002445,0.001222,0.000000,...,0.000000,0.001222,0.000000,0.001222,0.001222,0.001222,0.001222,0.000000,0.000000,0.000000
"AT4G37360.1 | Symbols: CYP81D2 | cytochrome P450, family 81, subfamily D, polypeptide 2 | chr4:17567124-17568858 REVERSE LENGTH=1735",0.008651,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.001733,0.001733,0.000000,0.001733,0.000000,0.000000,0.001733,0.003466
AT4G20970.1 | Symbols: | basic helix-loop-helix (bHLH) DNA-binding superfamily protein | chr4:11215259-11216212 FORWARD LENGTH=954,0.028302,0.000000,0.000000,0.003155,0.003155,0.000000,0.000000,0.006309,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.003155,0.000000,0.003155,0.000000,0.000000,0.000000,0.000000
"AT1G67110.1 | Symbols: CYP735A2 | cytochrome P450, family 735, subfamily A, polypeptide 2 | chr1:25061731-25065446 REVERSE LENGTH=3716",0.041195,0.001617,0.000808,0.000000,0.002425,0.000000,0.000000,0.000808,0.000000,0.000808,...,0.000808,0.001617,0.004042,0.000808,0.000808,0.000808,0.001617,0.000808,0.001617,0.003234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
AT3G06120.1 | Symbols: MUTE | basic helix-loop-helix (bHLH) DNA-binding superfamily protein | chr3:1846531-1848230 FORWARD LENGTH=1700,0.037102,0.000000,0.001770,0.001770,0.001770,0.001770,0.000000,0.000000,0.001770,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.003540,0.001770,0.000000,0.003540,0.000000
AT4G30980.1 | Symbols: LRL2 | LJRHL1-like 2 | chr4:15079243-15081657 REVERSE LENGTH=2415,0.047205,0.006219,0.000000,0.003731,0.000000,0.002488,0.001244,0.000000,0.000000,0.000000,...,0.001244,0.001244,0.000000,0.001244,0.001244,0.001244,0.001244,0.001244,0.001244,0.007463
"AT3G20090.1 | Symbols: CYP705A18 | cytochrome P450, family 705, subfamily A, polypeptide 18 | chr3:7015797-7018360 FORWARD LENGTH=2564",0.023419,0.000000,0.001172,0.000000,0.000000,0.000000,0.000000,0.000000,0.001172,0.002345,...,0.000000,0.001172,0.001172,0.001172,0.002345,0.001172,0.000000,0.002345,0.001172,0.002345
"AT5G44620.1 | Symbols: CYP706A3 | cytochrome P450, family 706, subfamily A, polypeptide 3 | chr5:17997779-17999558 REVERSE LENGTH=1780",0.032040,0.001689,0.000000,0.000000,0.001689,0.000000,0.000000,0.000000,0.000000,0.001689,...,0.000000,0.001689,0.001689,0.000000,0.000000,0.001689,0.000000,0.000000,0.000000,0.000000


In [8]:

# Fit a 5-nearest-neighbour classifier on the training split (fit returns the estimator).
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)

# Predict the held-out rows and map the encoded classes back to the original labels.
y_pred = knn.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)
print("Predicted labels:", y_pred_labels)

# Score the predictions against the held-out targets.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

for metric_caption, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Sensitivity (Recall):", sensitivity),
    ("F1 Score:", f1),
):
    print(metric_caption, metric_value)

Predicted labels: [0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0
 0 1 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 1 0 0 0 0 0 0
 1 0]
Accuracy: 0.7105263157894737
Precision: 0.8571428571428571
Sensitivity (Recall): 0.4864864864864865
F1 Score: 0.6206896551724138


