In [13]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier  
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score


In [14]:


def KMer(text, k):
    """Return the relative frequency of each overlapping k-mer in ``text``.

    A window of length ``k`` slides over ``text`` one character at a time.
    Every distinct substring seen is mapped to its number of occurrences
    divided by the total number of windows.

    Args:
        text: The string to scan.
        k: Window length in characters.

    Returns:
        dict mapping each substring to its fraction of all windows, in order
        of first appearance. Empty when no window fits (e.g. ``text`` is
        shorter than ``k``).
    """
    window_total = len(text) - k + 1
    tallies = {}
    for start in range(window_total):
        piece = text[start:start + k]
        tallies[piece] = tallies.get(piece, 0) + 1

    # Turn raw counts into fractions of the number of windows visited.
    # With no windows the dict is empty, so the division never runs.
    return {piece: hits / window_total for piece, hits in tallies.items()}


def codon(text, k):
    """Return relative frequencies of windows spanning ``k`` codons.

    The window is ``3 * k`` characters wide and advances three characters
    (one codon) at a time starting from index 0, so consecutive windows
    overlap when ``k > 1``. Trailing characters that cannot fill a whole
    window are not counted.

    Args:
        text: The string to scan.
        k: Number of consecutive 3-character codons per window.

    Returns:
        dict mapping each window string to its number of occurrences divided
        by the total number of windows, in order of first appearance. Empty
        when no window fits.
    """
    width = 3 * k
    starts = range(0, len(text) - width + 1, 3)
    window_total = len(starts)

    tallies = {}
    for start in starts:
        piece = text[start:start + width]
        tallies[piece] = tallies.get(piece, 0) + 1

    # Normalise counts to fractions; skipped entirely when nothing was seen.
    return {piece: hits / window_total for piece, hits in tallies.items()}

def read_fasta_file(file_path):
    """Parse a FASTA file into a list of ``{'id', 'sequence'}`` records.

    A line starting with ``>`` opens a new record whose ``id`` is the header
    text after the ``>``. The following lines, stripped of surrounding
    whitespace, are concatenated into that record's ``sequence``. Blank
    lines are ignored.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of dicts with keys ``'id'`` and ``'sequence'``, one per
        record, in file order. Empty if the file has no header lines.

    Raises:
        ValueError: If non-blank sequence data appears before the first
            header line.
    """
    sequences = []
    sequence_id = None  # None until the first '>' header has been seen
    chunks = []         # sequence lines of the record currently being read

    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no data; skipping them also keeps a
                # blank line ahead of the first header from crashing.
                continue
            if line.startswith('>'):
                if sequence_id is not None:
                    sequences.append(
                        {'id': sequence_id, 'sequence': ''.join(chunks)}
                    )
                sequence_id = line[1:]
                chunks = []
            elif sequence_id is None:
                raise ValueError(
                    f"{file_path}: line {line_number} contains sequence data "
                    "before any '>' header"
                )
            else:
                # Collect pieces and join once instead of repeated string +=.
                chunks.append(line)

    # Flush the final record, which has no following header to trigger it.
    if sequence_id is not None:
        sequences.append({'id': sequence_id, 'sequence': ''.join(chunks)})
    return sequences

# Each FASTA file holds one gene family; the integer is the class label
# assigned to every sequence read from it (1 = BHLH, 0 = CYP).
fasta_sources = [
    ('Arabidopsis_thaliana_BHLH_gene_Family.fasta', 1),
    ('Arabidopsis_thaliana_CYP_gene_Family.fa', 0),
]

# Parallel lists: ids[i], sequences[i] and y[i] all describe the same record.
ids = []
sequences = []
y = []
for file_path, label in fasta_sources:
    fasta = read_fasta_file(file_path)
    # `record` rather than `id`, so the builtin id() is not shadowed for the
    # rest of the notebook session.
    for record in fasta:
        ids.append(record['id'])
        sequences.append(record['sequence'])
        y.append(label)

# One feature dict per sequence: di-codon (6-character) window frequencies
# merged with single-codon (3-character) frequencies. The two key sets have
# different lengths, so update() never overwrites an entry.
kmer_result = []
for sequence in sequences:
    codon_features = codon(sequence, 2)
    codon_features.update(codon(sequence, 1))
    kmer_result.append(codon_features)

# Turn the list of dicts into a dense matrix with one column per distinct key.
v = DictVectorizer(sparse=False)
features = v.fit_transform(kmer_result)
feature_names = v.get_feature_names_out()

# Rows are labelled with the FASTA record ids. The CSV is written with
# index=False, so it contains only the feature columns (no ids, no labels).
x = pd.DataFrame(features, columns=feature_names, index=ids)
x.to_csv('kmer_features.csv', index=False)


In [10]:
# Integer-encode any string-valued feature columns. Each column gets its own
# encoder so the target encoder below is never fitted on feature data.
for col in x.columns:
    if x[col].dtype == 'object':
        x[col] = LabelEncoder().fit_transform(x[col])

# Dedicated encoder for the class labels; it is kept so predictions can be
# mapped back to the original label values after the model runs.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

# A single split with a fixed seed keeps the train/test partition
# reproducible from run to run.
# NOTE(review): the split is not stratified; if the two classes are
# imbalanced, consider passing stratify=y — confirm before changing, since
# it alters the reported scores.
X_train, X_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)

y_pred = rfc.predict(X_test)
y_pred_labels = label_encoder.inverse_transform(y_pred)

print("Predicted labels:", y_pred_labels)


Predicted labels: [0 0 0 1 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 1 0 0 1 0 1 1 0 0 1 1 0 0 0 0 0 0
 1 0]


In [11]:
# Score the held-out predictions against the true test labels.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
sensitivity = recall_score(y_test, y_pred)  # recall, reported as sensitivity
f1 = f1_score(y_test, y_pred)

# Print one "caption value" line per metric, in a fixed order.
for caption, score in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Sensitivity (Recall):", sensitivity),
    ("F1 Score:", f1),
):
    print(caption, score)

Accuracy: 0.75
Precision: 0.95
Sensitivity (Recall): 0.5135135135135135
F1 Score: 0.6666666666666667
