In [6]:
from sklearn.feature_extraction import DictVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
import numpy as np

def KMer(text, k):
    """Return the relative frequency of every length-``k`` substring of ``text``.

    A window of width ``k`` slides over ``text`` one position at a time.
    Each distinct substring is mapped to its number of occurrences divided
    by the total number of windows.  When no window fits (``text`` shorter
    than ``k``) an empty dict is returned.

    Args:
        text: Sequence to scan (sliced with ``text[i:i + k]``).
        k: Window width.

    Returns:
        dict mapping each substring to its frequency, in order of first
        appearance.
    """
    window_total = len(text) - k + 1
    tallies = {}
    for start in range(window_total):
        piece = text[start:start + k]
        tallies[piece] = tallies.get(piece, 0) + 1
    # With no windows ``tallies`` is empty, so the division is never reached.
    return {piece: hits / window_total for piece, hits in tallies.items()}

def codon(text, k):
    """Return relative frequencies of runs of ``k`` consecutive codons.

    ``text`` is read in a fixed frame starting at index 0: windows of
    ``3 * k`` characters are taken at offsets 0, 3, 6, ...  Trailing
    characters that cannot fill a whole window are ignored.  Each distinct
    window maps to its count divided by the number of windows read.

    Args:
        text: Sequence to scan.
        k: Number of 3-character codons per window.

    Returns:
        dict mapping each window to its frequency, in order of first
        appearance; empty when no full window fits.
    """
    width = k * 3
    chunks = [
        text[pos:pos + width]
        for pos in range(0, len(text) - width + 1, 3)
    ]
    tallies = {}
    for chunk in chunks:
        tallies[chunk] = tallies.get(chunk, 0) + 1
    # An empty ``chunks`` list means an empty ``tallies``: no division occurs.
    return {chunk: hits / len(chunks) for chunk, hits in tallies.items()}

def read_fasta_file(file_path):
    """Parse a FASTA file into a list of records.

    Every line starting with ``>`` opens a new record whose ``id`` is the
    rest of that line; the following lines (whitespace-stripped) are
    concatenated into the record's ``sequence``.  Blank lines are ignored
    anywhere in the file, including before the first header.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        list of ``{'id': str, 'sequence': str}`` dicts in file order.

    Raises:
        ValueError: If non-blank sequence text appears before the first
            ``>`` header (there is no record to attach it to).
    """
    records = []
    header = None  # id of the record being built; None until the first '>'
    chunks = []    # sequence lines of the record being built
    with open(file_path, 'r') as file:
        for line_number, line in enumerate(file, start=1):
            line = line.strip()
            if not line:
                # Blank lines carry no data; skipping them also avoids
                # touching a record that has not been opened yet.
                continue
            if line.startswith('>'):
                if header is not None:
                    records.append({'id': header, 'sequence': ''.join(chunks)})
                header = line[1:]
                chunks = []
            elif header is None:
                raise ValueError(
                    f"{file_path}: sequence data on line {line_number} "
                    "appears before any '>' header"
                )
            else:
                chunks.append(line)
    if header is not None:
        records.append({'id': header, 'sequence': ''.join(chunks)})
    return records

# FASTA inputs paired with the class label given to every sequence in the file
# (1 = BHLH gene family, 0 = CYP gene family).
FAMILY_FILES = (
    ('Arabidopsis_thaliana_BHLH_gene_Family.fasta', 1),
    ('Arabidopsis_thaliana_CYP_gene_Family.fa', 0),
)

# Parallel lists: ids[i], sequences[i] and y[i] all describe the same record.
ids = []
sequences = []
y = []
for file_path, label in FAMILY_FILES:
    fasta = read_fasta_file(file_path)
    # One pass per file keeps the three lists aligned and avoids rebinding
    # the builtin name ``id`` as a loop variable.
    for record in fasta:
        ids.append(record['id'])
        sequences.append(record['sequence'])
        y.append(label)

# Per-sequence feature dict: frequencies of codon pairs (6-character windows)
# merged with frequencies of single codons (3-character windows).  The two
# key sets cannot collide because their keys have different lengths.
kmer_result = [
    {**codon(sequence, 2), **codon(sequence, 1)}
    for sequence in sequences
]

# Turn the list of dicts into a dense matrix with one column per distinct
# window; a window absent from a sequence gets 0.  Vectorizing once is
# enough: refitting on the same input reproduces the same matrix.
v = DictVectorizer(sparse=False)
features = v.fit_transform(kmer_result)
feature_names = v.get_feature_names_out()

x = pd.DataFrame(features, columns=feature_names)

# The CSV is written before the index is relabelled (and with index=False),
# so it holds only the feature columns.
x.to_csv('kmer_features.csv', index=False)

# Label each row with the id of the sequence it was computed from.
x.index = ids

# Hold out 20% of the samples for evaluation.  The fixed seed makes the split
# reproducible.  The split is done exactly once: an earlier split whose result
# was immediately overwritten by an identical call has been dropped.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Encode class labels as integers.  The encoder is fitted on the training
# labels only and reused for the test labels and for decoding predictions.
# (Feature columns need no encoding: they are numeric frequencies.)
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(y_train)
y_test = label_encoder.transform(y_test)

# Standardize features using statistics from the training set only.  X_test
# is kept as a DataFrame, like X_train, so the scaler sees the same feature
# names at fit and transform time.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

svm_classifier = SVC(kernel='rbf', C=1)
svm_classifier.fit(X_train_scaled, y_train)
y_pred_svm = svm_classifier.predict(X_test_scaled)

# Map the encoded predictions back to the original label values for display.
y_pred_svm_labels = label_encoder.inverse_transform(y_pred_svm)

print("SVM Predicted labels:", y_pred_svm_labels)


accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

precision_svm = precision_score(y_test, y_pred_svm)
print("SVM Precision:", precision_svm)

sensitivity_svm = recall_score(y_test, y_pred_svm)
print("SVM Sensitivity (Recall):", sensitivity_svm)

f1_svm = f1_score(y_test, y_pred_svm)
print("SVM F1 Score:", f1_svm)


SVM Predicted labels: [0 0 0 1 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 0 0 0 1 1 1 1 0 0 1 0 0 1 1
 0 0 0 1 0 1 1 0 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 1 0 1 0 0 0 0 1 0
 1 0]
SVM Accuracy: 0.8421052631578947
SVM Precision: 0.9310344827586207
SVM Sensitivity (Recall): 0.7297297297297297
SVM F1 Score: 0.8181818181818181


