In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio import AlignIO
from Bio.motifs import Motif
from Bio.Align import AlignInfo
from Bio.SeqRecord import SeqRecord
import os
from Bio import motifs
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    roc_auc_score, roc_curve, accuracy_score, confusion_matrix, matthews_corrcoef,
    classification_report
)
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [None]:
# Both FASTA files live in the sibling ``Data`` directory, one level up.
DATA_DIR = os.path.join('..', 'Data')
train_data_path = os.path.join(DATA_DIR, 'Train_valid.fasta')
test_data_path = os.path.join(DATA_DIR, 'test_cleaned.fasta')

In [None]:
def _label_from_record(record):
    """Return the integer found after the last '_' in the record's description."""
    return int(record.description.split('_')[-1])


# One integer label per FASTA record, in file order.
y_train = list(map(_label_from_record, SeqIO.parse(train_data_path, 'fasta')))
y_test = list(map(_label_from_record, SeqIO.parse(test_data_path, 'fasta')))

In [None]:
# Sequence of every FASTA record as a plain string, in file order; the train
# file is read first, then the test file.
X_train, X_test = (
    [str(entry.seq) for entry in SeqIO.parse(fasta_path, 'fasta')]
    for fasta_path in (train_data_path, test_data_path)
)

## Count Vectorizer

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize CountVectorizer for character-level features
count_vectorizer = CountVectorizer(analyzer='char')

# Learn the character vocabulary from the training sequences only, then reuse
# that same vocabulary for the test sequences. Calling fit_transform on the
# test set would refit the vectorizer, so the test columns could differ in
# number or order from the columns the models were trained on.
X_train_count = count_vectorizer.fit_transform(X_train).toarray()
X_test_count = count_vectorizer.transform(X_test).toarray()


## TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE

# Initialize TfidfVectorizer for character-level features
tfidf_vectorizer = TfidfVectorizer(analyzer='char')

# Learn the vocabulary and IDF weights from the training sequences only and
# apply them unchanged to the test sequences. Refitting on the test set would
# leak test statistics into the features and could produce columns that do not
# line up with the training matrix.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Resample the training split only; the test split is left untouched.
# NOTE: this rebinds y_train to the resampled labels, while X_train_tfidf is
# rebuilt from X_train above. Re-running this cell on its own therefore pairs
# fresh features with already-resampled labels; re-run the label cell first.
smote = SMOTE(random_state=42)
X_train_tfidf, y_train = smote.fit_resample(X_train_tfidf, y_train)

# Train and Evaluate Classifiers

In [None]:
# Function to evaluate model and print metrics
def evaluate_model(model, X_test, y_test):
    """Score a fitted classifier, print its metrics and draw its ROC curve.

    Parameters
    ----------
    model : fitted estimator
        Must provide ``predict`` and either ``predict_proba`` or
        ``decision_function``.
    X_test : array-like
        Features handed to the model for prediction.
    y_test : array-like
        True labels compared against the predictions.

    Returns
    -------
    dict
        Keys ``sensitivity``, ``specificity``, ``accuracy``, ``mcc`` and
        ``auc_roc``.
    """
    predicted_labels = model.predict(X_test)

    # Continuous scores for the ROC analysis: the second probability column
    # when the model exposes probabilities, otherwise its decision function.
    if hasattr(model, "predict_proba"):
        scores = model.predict_proba(X_test)[:, 1]
    else:
        scores = model.decision_function(X_test)

    # Unpacking four values requires a 2x2 confusion matrix.
    tn, fp, fn, tp = confusion_matrix(y_test, predicted_labels).ravel()
    sensitivity_value = tp / (tp + fn)
    specificity_value = tn / (tn + fp)
    mcc_value = matthews_corrcoef(y_test, predicted_labels)
    results = {
        "sensitivity": sensitivity_value,
        "specificity": specificity_value,
        "accuracy": accuracy_score(y_test, predicted_labels),
        "mcc": mcc_value,
        "auc_roc": roc_auc_score(y_test, scores),
    }

    # Report each metric under its display label, in a fixed order.
    report_layout = (
        ("Sensitivity", "sensitivity"),
        ("Specificity", "specificity"),
        ("Accuracy", "accuracy"),
        ("MCC", "mcc"),
        ("AUC-ROC", "auc_roc"),
    )
    for display_label, metric_key in report_layout:
        print(f"{display_label}: {results[metric_key]}")

    # ROC curve with the chance diagonal for reference.
    false_pos_rate, true_pos_rate, _ = roc_curve(y_test, scores)
    area = results["auc_roc"]
    plt.figure()
    plt.plot(false_pos_rate, true_pos_rate, label=f'AUC-ROC = {area:.2f}')
    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.legend(loc='lower right')
    plt.show()

    return results

In [None]:
# List of models to evaluate
# Candidate classifiers keyed by the name printed during evaluation.
# The tree-based models are seeded (same seed as the SMOTE step) so that their
# metrics are reproducible across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "K Neighbors Classifier": KNeighborsClassifier(n_neighbors=15),
    "Decision Tree": DecisionTreeClassifier(random_state=42),
    "Random Forest": RandomForestClassifier(random_state=42),
}

In [None]:
from sklearn.preprocessing import StandardScaler

def standardize(data, scaler=None, fit=True):
    """Standardize the columns of ``data`` and return them as a DataFrame.

    Parameters
    ----------
    data : array-like or DataFrame
        Values to scale; converted with ``pd.DataFrame(data)``.
    scaler : object, optional
        Scaler exposing ``fit_transform`` / ``transform``. When omitted a new
        ``StandardScaler`` is created, which reproduces the original
        single-argument behaviour.
    fit : bool, default True
        If True the scaler is fitted on ``data`` before transforming. Pass
        ``fit=False`` together with an already-fitted ``scaler`` to apply the
        training-set statistics to validation or test data.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels as ``pd.DataFrame(data)``.

    Raises
    ------
    ValueError
        If ``fit`` is False and no ``scaler`` was supplied.
    """
    frame = pd.DataFrame(data)
    if scaler is None:
        if not fit:
            raise ValueError("fit=False requires an already-fitted scaler")
        scaler = StandardScaler()

    # Fitting is opt-out so one scaler can be fitted on train and reused on test.
    scaled_values = scaler.fit_transform(frame) if fit else scaler.transform(frame)
    return pd.DataFrame(scaled_values, columns=frame.columns)

## Using TF-IDF as features

In [None]:
# Fit the scaler on the training features only and reuse its mean/std for the
# test features. Standardizing each split with its own statistics would place
# train and test on different scales and let test-set statistics shape the
# test features.
tfidf_scaler = StandardScaler().fit(X_train_tfidf)
train_x_scaled = pd.DataFrame(tfidf_scaler.transform(X_train_tfidf))
test_x_scaled = pd.DataFrame(tfidf_scaler.transform(X_test_tfidf))

# NOTE(review): the disabled sweep below scores every k on the test set. If
# k = 37 was picked from it, the test metrics reported here are optimistically
# biased -- confirm, and prefer a validation split or cross-validation.
# mcc_list = []

# for k in range(1, 267, 2):
#     knn = KNeighborsClassifier(n_neighbors=k)
#     knn_pred = knn.fit(train_x_scaled, y_train)
#     knn_pred = knn.predict(test_x_scaled)
#     mcc_list.append([k, matthews_corrcoef(y_test, knn_pred)])

# print(max(mcc_list, key=lambda x: x[1]))

k = 37
model = KNeighborsClassifier(n_neighbors=k)
model.fit(train_x_scaled, y_train)
metrics = evaluate_model(model, test_x_scaled, y_test)



In [None]:
#  Train and evaluate each model
# Keep every model's metrics, keyed by its display name, instead of letting
# each iteration overwrite the previous result.
model_metrics = {}
for model_name, model in models.items():
    print(f"Evaluating {model_name}")
    model.fit(X_train_tfidf, y_train)
    metrics = evaluate_model(model, X_test_tfidf, y_test)
    model_metrics[model_name] = metrics

# Side-by-side summary: one row per model, one column per metric.
pd.DataFrame.from_dict(model_metrics, orient="index")

In [None]:
# Scale with statistics learned from the training features only, then apply
# the same fitted scaler to the test features so both splits share one scale.
svm_scaler = StandardScaler().fit(X_train_tfidf)
train_x_scaled = pd.DataFrame(svm_scaler.transform(X_train_tfidf))
test_x_scaled = pd.DataFrame(svm_scaler.transform(X_test_tfidf))

# Disabled SVM experiment. As written it trains on the unscaled TF-IDF
# features rather than on the scaled frames computed above.
# SVM = SVC(kernel='rbf', probability=True, class_weight='balanced', random_state=42)
# SVM.fit(X_train_tfidf, y_train)
# evaluate_model(SVM, X_test_tfidf, y_test)