In [100]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import neighbors
from sklearn.metrics import accuracy_score, precision_score, f1_score, classification_report

In [101]:
# Name of the reference (training) database. It is used to build the path of
# the training CSV and the name of the prediction file.
db = 'urinary_max20_min5'
# Column of the training set that provides the reference sequences.
region = 'V3V4' # 'dna_seq' or 'V3V4'. Will only apply to reference sequences.

In [102]:
# Echo the active configuration (database name, then region), one per line.
print(db, region, sep='\n')

urinary_max20_min5
V3V4


In [None]:
# File paths
reference_database = "../datasets/train_sets/" + db + ".csv"
test_database = "../datasets/test_sets/test_set_from_refseq_v2.csv"

# The prediction file name depends on which reference region is used.
if region == 'dna_seq':
    output_file = "../preds/knn/knn_" + db + "_fullseq.csv"
elif region == 'V3V4':
    output_file = "../preds/knn/knn_" + db + "_V3V4.csv"
else:
    # Fail here rather than with a NameError on `output_file` once the
    # predictions have already been computed.
    raise ValueError("region must be 'dna_seq' or 'V3V4', got {!r}".format(region))

In [104]:
# Load data: the reference (training) table first, then the test table.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in (reference_database, test_database)
)

# Taxonomy columns handled by the notebook, in the order they are processed.
taxonomy_levels = [
    'domain', 'phylum', 'class', 'order',
    'family', 'genus', 'species',
]

In [105]:
#######################
# keep only if needed #
#######################

# Remove from the training set every sequence whose seq_id also appears in the
# test set. A single vectorised membership test replaces one full filtering
# pass per test id, and no longer shadows the builtin `id`.
# Missing test ids are dropped first so that they never match anything
# (comparing with `!=` against NaN also removed no rows).
test_seq_ids = test_data['seq_id'].dropna()
train_data = train_data.loc[~train_data['seq_id'].isin(test_seq_ids)]

In [106]:
# Replace missing labels in every taxonomy column of the training set with the
# placeholder 'Unknown', in a single multi-column assignment.
train_data[taxonomy_levels] = train_data[taxonomy_levels].fillna('Unknown')

# Sanitize DNA sequences: anything that is not A, T, G or C is discarded.
_NON_ATGC = re.compile(r'[^ATGC]')


def clean_sequence(sequence):
    """Return `sequence` upper-cased, keeping only the characters A, T, G and C.

    Every other character (ambiguity codes such as N, gaps, whitespace, ...)
    is removed rather than replaced, so the result may be shorter than the
    input and may be empty.
    """
    uppercased = sequence.upper()
    return _NON_ATGC.sub('', uppercased)

# Clean the sequence columns used downstream.
# Missing values are replaced by an empty string *before* the string cast:
# otherwise NaN becomes the text 'nan', which clean_sequence() reduces to the
# bogus one-base sequence 'A'.
train_data['dna_seq'] = train_data['dna_seq'].fillna('').astype(str).apply(clean_sequence)
train_data['V3V4'] = train_data['V3V4'].fillna('').astype(str).apply(clean_sequence)
test_data['dna_seq'] = test_data['dna_seq'].fillna('').astype(str).apply(clean_sequence)

In [107]:
def generate_kmers(sequence, k):
    """Return every overlapping window of length `k` of `sequence`, left to right.

    The result holds ``len(sequence) - k + 1`` slices; it is an empty list
    when the sequence is shorter than `k`.
    """
    window_count = len(sequence) - k + 1
    kmers = []
    for start in range(window_count):
        kmers.append(sequence[start:start + k])
    return kmers

def encode_sequences(sequences, k):
    """Fit a TF-IDF model on the k-mers of `sequences` and encode them.

    Each sequence is turned into a "document" made of its k-mers separated by
    single spaces, then a TfidfVectorizer with default settings is fitted on
    those documents.

    Returns a tuple ``(vectors, vectorizer)``: the TF-IDF matrix of the input
    sequences and the fitted vectorizer, which can be reused to encode other
    sequences with the same vocabulary.
    """
    documents = []
    for seq in sequences:
        documents.append(' '.join(generate_kmers(seq, k)))

    tfidf = TfidfVectorizer()
    encoded = tfidf.fit_transform(documents)
    return encoded, tfidf

def find_closest_taxonomy(train_data, test_data, level, k=7, threshold=0.8, n_neighbors=3):
    """Predict the label of one taxonomy level with a k-mer TF-IDF k-NN classifier.

    Reference sequences are read from the `train_data` column named by the
    notebook-level `region` variable; query sequences are always read from
    ``test_data["dna_seq"]``.

    Parameters
    ----------
    train_data : DataFrame
        Must contain the `region` column and the `level` column.
    test_data : DataFrame
        Must contain a ``dna_seq`` column.
    level : str
        Name of the `train_data` column holding the labels to learn.
    k : int
        Length of the k-mers used to encode the sequences.
    threshold : float
        Currently unused; kept so existing calls keep working.
    n_neighbors : int
        Number of neighbours used by the classifier.

    Returns
    -------
    The predicted labels for the rows of `test_data`, as returned by
    ``KNeighborsClassifier.predict``.
    """
    reference_sequences = train_data[region].tolist()
    reference_labels = train_data[level]
    query_sequences = test_data["dna_seq"].tolist()

    # The TF-IDF vocabulary is learned from the reference sequences only; the
    # query sequences are projected onto that same vocabulary.
    train_vectors, vectorizer = encode_sequences(reference_sequences, k)
    test_vectors = vectorizer.transform(
        [' '.join(generate_kmers(seq, k)) for seq in query_sequences]
    )

    # k-NN classification using cosine distance between TF-IDF vectors.
    knn = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, metric='cosine')
    knn.fit(train_vectors, reference_labels)

    return knn.predict(test_vectors)
    

In [108]:
# Run classification: one independent k-NN model per taxonomy level.
predictions_by_level = {}
for level in taxonomy_levels:
    level_pred = find_closest_taxonomy(train_data, test_data, level, k=7, threshold=0.8, n_neighbors=3)
    predictions_by_level[level] = level_pred

# Ground truth keeps the index of test_data; predictions get a fresh 0..n-1 index.
y_true = test_data[taxonomy_levels].copy()
y_pred = pd.DataFrame(predictions_by_level)

# Only the predictions are written to disk.
y_pred.to_csv(output_file, index=False)
print("Classification completed and saved.")

Classification completed and saved.


In [109]:
def print_scores(y_true, y_pred, levels_list):
    """Print accuracy, weighted precision and weighted F1 for each level.

    `y_true` and `y_pred` are indexed by level name; `levels_list` gives the
    levels to report, in order. The report starts with the notebook-level
    `db` and `region` settings. Precision and F1 use ``average='weighted'``
    and ``zero_division=np.nan``.
    """
    weighted = {'average': 'weighted', 'zero_division': np.nan}

    # Header: database name, region, then a spacer.
    print(db, region, '\n', sep='\n')

    for level in levels_list:
        # Level name underlined with dashes of the same length.
        print(level, '-' * len(level), sep='\n')
        expected = y_true[level]
        predicted = y_pred[level]
        print('   accuracy : ', accuracy_score(expected, predicted))
        print('   precision :', precision_score(expected, predicted, **weighted))
        print('   score f1 :', f1_score(expected, predicted, **weighted))
        print('\n')

# Report the metrics for every taxonomy level.
print_scores(y_true, y_pred, taxonomy_levels)

urinary_max20_min5
V3V4


domain
------
   accuracy :  1.0
   precision : 1.0
   score f1 : 1.0


phylum
------
   accuracy :  0.99644128113879
   precision : 0.9964702612562535
   score f1 : 0.994672368045351


class
-----
   accuracy :  0.9952550415183867
   precision : 0.9954714971616073
   score f1 : 0.993536075108728


order
-----
   accuracy :  0.9916963226571768
   precision : 0.9921638719424295
   score f1 : 0.9892754563935725


family
------
   accuracy :  0.9845788849347569
   precision : 0.9872390559949209
   score f1 : 0.979440457526879


genus
-----
   accuracy :  0.9252669039145908
   precision : 0.9517600414430923
   score f1 : 0.9109510308181384


species
-------
   accuracy :  0.7710557532621589
   precision : 0.8657456542545462
   score f1 : 0.742429206486146




In [None]:
score_file = '../scores/recap_scores.csv'

# One line is appended per level, for the last three entries of
# taxonomy_levels. Fields are separated by ', ' (comma followed by a space)
# and no header is written.
row_template = 'knn, {}, {}, {}, {}, {}, {}\n'
weighted_options = {'average': 'weighted', 'zero_division': np.nan}

with open(score_file, mode = 'a') as file:
    for level in taxonomy_levels[-3:]:
        print(level)
        level_truth, level_guess = y_true[level], y_pred[level]

        accuracy = accuracy_score(level_truth, level_guess)
        print(accuracy)
        precision = precision_score(level_truth, level_guess, **weighted_options)
        print(precision)
        f1 = f1_score(level_truth, level_guess, **weighted_options)
        print(f1)

        file.write(row_template.format(db, region, level, accuracy, precision, f1))


family
0.9845788849347569
0.9872390559949209
0.979440457526879
genus
0.9252669039145908
0.9517600414430923
0.9109510308181384
species
0.7710557532621589
0.8657456542545462
0.742429206486146
