In [113]:
import pandas as pd
from rapidfuzz import process, distance
from sklearn.metrics import accuracy_score, f1_score, precision_score
#import pylcs

In [114]:
db = 'urinary_max20_min5'
region = 'V3V4' # 'dna_seq' or 'V3V4'. Will only apply to reference sequences.

In [115]:
print(db)
print(region)

urinary_max20_min5
V3V4


In [None]:
ref_file = '../datasets/train_sets/' + db + '.csv'
test_file = '../datasets/test_sets/test_set_from_refseq_v2.csv'
output_file = '../preds/rapidfuzz/rapidfuzz_' + db + '_' + region + '.csv'


In [117]:
# Import datasets
urinary_known_data = pd.read_csv(ref_file)
test_data = pd.read_csv(test_file)

display(urinary_known_data.head(3))
display(test_data.head(3))

Unnamed: 0,txid,seq_id,dna_seq,domain,phylum,class,order,family,genus,species,V3V4
0,33007,>PQ788148.1 Winkia neuii strain som 201 16S ri...,CGAACGCTGGCGGCGTGCTTAACACATGCAAGTCGAACGGGATCCA...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Winkia,Winkia neuii,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGACGGAAGTC...
1,33007,>OR999579.1 Winkia neuii strain CNSY1 16S ribo...,GGCCTGCGGCGTGCTTACCATGCAAGTCGAACGGGATCCATTAGCG...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Winkia,Winkia neuii,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGACGAAAGTC...
2,33007,>OR260435.1 Winkia neuii strain 19 16S ribosom...,AACGGGTGAGTAACACGTGAGTAACCTGCCCTTTTCTTTGGGATAA...,Bacteria,Actinomycetota,Actinomycetes,Actinomycetales,Actinomycetaceae,Winkia,Winkia neuii,CCTACGGGAGGCAGCAGTGGGGGATATTGCACAATGGACGNAAGTC...


Unnamed: 0.1,Unnamed: 0,txid,seq_id,domain,phylum,class,order,family,genus,species,dna_seq
0,3365,24,>OK036813.1 Shewanella putrefaciens strain NMC...,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella putrefaciens,CCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGGGAAACCC...
1,1874,853,>MW398078.1 Faecalibacterium prausnitzii strai...,Bacteria,Bacillota,Clostridia,Eubacteriales,Oscillospiraceae,Faecalibacterium,Faecalibacterium prausnitzii,CCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGGGAAACCC...
2,811,98671,>EU086786.1 Arthrobacter albus strain 1366 16S...,Bacteria,Actinomycetota,Actinomycetes,Micrococcales,Micrococcaceae,Pseudoglutamicibacter,Pseudoglutamicibacter albus,CCTACGGGAGGCAGCAGTGGGGAATATTGCACAATGGGCGCAAGCC...


In [118]:
# Functions

#  get best matching reference sequence for each test sequence, returns df
def compare_seq(ref_data, ref_seq_column, unknown_data):
    known_data = ref_data.drop_duplicates(subset = ref_seq_column)
    choices = known_data[ref_seq_column]   

    # init results df
    results = pd.DataFrame(columns = ['seq_id', 'test_species', 'known_species', 'sim_score', 'test_seq', 'known_seq', 'match'])

    for i in range(unknown_data.shape[0]):
        (best_seq, score, index) = process.extractOne(unknown_data['dna_seq'][i], choices, scorer=distance.Indel.normalized_distance)
        
        # get values for results df
        unknown_id = unknown_data.loc[i, 'seq_id']          # test seq_id
        test_seq = unknown_data.loc[i, 'dna_seq']           # test dna sequence
        test_species = unknown_data.loc[i, 'species']       # real test species
        known_seq = best_seq                                # nearest reference sequence
        known_species = known_data.loc[index, 'species']    # nearest reference species
        sim_score = 1-score                                 # sim score
        match = (1 if test_species == known_species else 0) # real/ref species match or not

        # add values in results df
        results.loc[i] = [unknown_id, test_species, known_species, sim_score, test_seq, known_seq, match]

    accuracy = accuracy_score(results['test_species'], results['known_species'])
    precision = precision_score(results['test_species'], results['known_species'], average = 'weighted')
    f1 = f1_score(results['test_species'], results['known_species'], average = 'weighted')
    # print('Accuracy:', accuracy)
    # print('Precision:', precision)
    # print('f1_score:', f1)

    return(results, accuracy, precision, f1)


# get matches from results df, print matches rate, returns df
def extract_matches(results):
    matches = results.loc[results['match'] == 1]
    print('\n\nMatches')
    print(7*('-'), '\n')
    print("Taux d'espèces correctement identifiées : ", matches.shape[0]/results.shape[0]*100, '%%\n\n')
    #print(matches.info())
    return(matches.reset_index(drop = True))

# get no matches from results df, print no matches rate, returns df
def extract_not_matches(results):
    not_matches = results.loc[results['match'] == 0]
    print('\n\nNon-matches\n')
    print(11*('-'), '\n')
    print("Taux d'espèces mal identifiées : ", not_matches.shape[0]/results.shape[0]*100, '%%\n\n')
    #print(not_matches.info())
    return(not_matches.reset_index(drop = True))

# color similarities between test/ref sequences
def color_seq(df):
    for i in range(df.shape[0]):
        test_seq = df.loc[i, 'test_seq']
        real_species = df.loc[i, 'test_species']
        pred_species = df.loc[i, 'known_species']
        known_seq = df.loc[i, 'known_seq']
        score = df.loc[i,'sim_score']
#        match_list = pylcs.lcs_sequence_idx(test_seq, known_seq)
#        colored_s1, colored_s2 = pylcs.coloring_match_sequence(match_list, test_seq, known_seq, "#000066", "#000066", "#00cc66", "#00cc66", t=1)
        print("\nSéquence à identifier : ", real_species, "\n", "\nMeilleure séquence retournée : ", pred_species, "(sim_score = ", score, ")\n\n\n")


# main fuction
def full_process(known_seq, known_seq_column, test_data):
    results, accuracy, precision, f1 = compare_seq(known_seq, known_seq_column, test_data)
    
    # similarity scores
    print('\n\nScores de similarité :')
    print(22*('-'), '\n')
    print(results['sim_score'].describe())
    

    # get matches and visualize lowest scores
    matches = extract_matches(results)
    matches = matches.sort_values('sim_score', ascending = True).reset_index(drop = True).head(5)
    print('\n\nSéquences bien étiquetées avec le moins bon score')
    print(49*('-'), '\n')
    color_seq(matches)

    
    # get no-matches and visualize best scores
    not_matches = extract_not_matches(results)
    not_matches_good_score = not_matches.loc[not_matches['sim_score'] > 0.9]
    not_matches_good_score = not_matches_good_score.sort_values('sim_score', ascending = False).reset_index(drop = True).head(5)
    #display(not_matches_good_score)
    print('\n\nSéquences mal étiquetées avec le meilleur score')
    print(47*('-'), '\n')
    color_seq(not_matches_good_score)


    return(results, accuracy, precision, f1)



In [119]:
# remove in ref_data sequences from test_set
for id in test_data['seq_id']:
    urinary_known_data = urinary_known_data.loc[(urinary_known_data['seq_id'] != id)]

In [120]:
results, accuracy, precision, f1 = full_process(urinary_known_data, region, test_data)




Scores de similarité :
---------------------- 

count    843.000000
mean       0.994785
std        0.017018
min        0.836013
25%        0.997854
50%        1.000000
75%        1.000000
max        1.000000
Name: sim_score, dtype: float64


Matches
------- 

Taux d'espèces correctement identifiées :  74.97034400948992 %%




Séquences bien étiquetées avec le moins bon score
------------------------------------------------- 


Séquence à identifier :  Xylanibacter ruminicola 
 
Meilleure séquence retournée :  Xylanibacter ruminicola (sim_score =  0.9295774647887324 )




Séquence à identifier :  Eikenella corrodens 
 
Meilleure séquence retournée :  Eikenella corrodens (sim_score =  0.9656652360515021 )




Séquence à identifier :  Escherichia coli 
 
Meilleure séquence retournée :  Escherichia coli (sim_score =  0.9710610932475884 )




Séquence à identifier :  Shewanella putrefaciens 
 
Meilleure séquence retournée :  Shewanella putrefaciens (sim_score =  0.9721030042918455 )




S

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [121]:
results[['seq_id', 'test_species', 'known_species', 'sim_score']].to_csv(output_file, index = False)

In [None]:
score_file = '../scores/recap_scores.csv'

with open(score_file, mode = 'a') as file:
    file.write('rapidfuzz, {}, {}, species, {}, {}, {}\n'.format(db, region, accuracy, precision, f1))


# Classification des données provenant des échantillons

In [123]:
# samples_data = pd.read_csv('/home/marthe/Documents/DS/projet/local/pre_processed_data/thresh_17/dna_sequences_17.csv')

# # reference data to use
# known_data = urinary_known_data
# choices = known_data['V3V4']   

# # get nearest sequences and sim scores
# for i in range(samples_data.shape[0]):
#         (best_seq, score, index) = process.extractOne(samples_data['dna_seq'][i], choices, scorer=distance.Indel.normalized_distance)
#         samples_data.loc[i, 'pred_species'] = known_data.loc[index, 'species']
#         samples_data.loc[i, 'sim_score'] = 1-score

# # keep only best scores
# samples_data.loc[samples_data['sim_score'] > 0.9]

# # results info
# print(samples_data['sim_score'].describe())
# display(samples_data)

# samples_data.to_csv('/home/marthe/Documents/DS/projet/local/new_classifications/rapid_fuzz/fuzz_samples_data_classification.csv', index = False)
