In [104]:
import re
import pandas as pd
from rapidfuzz import process, distance
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score


In [105]:
# Dataset name; it is inserted into the input and output file names.
db = "urinary_max20_min5"

# Column used for the reference sequences: 'dna_seq' or 'V3V4'.
# This setting only applies to the reference side of the comparison.
region = "V3V4"

In [None]:
# Input table for the selected dataset.
data_file = '../datasets/train_sets/' + db + '.csv'

# Prediction file; its suffix records which reference region was used.
if region == 'dna_seq':
    output_file = '../preds/rapidfuzz/fuzz_oneset_' + db + '_fullseq.csv'
elif region == 'V3V4':
    output_file = '../preds/rapidfuzz/fuzz_oneset_' + db + '_V3V4.csv'
else:
    # Fail here: otherwise `output_file` stays undefined (or keeps the value of a
    # previous run) and the problem only appears once the comparison has finished.
    raise ValueError(
        "Unsupported region {!r}: expected 'dna_seq' or 'V3V4'".format(region)
    )


In [107]:
# Load the sequence table of the selected dataset.
database = pd.read_csv(data_file)

# Taxonomic rank columns, listed from domain down to species.
taxonomy_levels = [
    'domain', 'phylum', 'class', 'order',
    'family', 'genus', 'species',
]

# One underscore-joined lineage string per row; missing ranks are written as "Unknown".
database['taxonomy'] = (
    database[taxonomy_levels]
    .fillna("Unknown")
    .agg('_'.join, axis=1)
)

In [108]:
# Sanitize DNA sequences
def clean_sequence(sequence):
    """Return `sequence` upper-cased with every character other than A, T, G, C removed."""
    # Collect the runs of valid bases and glue them back together.
    valid_runs = re.findall(r'[ATGC]+', sequence.upper())
    return ''.join(valid_runs)

# Replace missing sequences by '' before the string conversion: astype(str) turns
# NaN into 'nan', which clean_sequence would reduce to the spurious sequence 'A'.
database['dna_seq'] = database['dna_seq'].fillna('').astype(str).apply(clean_sequence)
database['V3V4'] = database['V3V4'].fillna('').astype(str).apply(clean_sequence)

In [109]:
# When matching on V3V4, keep a single row per distinct V3V4 sequence.
if region == 'V3V4':
    # drop=True discards the old index instead of storing it as a new column, so
    # re-running this cell does not pile up 'index'/'level_0' columns and
    # eventually fail with "cannot insert level_0, already exists".
    database = database.drop_duplicates('V3V4').reset_index(drop=True)

# 80 % reference / 20 % test split with a fixed seed.
ref_set, test_set = train_test_split(database, test_size = 0.2, random_state = 1234)

In [110]:
# Renumber the test rows 0..n-1. drop=True keeps the cell re-runnable: without it
# every run inserts the previous index as an extra column and a later run raises
# "cannot insert level_0, already exists".
test_set = test_set.reset_index(drop=True)


In [111]:
# Functions

#  get best matching reference sequence for each test sequence, returns df
def compare_seq(ref_set, region, test_set):
    """Assign each test sequence the species of its nearest reference sequence.

    The 'V3V4' sequence of every row of ``test_set`` is compared with all
    sequences in ``ref_set[region]`` using the normalized Indel distance, and
    the closest reference is kept. Accuracy, weighted precision and weighted
    F1 between true and assigned species are printed and returned.

    Parameters
    ----------
    ref_set : pandas.DataFrame
        Reference rows; must contain the ``region`` column and 'species'.
        Index labels are used to look up the matched row, so they should be unique.
    region : str
        Name of the reference column to match against.
    test_set : pandas.DataFrame
        Query rows with 'seq_id', 'dna_seq', 'species' and 'V3V4' columns.
        Any index is accepted; rows are processed in order.

    Returns
    -------
    tuple
        ``(results, accuracy, precision, f1)`` where ``results`` has one row per
        test sequence with columns 'seq_id', 'test_species', 'ref_species',
        'sim_score', 'test_seq', 'ref_seq' and 'match'.

    Raises
    ------
    ValueError
        If ``ref_set`` contains no rows.
    """
    choices = ref_set[region]
    if len(choices) == 0:
        raise ValueError('ref_set is empty: no reference sequence to compare against')

    result_columns = ['seq_id', 'test_species', 'ref_species', 'sim_score', 'test_seq', 'ref_seq', 'match']

    # Collect rows in a list and build the frame once; growing a DataFrame with
    # results.loc[i] = ... reallocates on every insertion.
    records = []

    # Iterate over column values rather than labels 0..n-1, so test_set does not
    # need a freshly reset index.
    test_rows = zip(test_set['seq_id'], test_set['dna_seq'], test_set['species'], test_set['V3V4'])
    for test_seq_id, test_seq, test_species, query in test_rows:

        # With a Series as choices, the third item is the index label of the best match.
        best_ref_seq, score, index = process.extractOne(
            query, choices, scorer=distance.Indel.normalized_distance
        )

        ref_species = ref_set.loc[index, 'species']     # nearest reference species
        sim_score = 1 - score                            # distance -> similarity
        match = int(test_species == ref_species)         # 1 when true and assigned species agree

        records.append([test_seq_id, test_species, ref_species, sim_score, test_seq, best_ref_seq, match])

    results = pd.DataFrame(records, columns=result_columns)

    accuracy = accuracy_score(results['test_species'], results['ref_species'])
    # zero_division=0 yields the same values as the default but without the
    # "ill-defined" warning for species that are never predicted.
    precision = precision_score(results['test_species'], results['ref_species'],
                                average = 'weighted', zero_division = 0)
    f1 = f1_score(results['test_species'], results['ref_species'],
                  average = 'weighted', zero_division = 0)
    print('Accuracy:', accuracy)
    print('Precision:', precision)
    print('f1_score:', f1)
    return(results, accuracy, precision, f1)




# main function
def full_process(ref_set, region, test_set):
    """Run compare_seq and print summary statistics of the similarity scores.

    Returns the same (results, accuracy, precision, f1) values produced by
    compare_seq for the given reference set, region and test set.
    """
    results, accuracy, precision, f1 = compare_seq(ref_set, region, test_set)

    # Header, a 22-dash rule, then the distribution of the similarity scores.
    print('\n\nScores de similarité :')
    print('-' * 22, '\n')
    score_summary = results['sim_score'].describe()
    print(score_summary)

    return results, accuracy, precision, f1



In [112]:
# Echo the run configuration so the printed output identifies the experiment.
print(db, region, sep='\n')

results, accuracy, precision, f1 = full_process(ref_set, region, test_set)

# Save only the identifier, true/assigned species and similarity score.
results.to_csv(
    output_file,
    columns=['seq_id', 'test_species', 'ref_species', 'sim_score'],
    index=False,
)

urinary_max20_min5
V3V4
Accuracy: 0.7585585585585586
Precision: 0.7660703560703561
f1_score: 0.7429909129909129


Scores de similarité :
---------------------- 

count    555.000000
mean       0.991478
std        0.021320
min        0.725857
25%        0.993576
50%        0.996791
75%        0.997854
max        0.998930
Name: sim_score, dtype: float64


  type_true = type_of_target(y_true, input_name="y_true")
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)
  type_true = type_of_target(y_true, input_name="y_true")
  ys_types = set(type_of_target(x) for x in ys)


In [None]:
# Append this run's metrics to the shared recap file.
score_file = '../scores/recap_scores.csv'

with open(score_file, mode='a') as file:
    # The slice keeps only the last taxonomy level, i.e. 'species'.
    for level in taxonomy_levels[-1:]:
        print(level, accuracy, precision, f1, sep='\n')

        file.write(
            'rapidfuzz oneset, {}, {}, species, {}, {}, {}\n'.format(
                db, region, accuracy, precision, f1
            )
        )


species
0.7585585585585586
0.7660703560703561
0.7429909129909129
