# Notebook to format DMS data for the MARDy 2.0 database

## Import libraries

In [1]:
import pandas as pd

## Specify paths and parameters

In [2]:
# Classified-variant tables to load, one per locus / pool-type combination.
# The parent folder name is split on '_' at import time to recover the locus
# (second field) and the pool type (third field).
list_df_paths = [
    'aggregated_data/classified_variants/BY4741_FKS1-HS1_single/aa_refined_classification.csv',
    'aggregated_data/classified_variants/BY4741_FKS1-HS1_double/aa_refined_classification.csv',
    'aggregated_data/classified_variants/BY4741_FKS1-HS2_single/aa_refined_classification.csv',
    'aggregated_data/classified_variants/BY4741_FKS1-HS2_double/aa_refined_classification.csv',
]

# Number added to the 0-based index within a hotspot to label a mutated position
# (i.e. the residue number given to the first hotspot position), per locus
pos_offset = {
    'FKS1-HS1': 639,
    'FKS1-HS2': 1353,
    'FKS2-HS1': 659,
    'FKS2-HS2': 1372,
}

# Wild-type amino acid sequence of each hotspot, per locus
# NOTE(review): the FKS2-HS1 sequence has 8 residues whereas FKS1-HS1 has 9 -
# confirm that this sequence and its offset (659) are consistent before loading FKS2 data.
wt_hotspots = {
    'FKS1-HS1': 'FLVLSLRDP',
    'FKS1-HS2': 'DWVRRYTL',
    'FKS2-HS1': 'LILSLRDP',
    'FKS2-HS2': 'DWVRRYTL',
}

## Import data

In [3]:
ldf = []

for f in list_df_paths:
    # The parent folder name is '_'-separated: the second field is the locus,
    # the third is the pool type
    folder_fields = f.split('/')[-2].split('_')
    locus, pool_type = folder_fields[1], folder_fields[2]

    # Load the table (first CSV column used as index) and tag every row with its locus
    fdf = pd.read_csv(f, index_col=0)
    fdf['locus'] = locus

    # For single-mutant pools, keep only rows whose seq_type matches the pool type
    # and whose Nham_aa equals 1
    if pool_type == 'single':
        is_single = (fdf.seq_type == pool_type) & (fdf.Nham_aa == 1)
        fdf = fdf[is_single]

    # Keep only variants with a confidence score below 3, measured in a drug condition
    is_confident = fdf.cscore < 3
    is_drug = fdf.compound.isin(['caspofungin','micafungin','anidulafungin'])
    fdf = fdf[is_confident & is_drug]

    ldf.append(fdf)

# Stack all pools into a single dataframe with a fresh 0..n-1 index
df = pd.concat(ldf, ignore_index=True)
df

Unnamed: 0,compound,seq_type,Nham_aa,aa_seq,s,cscore,refined_class,sensres,locus
0,anidulafungin,single,1.0,*LVLSLRDP,0.230635,1.0,WT-like,sensitive,FKS1-HS1
1,anidulafungin,single,1.0,ALVLSLRDP,1.756300,1.0,intermediary,resistant,FKS1-HS1
2,anidulafungin,single,1.0,DLVLSLRDP,0.047285,1.0,WT-like,sensitive,FKS1-HS1
3,anidulafungin,single,1.0,ELVLSLRDP,0.266907,1.0,WT-like,sensitive,FKS1-HS1
4,anidulafungin,single,1.0,F*VLSLRDP,0.165205,1.0,WT-like,sensitive,FKS1-HS1
...,...,...,...,...,...,...,...,...,...
20651,micafungin,,,YWVRVYTL,4.483994,2.0,WT-like,sensitive,FKS1-HS2
20652,micafungin,,,YWVRWYTL,11.479603,1.0,resistant,resistant,FKS1-HS2
20653,micafungin,,,YWVSRYTL,-3.195642,1.0,WT-like,sensitive,FKS1-HS2
20654,micafungin,,,YWVVRYTL,-1.842202,2.0,WT-like,sensitive,FKS1-HS2


## Get mutations

In [4]:
def get_mutations(aa, wt, offset):
    """Describe how a variant differs from the wild type.

    Parameters
    ----------
    aa : str
        Amino acid sequence of the variant.
    wt : str
        Wild-type amino acid sequence, compared position by position with `aa`.
    offset : int
        Number added to the 0-based position index to obtain the reported position.

    Returns
    -------
    str
        Comma-separated mutations, each written as
        <wild-type residue><position><variant residue> (e.g. 'F639A').
        Empty string when `aa` matches `wt` at every position of `aa`.
    """
    changes = []
    for idx, residue in enumerate(aa):
        wt_residue = wt[idx]
        if wt_residue != residue:
            changes.append(wt_residue + str(idx + offset) + residue)
    return ','.join(changes)

In [5]:
# Annotate every variant with its mutation(s) relative to the wild-type hotspot of its locus
df['mutation'] = df.apply(
    lambda row: get_mutations(
        aa=row.aa_seq,
        wt=wt_hotspots[row.locus],
        offset=pos_offset[row.locus],
    ),
    axis=1,
)
df

Unnamed: 0,compound,seq_type,Nham_aa,aa_seq,s,cscore,refined_class,sensres,locus,mutation
0,anidulafungin,single,1.0,*LVLSLRDP,0.230635,1.0,WT-like,sensitive,FKS1-HS1,F639*
1,anidulafungin,single,1.0,ALVLSLRDP,1.756300,1.0,intermediary,resistant,FKS1-HS1,F639A
2,anidulafungin,single,1.0,DLVLSLRDP,0.047285,1.0,WT-like,sensitive,FKS1-HS1,F639D
3,anidulafungin,single,1.0,ELVLSLRDP,0.266907,1.0,WT-like,sensitive,FKS1-HS1,F639E
4,anidulafungin,single,1.0,F*VLSLRDP,0.165205,1.0,WT-like,sensitive,FKS1-HS1,L640*
...,...,...,...,...,...,...,...,...,...,...
20651,micafungin,,,YWVRVYTL,4.483994,2.0,WT-like,sensitive,FKS1-HS2,"D1353Y,R1357V"
20652,micafungin,,,YWVRWYTL,11.479603,1.0,resistant,resistant,FKS1-HS2,"D1353Y,R1357W"
20653,micafungin,,,YWVSRYTL,-3.195642,1.0,WT-like,sensitive,FKS1-HS2,"D1353Y,R1356S"
20654,micafungin,,,YWVVRYTL,-1.842202,2.0,WT-like,sensitive,FKS1-HS2,"D1353Y,R1356V"


In [6]:
# Show every row whose (locus, compound, sequence) combination occurs more than once.
# 'locus' is part of the key because two hotspots can share the same wild-type
# sequence (FKS1-HS2 and FKS2-HS2 are both 'DWVRRYTL'), so identical aa_seq values
# from different loci must not be reported as duplicates of each other.
df[df.duplicated(subset=['locus','compound','aa_seq'], keep=False)]

Unnamed: 0,compound,seq_type,Nham_aa,aa_seq,s,cscore,refined_class,sensres,locus,mutation
39,anidulafungin,single,1.0,FLVKSLRDP,0.752623,1.0,intermediary,resistant,FKS1-HS1,L642K
147,anidulafungin,single,1.0,FLWLSLRDP,1.492155,1.0,intermediary,resistant,FKS1-HS1,V641W
213,caspofungin,single,1.0,FLVKSLRDP,1.885922,1.0,resistant,resistant,FKS1-HS1,L642K
321,caspofungin,single,1.0,FLWLSLRDP,0.936245,1.0,intermediary,resistant,FKS1-HS1,V641W
529,anidulafungin,single,1.0,FLVKSLRDP,1.007996,1.0,intermediary,resistant,FKS1-HS1,L642K
530,caspofungin,single,1.0,FLVKSLRDP,1.866959,1.0,resistant,resistant,FKS1-HS1,L642K
538,anidulafungin,single,1.0,FLWLSLRDP,1.446169,1.0,intermediary,resistant,FKS1-HS1,V641W
539,caspofungin,single,1.0,FLWLSLRDP,1.040897,1.0,intermediary,resistant,FKS1-HS1,V641W


## Get degree of evidence

All observations should be attributed a MARDy 2 degree of evidence of 3, except for mutants individually recreated in the lab. For mutants for which we do not have DMS data, a selection coefficient was inferred by linear regression from normalized individual growth measurements.

Note: the "notes" field needs to be adapted depending on the type of mutant.

In [7]:
# Lists of mutants recreated in the lab

# Mutants for which we don't have DMS data
lmissing = ['F639C', 'L640D', 'L642G', 'P647N', 'P647Q']

# Mutants for which the DMS data was replaced by the inferred score
# (because the DMS score was underestimated) **ONLY IN MICAFUNGIN**
lcorr = ['L642K', 'V641W']

# Mutants for which we have DMS data
lboth = [
    'F639V', 'F639S', 'L642Y', 'L642F', 'S643P', 'S643F', 'S643Y',
    'S643C', 'L640K', 'V641S', 'L640G', 'D646W', 'F639D', 'F639E',
    'V641F', 'V641H', 'S643L', 'R645D', 'R645S', 'R645Q', 'F639Q',
]

In [8]:
def get_evidence_degree(mut, lrec, phe):
    """Return the signed degree of evidence for one observation.

    Parameters
    ----------
    mut : str
        Mutation label.
    lrec : list
        Mutations that were individually recreated.
    phe : str
        Phenotype of the mutant.

    Returns
    -------
    int
        Magnitude 1 for a reconstructed mutant, 3 for a bulk competition
        measurement; positive when `phe` is 'resistant', negative otherwise.
    """
    magnitude = 1 if mut in lrec else 3
    return magnitude if phe == 'resistant' else -magnitude

In [10]:
# All individually reconstructed mutants, whichever list they belong to.
# Built once here instead of being re-concatenated inside the lambda for every row.
lrecreated = lmissing + lcorr + lboth

# Signed degree of evidence for each observation
df['degree of evidence'] = df.apply(lambda row: get_evidence_degree(row.mutation, lrecreated, row.sensres), axis=1)

## Custom notes

In [11]:
def get_notes(m, lmissing, lcorr, lboth, rclass, comp):
    """Build the free-text note attached to one observation.

    Parameters
    ----------
    m : str
        Mutation label.
    lmissing : list
        Reconstructed mutants that are missing from the bulk competition assay.
    lcorr : list
        Reconstructed mutants whose bulk competition score was replaced by a
        corrected one; this note only applies when `comp` is 'micafungin'.
    lboth : list
        Reconstructed mutants used to validate the bulk competition assay.
    rclass : str
        Class assigned to the mutant, inserted verbatim in the note.
    comp : str
        Compound name.

    Returns
    -------
    str
        The note matching the first applicable case, checked in the order:
        missing, corrected (micafungin only), validated, bulk competition only.
    """
    if m in lmissing:
        return f'Mutant missing from the bulk competition assay, reconstructed individually. Selection coefficient was inferred by linear regression from normalized individual growth measurements. The mutant was classified as {rclass} using a Gaussian mixture model.'

    if m in lcorr and comp == 'micafungin':
        return f'Selection coefficient from bulk competition assay was found to be underestimated after reconstructing the mutant individually and performing individual growth measurements (validation assay). A corrected selection coefficient was inferred by linear regression from the validation assay. The mutant was classified as {rclass} using a Gaussian mixture model.'

    # Corrected mutants in the other compounds are treated like validated ones
    if m in lcorr or m in lboth:
        return f'Mutant individually reconstructed to validate bulk competition assay. The selection coefficient indicated is the one from bulk competition assay but correlates well with individual growth measurements. The mutant was classified as {rclass} using a Gaussian mixture model.'

    return f'Selection coefficient from bulk competition assay. The mutant was classified as {rclass} using a Gaussian mixture model.'

In [12]:
# Attach the note matching each observation's mutant type, class and compound
df['notes'] = df.apply(
    lambda row: get_notes(
        m=row.mutation,
        lmissing=lmissing,
        lcorr=lcorr,
        lboth=lboth,
        rclass=row.refined_class,
        comp=row.compound,
    ),
    axis=1,
)

## Convert columns

In [13]:
# Constant fields, identical for every observation
df['first author name'] = 'Durand'
df['journal'] = 'bioRxiv'
df['year'] = '2024'
df['pubmedid'] = ''
df['species'] = 'Saccharomyces cerevisiae'
df['accession number for protein name'] = ''
df['source (NCBI, uniprot, ensembl, etc)'] = ''
df['strain_ID'] = ''
df['strain origin if available (Environment, patient, lab evolved)'] = 'Lab'
df['curator'] = 'Romain Durand'

# Fields derived from existing columns
# Gene name is the part of the locus before the first '-' (e.g. 'FKS1-HS1' -> 'FKS1')
df['gene or protein'] = df.locus.map(lambda locus_name: locus_name.split('-')[0])
# Drug name in title case
df['drug'] = df.compound.map(lambda drug_name: drug_name.title())
# Selection coefficient rounded to 2 decimals, reported as text
df['MIC if provided or fold or any quantitative measure'] = df.s.map(
    lambda s_coef: 'selection coefficient = ' + str(round(s_coef, 2))
)

# Columns written to the curation file, in this order
mardy_columns = [
    'first author name',
    'journal',
    'year',
    'pubmedid',
    'species',
    'gene or protein',
    'accession number for protein name',
    'source (NCBI, uniprot, ensembl, etc)',
    'drug',
    'mutation',
    'degree of evidence',
    'MIC if provided or fold or any quantitative measure',
    'strain_ID',
    'strain origin if available (Environment, patient, lab evolved)',
    'curator',
    'notes',
]
df[mardy_columns].to_excel('mardy2/Durand_2024_curation_mardy_2.xlsx', index=False)