In [1]:
import pandas as pd

In [72]:
# Nucleotide coordinates of the genes/proteins whose resistance mutations are
# analysed in this notebook, keyed by gene name. Each value is a 2-tuple; the
# first element is used as the gene start when mapping amino-acid positions to
# nucleotide positions (the second is presumably the end coordinate).
#
# General references:
#   https://www.ncbi.nlm.nih.gov/nuccore/1798174254
#   https://www.ncbi.nlm.nih.gov/datasets/taxonomy/2697049/
#
# NOTE(review): RdRp (nsp12) is translated across the ORF1ab ribosomal
# frameshift, so a single start coordinate with a fixed 3-nt stride may be
# offset for its residues -- confirm against the mat_peptide annotation of
# NC_045512.2 before relying on RdRp nucleotide positions.
gene_mapping = dict(
    [
        # https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2?report=genbank&log$=seqview
        # https://www.ncbi.nlm.nih.gov/datasets/taxonomy/2697049/
        ('3CLpro', (10055, 10972)),
        # https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2?report=genbank&log$=seqview
        ('RdRp', (13468, 16264)),
        # https://www.ncbi.nlm.nih.gov/gene?LinkName=nuccore_gene&from_uid=1798174254
        ('Spike', (21563, 25384)),
    ]
)

In [73]:
# The drug resistance mutation files are from
# https://covdb.stanford.edu/drms/3clpro/
# The downloaded files are here: [from Gordon]
# https://gist.github.com/gordonkoehn/c7f14867217ba8b4b784131951a08531

# Read one datasheet per drug target and tag every row with the gene it
# belongs to, so the rows stay distinguishable after concatenation.
df = pd.read_csv("3CLpro_inhibitors_datasheet.csv").assign(gene='3CLpro')
df1 = pd.read_csv("spike_mAbs_datasheet.csv").assign(gene='Spike')
df2 = pd.read_csv('RdRP_inhibitors_datasheet.csv').assign(gene='RdRp')

# Stack the three sheets; columns that exist in only some sheets are NaN-filled.
# Each sheet keeps its own row labels, so index values repeat in the result.
df = pd.concat(objs=[df, df1, df2])

In [74]:

# Sanity check on the Spike sheet only: split each mutation label such as
# "N440K" into reference residue, position and alternative residue, then map
# residue 440 to its nucleotide coordinate.
df1['RefAminoAcid'] = df1['Mutation'].str[0]
df1['AltAminoAcid'] = df1['Mutation'].str[-1]
df1['PosAminoAcid'] = df1['Mutation'].str[1:-1]

# Select with a single .loc call and take an explicit copy, so that adding a
# column below writes to an independent frame instead of a slice of df1
# (avoids SettingWithCopyWarning).
df_test = df1.loc[
    df1['PosAminoAcid'] == '440',
    ['Mutation', 'gene', 'RefAminoAcid', 'AltAminoAcid', 'PosAminoAcid'],
].copy()

# First nucleotide of the codon: gene start + 3 * (residue number - 1).
# Computed inline from gene_mapping because f_map2nucleotidespace is only
# defined in a later cell; calling it here fails on a fresh kernel
# (Restart & Run All).
df_test['PosNucleotide'] = (
    df_test['gene'].map(lambda gene: int(gene_mapping[gene][0]))
    + (df_test['PosAminoAcid'].astype(int) - 1) * 3
)

In [75]:
df_test

Unnamed: 0,Mutation,gene,RefAminoAcid,AltAminoAcid,PosAminoAcid,PosNucleotide
35,N440D,Spike,N,D,440,22880
36,N440E,Spike,N,E,440,22880
37,N440I,Spike,N,I,440,22880
38,N440K,Spike,N,K,440,22880
39,N440R,Spike,N,R,440,22880
40,N440T,Spike,N,T,440,22880
41,N440Y,Spike,N,Y,440,22880


In [76]:
# Sanity check: Spike residue 440 maps to the codon at nucleotides 22880, 22881 and 22882.

In [77]:
# Split each mutation label (e.g. "C799F") into reference residue, position
# and alternative residue by plain string slicing.
df['RefAminoAcid'] = df['Mutation'].str[0]
df['AltAminoAcid'] = df['Mutation'].str[-1]
df['PosAminoAcid'] = df['Mutation'].str[1:-1]

# Corner case: deletions are written "<ref><pos>del", so the slicing above
# yields position "<pos>de" (e.g. '49de', '168de') and alternative 'l'.
# The mask is built once, BEFORE anything is overwritten. The previous code
# rewrote PosAminoAcid first and then tested for '49de' / '168de' again, so
# that second mask was always empty and AltAminoAcid was never set to 'del'.
is_deletion = df['Mutation'].str.endswith('del', na=False)
# .to_numpy() assigns positionally; df carries repeated index labels after the
# concat of the three sheets, so label-based alignment is avoided on purpose.
df.loc[is_deletion, 'PosAminoAcid'] = df.loc[is_deletion, 'Mutation'].str[1:-3].to_numpy()
df.loc[is_deletion, 'AltAminoAcid'] = 'del'

# Positions are still strings at this point; any label that does not follow the
# "<ref><pos><alt>" / "<ref><pos>del" patterns raises here rather than passing silently.
df['PosAminoAcid'] = df['PosAminoAcid'].astype(int)

In [78]:
def f_map2nucleotidespace(row):
    """Map an amino-acid position to the first nucleotide of its codon.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide ``row['gene']`` (a key of the module-level
        ``gene_mapping``) and ``row['PosAminoAcid']`` (residue number,
        anything ``int()`` accepts).

    Returns
    -------
    int
        Gene start coordinate plus three nucleotides for every residue that
        precedes the requested one, i.e. residue 1 maps to the gene start.

    Raises
    ------
    KeyError
        If the gene is not present in ``gene_mapping``.
    ValueError
        If the position cannot be converted to an integer.
    """
    gene_start = int(gene_mapping[row['gene']][0])
    # Residue numbering starts at 1, so (position - 1) whole codons come first.
    preceding_codons = int(row['PosAminoAcid']) - 1
    return gene_start + 3 * preceding_codons

In [79]:
# Row-wise mapping of every mutation to the first nucleotide of its codon.
df['PosNucleotide'] = df.apply(func=f_map2nucleotidespace, axis='columns')
# 0 marks the first codon position; copies for positions 1 and 2 are derived
# from this frame in the following cells.
df['SNPCodonPosition'] = 0

In [80]:
# Independent copy of df describing the second nucleotide of each codon:
# codon position label 1, nucleotide coordinate shifted by +1.
df_CodonPosition1 = df.assign(
    SNPCodonPosition=1,
    PosNucleotide=lambda frame: frame['PosNucleotide'] + 1,
)

In [81]:
# Independent copy of df describing the third nucleotide of each codon:
# codon position label 2, nucleotide coordinate shifted by +2.
df_CodonPosition2 = df.assign(
    SNPCodonPosition=2,
    PosNucleotide=lambda frame: frame['PosNucleotide'] + 2,
)

In [82]:
# Stack the frames for codon positions 0, 1 and 2 so that every mutation has
# one row per nucleotide of its codon. This rebinds df, so re-running this cell
# on its own appends the shifted copies a second time.
df = pd.concat(objs=(df, df_CodonPosition1, df_CodonPosition2), axis=0)

In [83]:
# Spot check: each of these two RdRp mutations should appear three times,
# once per codon position.
df.loc[df['Mutation'].isin(['C799F', 'C799R'])]

Unnamed: 0,Mutation,NTV: fold,NTV: pocket,ENS: fold,fitness,in patient,in vitro,Prevalence,gene,BAM: fold,...,C135: fold,C135: dms,C144: fold,C144: dms,RDV: fold,RefAminoAcid,AltAminoAcid,PosAminoAcid,PosNucleotide,SNPCodonPosition
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15862,0
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15862,0
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15863,1
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15863,1
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15864,2
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15864,2


In [84]:
# Inspect the column names of the combined frame (the export cell below
# selects a subset of them).
df.columns

Index(['Mutation', 'NTV: fold', 'NTV: pocket', 'ENS: fold', 'fitness',
       'in patient', 'in vitro', 'Prevalence', 'gene', 'BAM: fold', 'BAM: dms',
       'ETE: fold', 'ETE: dms', 'CAS: fold', 'CAS: dms', 'IMD: fold',
       'IMD: dms', 'CIL: fold', 'CIL: dms', 'TIX: fold', 'TIX: dms',
       'SOT: fold', 'SOT: dms', 'BEB: fold', 'BEB: dms', 'REG: fold',
       'AMU: fold', 'ROM: fold', 'ADI: fold', 'C135: fold', 'C135: dms',
       'C144: fold', 'C144: dms', 'RDV: fold', 'RefAminoAcid', 'AltAminoAcid',
       'PosAminoAcid', 'PosNucleotide', 'SNPCodonPosition'],
      dtype='object')

In [85]:
# Export only the mutation description and its nucleotide mapping; the
# per-drug fold-change columns are left out. The (repeated) row index is
# written as the first, unnamed CSV column.
df.to_csv(
    "drug_resistance_mutations.csv",
    columns=[
        'Mutation',
        'gene',
        'RefAminoAcid',
        'AltAminoAcid',
        'PosAminoAcid',
        'PosNucleotide',
        'SNPCodonPosition',
    ],
)