In [1]:
import pandas as pd

In [2]:
# https://www.ncbi.nlm.nih.gov/nuccore/1798174254
# https://www.ncbi.nlm.nih.gov/datasets/taxonomy/2697049/ 

# 
gene_mapping = {
    '3CLpro': (10055,10972),  
    # https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2?report=genbank&log$=seqview
    # https://www.ncbi.nlm.nih.gov/datasets/taxonomy/2697049/ 

    'RdRp': (13468, 16264),  
    # https://www.ncbi.nlm.nih.gov/nuccore/NC_045512.2?report=genbank&log$=seqview
    
    'Spike': (21563, 25384), # https://www.ncbi.nlm.nih.gov/gene?LinkName=nuccore_gene&from_uid=1798174254
    
}

In [3]:
# the drug restistance mutation files are from 
# https://covdb.stanford.edu/drms/3clpro/
# The downloaded files are here: [from Gordon]
# https://gist.github.com/gordonkoehn/c7f14867217ba8b4b784131951a08531

df = pd.read_csv("3CLpro_inhibitors_datasheet.csv")
df['gene'] = '3CLpro'

df1 = pd.read_csv("spike_mAbs_datasheet.csv")
df1['gene'] = 'Spike'


df2 = pd.read_csv('RdRP_inhibitors_datasheet.csv')
df2['gene'] = 'RdRp'

df = pd.concat([df, df1, df2])

In [4]:
df['RefAminoAcid']=df['Mutation'].str[0]
df['AltAminoAcid']=df['Mutation'].str[-1]
df['PosAminoAcid']=df['Mutation'].str[1:-1]

# overwrite corner case
df.loc[df['PosAminoAcid']=='49de', 'PosAminoAcid']= '49'
df.loc[df['PosAminoAcid']=='49de', 'AltAminoAcid']='del'

# overwrite corner case
df.loc[df['PosAminoAcid']=='168de', 'PosAminoAcid']= '168'
df.loc[df['PosAminoAcid']=='168de', 'AltAminoAcid']='del'

In [5]:
def f_map2nucleotidespace(row): 
    start_gene=int(gene_mapping[row['gene']][0])
    n_aa = int(row['PosAminoAcid'])
    return start_gene+n_aa*3

In [6]:
df['PosNucleotide'] = df.apply(f_map2nucleotidespace, axis=1)
df['CodonPosition'] = 0

In [8]:
df_CodonPosition1 = df.copy()
df_CodonPosition1['CodonPosition']=1
df_CodonPosition1['PosNucleotide']=df_CodonPosition1['PosNucleotide']+1

In [9]:
df_CodonPosition2 = df.copy()
df_CodonPosition2['CodonPosition']=2
df_CodonPosition2['PosNucleotide']=df_CodonPosition2['PosNucleotide']+2

In [10]:
df = pd.concat([df, df_CodonPosition1, df_CodonPosition2])

In [11]:
df[df['Mutation'].isin(['C799F', 'C799R'])]

Unnamed: 0,Mutation,NTV: fold,NTV: pocket,ENS: fold,fitness,in patient,in vitro,Prevalence,gene,BAM: fold,...,C135: fold,C135: dms,C144: fold,C144: dms,RDV: fold,RefAminoAcid,AltAminoAcid,PosAminoAcid,PosNucleotide,CodonPosition
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15865,0
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15865,0
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15866,1
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15866,1
14,C799F,,,,,,2.0,0.0009%,RdRp,,...,,,,,3.4,C,F,799,15867,2
15,C799R,,,,,,1.0,0.00006%,RdRp,,...,,,,,2.8,C,R,799,15867,2


In [12]:
df.to_csv("drug_resistance_mutations.csv")