In [1]:
import pandas as pd
import numpy as np

In [2]:
# download table of drug resistance mutations
# https://github.com/hivdb/hivfacts/blob/main/data/drms_hiv1.csv

In [5]:
%%bash
wget https://raw.githubusercontent.com/hivdb/hivfacts/main/data/drms_hiv1.csv

--2024-01-05 11:55:48--  https://raw.githubusercontent.com/hivdb/hivfacts/main/data/drms_hiv1.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3567 (3.5K) [text/plain]
Saving to: ‘drms_hiv1.csv’

     0K ...                                                   100% 28.1M=0s

2024-01-05 11:55:49 (28.1 MB/s) - ‘drms_hiv1.csv’ saved [3567/3567]



In [2]:
df_drms_hiv1 = pd.read_csv("drms_hiv1.csv")
df_drms_hiv1['position'] = df_drms_hiv1['position'].astype(int)

CA_positions = list(df_drms_hiv1[df_drms_hiv1['gene']=='CA']['position'])
PR_positions = list(df_drms_hiv1[df_drms_hiv1['gene']=='PR']['position'])
RT_positions = list(df_drms_hiv1[df_drms_hiv1['gene']=='RT']['position'])
IN_positions = list(df_drms_hiv1[df_drms_hiv1['gene']=='IN']['position'])

In [3]:
df = pd.read_csv("../../results/all_mutations.annotated.csv")
df['n_reads_var'] = df['Rvar'] + df['Fvar']
df['coverage'] = df['Rtot'] + df['Ftot']
df['frequency'] = df['n_reads_var'] / df['coverage']
df["gene"].unique()
df['aa_position'] = df['aa_position'].apply(lambda x: np.nan if x=="error" else int(x))

In [4]:
df = df[['patient', 'time', 'Pos', 'Ref', 'Var', 'frequency',
         'AltAA_f1', 'RefAA_f1', 'f1_IsSynonymous',
         'AltAA_f2', 'RefAA_f2', 'f2_IsSynonymous',
         'AltAA_f3', 'RefAA_f3', 'f3_IsSynonymous',
          'gene', 'aa_position']]

In [5]:
df['gene'].unique()

array(['error', 'CA', 'PR', 'RT', 'IN'], dtype=object)

In [6]:
df['time'].unique()

array(['week_358', 'week_112', 'week_10', 'week_4', 'week_106',
       'week_237'], dtype=object)

In [7]:
# exclude CA_positions for now
df_res_CA = df[((df['gene']=='CA') & (df["aa_position"].isin(CA_positions)))]
df_res_CA = pd.merge(df_res_CA, df_drms_hiv1[df_drms_hiv1['gene']=='CA'], left_on="aa_position", right_on="position")

df_res_PR = df[((df['gene']=='PR') & (df["aa_position"].isin(PR_positions)))]
df_res_PR = pd.merge(df_res_PR, df_drms_hiv1[df_drms_hiv1['gene']=='PR'], left_on="aa_position", right_on="position")
# PR corresponding to frame3 in our dataframe annotations

df_res_RT = df[((df['gene']=='RT') & (df["aa_position"].isin(RT_positions)))]
df_res_RT = pd.merge(df_res_RT, df_drms_hiv1[df_drms_hiv1['gene']=='RT'], left_on="aa_position", right_on="position")
# RT corresponding to frame3 in our dataframe annotations

df_res_IN = df[((df['gene']=='IN') & (df["aa_position"].isin(IN_positions)))]
df_res_IN = pd.merge(df_res_IN, df_drms_hiv1[df_drms_hiv1['gene']=='IN'], left_on="aa_position", right_on="position")
# IN corresponding to frame3 in our dataframe annotations
# however postion 232 has the wrong reference somehow 


df_res = pd.concat([df_res_PR, df_res_RT, df_res_IN, df_res_CA])

df_res = df_res[['patient', 'time', 'Pos', 'Ref', 'Var', 'frequency',
       'AltAA_f3', 'RefAA_f3', 'f3_IsSynonymous', 'gene_x', 'aa_position',
       'drug_class', 'gene_y', 'position', 'aa']]

In [8]:
df_res[df_res['f3_IsSynonymous']==0]

Unnamed: 0,patient,time,Pos,Ref,Var,frequency,AltAA_f3,RefAA_f3,f3_IsSynonymous,gene_x,aa_position,drug_class,gene_y,position,aa
17,CAP217,week_358,2350,T,C,0.001669,S,L,0,PR,33.0,PI,PR,33,F
53,CAP217,week_358,2410,T,C,0.002912,S,F,0,PR,53.0,PI,PR,53,L
67,CAP188,week_4,2413,T,C,0.033369,T,I,0,PR,54.0,PI,PR,54,A
68,CAP188,week_4,2413,T,C,0.033369,T,I,0,PR,54.0,PI,PR,54,L
69,CAP188,week_4,2413,T,C,0.033369,T,I,0,PR,54.0,PI,PR,54,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13,CAP217,week_358,1407,T,C,0.012370,R,*,0,CA,74.0,CAI,CA,74,S
14,CAP217,week_10,1386,A,G,0.087805,D,N,0,CA,67.0,CAI,CA,67,H
15,CAP217,week_10,1386,A,G,0.087805,D,N,0,CA,67.0,CAI,CA,67,Y
16,CAP217,week_10,1386,A,G,0.087805,D,N,0,CA,67.0,CAI,CA,67,N


In [9]:
df_res[(df_res["aa"]==df_res["AltAA_f3"]) & (df_res["patient"]=="CAP217") ].sort_values(['gene_y', 'position'])

Unnamed: 0,patient,time,Pos,Ref,Var,frequency,AltAA_f3,RefAA_f3,f3_IsSynonymous,gene_x,aa_position,drug_class,gene_y,position,aa
48,CAP217,week_10,4518,A,G,0.007849,A,T,0,IN,97.0,INSTI,IN,97,A
34,CAP217,week_358,4925,T,C,0.987421,N,N,1,IN,232.0,INSTI,IN,232,N
36,CAP217,week_10,4925,T,C,0.988691,N,N,1,IN,232.0,INSTI,IN,232,N
36,CAP217,week_358,2842,C,G,0.999038,G,A,0,RT,98.0,NNRTI,RT,98,G
37,CAP217,week_10,2842,C,G,0.997669,G,A,0,RT,98.0,NNRTI,RT,98,G
201,CAP217,week_10,3112,A,G,0.005255,C,Y,0,RT,188.0,NNRTI,RT,188,C


In [10]:
# CAP217
# https://hivdb.stanford.edu/hivdb/by-patterns/report/?mutations=RT%3AA98G%2CIN%3AD232N&name=RT%3AA98G%2BIN%3AD232N 

In [11]:
df_res[(df_res["aa"]==df_res["AltAA_f3"]) & (df_res["patient"]=="CAP188") ].sort_values(['gene_y', 'position'])

Unnamed: 0,patient,time,Pos,Ref,Var,frequency,AltAA_f3,RefAA_f3,f3_IsSynonymous,gene_x,aa_position,drug_class,gene_y,position,aa
49,CAP188,week_106,4518,A,G,0.006667,A,T,0,IN,97.0,INSTI,IN,97,A
6,CAP188,week_106,4647,G,A,0.005405,R,G,0,IN,140.0,INSTI,IN,140,R
38,CAP188,week_4,4925,T,C,0.984689,N,N,1,IN,232.0,INSTI,IN,232,N
40,CAP188,week_106,4925,T,C,1.0,N,N,1,IN,232.0,INSTI,IN,232,N
42,CAP188,week_237,4925,T,C,0.992216,N,N,1,IN,232.0,INSTI,IN,232,N
71,CAP188,week_4,2413,T,C,0.033369,T,I,0,PR,54.0,PI,PR,54,T
222,CAP188,week_237,2850,A,G,0.208367,E,K,0,RT,101.0,NNRTI,RT,101,E
65,CAP188,week_4,2962,A,C,0.998696,A,E,0,RT,138.0,NNRTI,RT,138,A
75,CAP188,week_106,2962,A,C,0.99801,A,E,0,RT,138.0,NNRTI,RT,138,A
85,CAP188,week_237,2962,A,C,0.998082,A,E,0,RT,138.0,NNRTI,RT,138,A


In [None]:
# CAP188
# https://hivdb.stanford.edu/hivdb/by-patterns/report/?mutations=RT%3AE138A%2CPR%3AI54T%2CIN%3AD232N&name=RT%3AE138A%2BPR%3AI54T%2BIN%3AD232N
