# Notebook to generate attribute file to plot data on protein structure (Chimerax)

## Libraries, paths and parameters

In [1]:
import pandas as pd
aggdata_outpath = 'aggregated_data/'
heatmapdata_outpath = 'heatmaps_data/'
classified_outpath = 'aggregated_data/classified_variants/'

In [2]:
s = 'R1158'
p = 'FKS2'
c = 'anidulafungin'
ldata = [f'{s}_{p}-HS1_single_ortho_{c}', f'{s}_{p}-HS2_single_ortho_{c}']

# Specify positions and WT sequences of hotspots
pos_offset = {'FKS1-HS1': (639, 'FLVLSLRDP'),
              'FKS1-HS2': (1353, 'DWVRRYTL'),
              'FKS2-HS1': (659, 'LILSLRDP'),
              'FKS2-HS2': (1372, 'DWVRRYTL')
             }

# Specifcy name of chain in pdb file (or empty string)
chains = {'FKS1':'/F', 'FKS2':'#2 /A'}

# Specify color palette
palette = '#F1F1F1:#E4C1D9:#D691C1:#C75DAB'

In [3]:
aa_classes_hs1 = pd.read_csv(f'{classified_outpath}/{s}_{p}-HS1_single/aa_refined_classification.csv')
aa_classes_hs1['locus'] = p+'-HS1'
aa_classes_hs2 = pd.read_csv(f'{classified_outpath}/{s}_{p}-HS2_single/aa_refined_classification.csv')
aa_classes_hs2['locus'] = p+'-HS2'
aa_classes_all = pd.concat([aa_classes_hs1, aa_classes_hs2], ignore_index=True)
aa_classes = aa_classes_all[(aa_classes_all.compound == c)
                            & (aa_classes_all.seq_type == 'single')
                           ][['locus','aa_seq','sensres']].reset_index(drop=True)
aa_classes

Unnamed: 0,locus,aa_seq,sensres
0,FKS2-HS1,LILSLRDP,sensitive
1,FKS2-HS1,*ILSLRDP,sensitive
2,FKS2-HS1,AILSLRDP,resistant
3,FKS2-HS1,CILSLRDP,resistant
4,FKS2-HS1,DILSLRDP,sensitive
...,...,...,...
261,FKS2-HS2,SWVRRYTL,sensitive
262,FKS2-HS2,TWVRRYTL,sensitive
263,FKS2-HS2,VWVRRYTL,sensitive
264,FKS2-HS2,WWVRRYTL,sensitive


In [4]:
def get_mutated_hs(locus, pos, mut, convert_dict):
    wt_seq = convert_dict[locus][1]
    index = pos - convert_dict[locus][0]
    mut_seq = wt_seq[:index] + mut + wt_seq[index + 1:]
    return mut_seq

In [5]:
get_mutated_hs('FKS1-HS1', 639, '*', pos_offset)

'*LVLSLRDP'

In [6]:
dfl = []

for f in ldata:
    strain, locus, pool_type, compound = [f.split('_')[i] for i in (0,1,2,-1)]
    locus_df = pd.read_csv(f'{heatmapdata_outpath}/{f}.csv', index_col=0)
    locus_df['locus'] = locus
    dfl.append(locus_df)
    
aa_pos_single = pd.concat(dfl, ignore_index=True)
aa_pos_single

Unnamed: 0,alt_aa,aa_pos,median_s,locus
0,*,659,0.448349,FKS2-HS1
1,*,660,0.324991,FKS2-HS1
2,*,661,-0.072967,FKS2-HS1
3,*,662,-0.102703,FKS2-HS1
4,*,663,-0.250261,FKS2-HS1
...,...,...,...,...
275,Y,1375,0.116944,FKS2-HS2
276,Y,1376,1.333257,FKS2-HS2
277,Y,1377,1.743158,FKS2-HS2
278,Y,1378,0.170589,FKS2-HS2


In [7]:
aa_pos_single['aa_seq'] = aa_pos_single.apply(lambda row: get_mutated_hs(row.locus, row.aa_pos, row.alt_aa, pos_offset), axis=1)
aa_pos_single

Unnamed: 0,alt_aa,aa_pos,median_s,locus,aa_seq
0,*,659,0.448349,FKS2-HS1,*ILSLRDP
1,*,660,0.324991,FKS2-HS1,L*LSLRDP
2,*,661,-0.072967,FKS2-HS1,LI*SLRDP
3,*,662,-0.102703,FKS2-HS1,LIL*LRDP
4,*,663,-0.250261,FKS2-HS1,LILS*RDP
...,...,...,...,...,...
275,Y,1375,0.116944,FKS2-HS2,DWVYRYTL
276,Y,1376,1.333257,FKS2-HS2,DWVRYYTL
277,Y,1377,1.743158,FKS2-HS2,DWVRRYTL
278,Y,1378,0.170589,FKS2-HS2,DWVRRYYL


In [8]:
aa_annot = aa_pos_single.merge(right=aa_classes, on=['locus','aa_seq'])
aa_annot

Unnamed: 0,alt_aa,aa_pos,median_s,locus,aa_seq,sensres
0,*,659,0.448349,FKS2-HS1,*ILSLRDP,sensitive
1,*,660,0.324991,FKS2-HS1,L*LSLRDP,sensitive
2,*,661,-0.072967,FKS2-HS1,LI*SLRDP,sensitive
3,*,662,-0.102703,FKS2-HS1,LIL*LRDP,sensitive
4,*,663,-0.250261,FKS2-HS1,LILS*RDP,sensitive
...,...,...,...,...,...,...
275,Y,1374,-0.042442,FKS2-HS2,DWYRRYTL,sensitive
276,Y,1375,0.116944,FKS2-HS2,DWVYRYTL,sensitive
277,Y,1376,1.333257,FKS2-HS2,DWVRYYTL,resistant
278,Y,1378,0.170589,FKS2-HS2,DWVRRYYL,sensitive


In [9]:
aa_annot['aa_pos'] = aa_annot.apply(lambda row: f'{chains[p]}:{str(int(row.aa_pos))}', axis=1)
aa_annot

Unnamed: 0,alt_aa,aa_pos,median_s,locus,aa_seq,sensres
0,*,#2 /A:659,0.448349,FKS2-HS1,*ILSLRDP,sensitive
1,*,#2 /A:660,0.324991,FKS2-HS1,L*LSLRDP,sensitive
2,*,#2 /A:661,-0.072967,FKS2-HS1,LI*SLRDP,sensitive
3,*,#2 /A:662,-0.102703,FKS2-HS1,LIL*LRDP,sensitive
4,*,#2 /A:663,-0.250261,FKS2-HS1,LILS*RDP,sensitive
...,...,...,...,...,...,...
275,Y,#2 /A:1374,-0.042442,FKS2-HS2,DWYRRYTL,sensitive
276,Y,#2 /A:1375,0.116944,FKS2-HS2,DWVYRYTL,sensitive
277,Y,#2 /A:1376,1.333257,FKS2-HS2,DWVRYYTL,resistant
278,Y,#2 /A:1378,0.170589,FKS2-HS2,DWVRRYYL,sensitive


In [10]:
aa_annot[aa_annot.aa_pos == '/F:639'].reset_index(drop=True)

Unnamed: 0,alt_aa,aa_pos,median_s,locus,aa_seq,sensres


In [11]:
#aa_df = aa_annot.groupby(['locus','aa_pos'])[['median_s']].median().reset_index()
aa_df = aa_annot.groupby(['locus','aa_pos'])['sensres'].apply(lambda x: x[x.str.contains('resistant')].count() / len(x)).reset_index()
aa_df

Unnamed: 0,locus,aa_pos,sensres
0,FKS2-HS1,#2 /A:659,0.190476
1,FKS2-HS1,#2 /A:660,0.166667
2,FKS2-HS1,#2 /A:661,0.052632
3,FKS2-HS1,#2 /A:662,0.611111
4,FKS2-HS1,#2 /A:663,0.352941
5,FKS2-HS1,#2 /A:664,0.157895
6,FKS2-HS1,#2 /A:665,0.277778
7,FKS2-HS1,#2 /A:666,0.5
8,FKS2-HS2,#2 /A:1372,0.1875
9,FKS2-HS2,#2 /A:1373,0.1875


In [12]:
aa_df['empty_col'] = ''
attrdf = aa_df[['empty_col','aa_pos', 'sensres']]
custom_header = f'#\nattribute: {s}_{p}_{c}_scoeff\nmatch mode: 1-to-1\nrecipient: residues\n'
with open(f'{aggdata_outpath}/defattr_files/{s}_{p}_{c}_scoeff_attr.txt', 'w') as fp:
    fp.write(custom_header)

attrdf.to_csv(f'{aggdata_outpath}/defattr_files/{s}_{p}_{c}_scoeff_attr.txt', sep='\t', header=None, index=None, mode='a')

cmd = f'color byattribute {s}_{p}_{c}_scoeff range 0, 1 palette {palette}'
print(cmd)

color byattribute R1158_FKS2_anidulafungin_scoeff range 0, 1 palette #F1F1F1:#E4C1D9:#D691C1:#C75DAB
