In [12]:
import itertools
import math
import os

from IPython.display import display, HTML

import matplotlib.pyplot as plt

import pandas as pd

from plotnine import *

import yaml

%matplotlib inline

In [13]:
# Parse the pipeline configuration file into the `config` mapping that the
# remaining cells read their paths and metric names from.
with open('config.yaml') as config_handle:
    config = yaml.safe_load(config_handle)

In [14]:
# Create the directory that the CSV files below are written into;
# `exist_ok=True` makes this a no-op when the directory is already there.
os.makedirs(config['supp_data_dir_ZIKV'], exist_ok=True)

In [15]:
# Names of the columns (taken from the configuration) that quantify selection
# at the site level and at the mutation level.
site_metric, mut_metric = config['site_metric'], config['mut_metric']

print(f"At site level, quantifying selection by {site_metric}")
print(f"At mutation level, quantify selection by {mut_metric}")

At site level, quantifying selection by site_total_escape_frac_single_mut
At mutation level, quantify selection by mut_escape_frac_single_mut


In [16]:
# Load the escape-profile specifications named in the configuration.
print(f"Reading escape-profile configuration from {config['escape_profiles_config_ZIKV']}")
with open(config['escape_profiles_config_ZIKV']) as profiles_handle:
    escape_profiles_config = yaml.safe_load(profiles_handle)

# Map each profile name to its `conditions` entry, keeping only the profiles
# whose `make_supp_data` key is present and truthy.
condition_sets = {
    name: specs['conditions']
    for name, specs in escape_profiles_config.items()
    if specs.get('make_supp_data')
}

print('Making supplementary data for the following condition sets:\n  ' + '\n  '.join(condition_sets))

Reading escape-profile configuration from data/escape_profiles_config_ZIKV.yaml
Making supplementary data for the following condition sets:
  MAP_paper_antibodies


In [17]:
# Read the escape-fraction table from the CSV path given in the configuration.
escape_fracs_path = config['escape_fracs_ZIKV']
print(f"Reading escape fractions from {config['escape_fracs_ZIKV']}")
escape_fracs_all = pd.read_csv(escape_fracs_path)

Reading escape fractions from results/escape_scores/escape_fracs_ZIKV.csv


In [18]:
# Load the per-PDB output specifications; the cells below read the
# `conditions` and `chains` entries of each PDB from this mapping.
with open(config['output_pdbs_config']) as pdbs_handle:
    output_pdbs_config = yaml.safe_load(pdbs_handle)

In [43]:
# Columns carried into the supplementary tables, in output order.
raw_data_columns = ['condition', 'site', 'label_site', 'wildtype', 'mutation',
                    'protein_chain', 'protein_site', 'mut_escape', 'site_total_escape']

# Rows for the "average" library only, with the configured mutation- and
# site-level metrics renamed to fixed column names.
library_average = (
    escape_fracs_all
    .query('library == "average"')
    .drop(columns=['library'])
    .rename(columns={mut_metric: 'mut_escape', site_metric: 'site_total_escape'})
    [raw_data_columns]
)

# Largest `mut_escape` within each (condition, site) group, repeated on every
# row of that group.
site_max_escape = library_average.groupby(['condition', 'site'])['mut_escape'].transform('max')
raw_data = library_average.assign(site_max_escape=site_max_escape)
raw_data

Unnamed: 0,condition,site,label_site,wildtype,mutation,protein_chain,protein_site,mut_escape,site_total_escape,site_max_escape
0,Z2029_1000,1,299,R,C,E,299,4.583000e-02,0.113200,0.050610
1,Z2029_1000,1,299,R,K,E,299,1.653000e-02,0.113200,0.050610
2,Z2029_1000,1,299,R,L,E,299,5.061000e-02,0.113200,0.050610
3,Z2029_1000,1,299,R,N,E,299,2.656000e-04,0.113200,0.050610
4,Z2029_1000,2,300,L,C,E,300,2.857000e-02,0.116300,0.056170
...,...,...,...,...,...,...,...,...,...,...
4245,ZV67_1000,108,406,T,C,E,406,3.660000e-07,0.000003,0.000001
4246,ZV67_1000,108,406,T,G,E,406,8.689000e-07,0.000003,0.000001
4247,ZV67_1000,108,406,T,I,E,406,1.345000e-06,0.000003,0.000001
4248,ZV67_1000,109,407,I,F,E,407,1.034000e-03,0.001034,0.001034


In [40]:
# Write one raw-data CSV per condition set. Rows are restricted to the
# conditions in the set, and each condition is re-labelled with the name the
# set maps it to.
for set_name, condition_set in condition_sets.items():
    print(f"\nRaw data for {set_name}:")
    # The subset is bound to `df`, the same name that is written to disk and
    # displayed below, so this cell runs on a fresh kernel and never touches
    # `dms_view_df`, which a later cell builds for the per-PDB files.
    df = (raw_data
          .query('condition in @condition_set')
          .assign(condition=lambda x: x['condition'].map(condition_set))
          [['condition', 'site', 'label_site', 'wildtype', 'mutation', 'protein_chain', 'protein_site', 'mut_escape',
            'site_total_escape', 'site_max_escape']]
          )
    csv_file = os.path.join(config['supp_data_dir_ZIKV'], f"{set_name}_raw_data.csv")
    print(f"Writing to {csv_file}")
    df.to_csv(csv_file, index=False, float_format='%.4g')

# Show the table written for the last condition set.
df


Raw data for MAP_paper_antibodies:
Writing to results/supp_data/ZIKV/MAP_paper_antibodies_raw_data.csv


Unnamed: 0,condition,site,label_site,wildtype,mutation,protein_chain,protein_site,mut_escape,site_total_escape,site_max_escape
0,Z2029,1,299,R,C,E,299,4.583000e-02,0.113200,0.050610
1,Z2029,1,299,R,K,E,299,1.653000e-02,0.113200,0.050610
2,Z2029,1,299,R,L,E,299,5.061000e-02,0.113200,0.050610
3,Z2029,1,299,R,N,E,299,2.656000e-04,0.113200,0.050610
4,Z2029,2,300,L,C,E,300,2.857000e-02,0.116300,0.056170
...,...,...,...,...,...,...,...,...,...,...
4245,ZV67,108,406,T,C,E,406,3.660000e-07,0.000003,0.000001
4246,ZV67,108,406,T,G,E,406,8.689000e-07,0.000003,0.000001
4247,ZV67,108,406,T,I,E,406,1.345000e-06,0.000003,0.000001
4248,ZV67,109,407,I,F,E,407,1.034000e-03,0.001034,0.001034


Data for dms-view

In [30]:
# Table used for the per-PDB `dms-view` files: the two site-level columns are
# renamed to the spellings with a space, and `protein_chain` is dropped (the
# per-PDB cell adds its own `protein_chain` column).
dms_view_colnames = {'site_max_escape': 'site_max escape',
                     'site_total_escape': 'site_total escape'}
dms_view_df = raw_data.rename(columns=dms_view_colnames).drop(columns='protein_chain')

display(HTML(dms_view_df.head().to_html(index=False)))

condition,site,label_site,wildtype,mutation,protein_site,mut_escape,site_total escape,site_max escape
Z2029_1000,1,299,R,C,299,0.04583,0.1132,0.05061
Z2029_1000,1,299,R,K,299,0.01653,0.1132,0.05061
Z2029_1000,1,299,R,L,299,0.05061,0.1132,0.05061
Z2029_1000,1,299,R,N,299,0.000266,0.1132,0.05061
Z2029_1000,2,300,L,C,300,0.02857,0.1163,0.05617


In [29]:
# For every (condition set, output PDB) pair, write a `dms-view` CSV with the
# conditions of that set that are assigned to that PDB.
for set_name, condition_set in condition_sets.items():
    for pdb_name, pdb_specs in output_pdbs_config.items():

        # Conditions of this set to write for this PDB: either all of them
        # (the string "ALL", case-insensitive) or those in an explicit list.
        requested = pdb_specs['conditions']
        if isinstance(requested, str) and requested.upper() == 'ALL':
            pdb_conditions = condition_set
        else:
            assert isinstance(requested, list)
            pdb_conditions = [cond for cond in condition_set if cond in requested]
        if not pdb_conditions:
            continue  # nothing from this set maps to this PDB

        df = dms_view_df.query('condition in @pdb_conditions')
        df = df.assign(condition=df['condition'].map(condition_set))  # re-name to shorter names

        # Put this PDB's chains, space-separated, in a `protein_chain` column
        # placed immediately before `protein_site`.
        chains = ' '.join(pdb_specs['chains'])
        df.insert(df.columns.get_loc('protein_site'), 'protein_chain', chains)

        csv_file = os.path.join(config['supp_data_dir_ZIKV'], f"{set_name}_{pdb_name}_dms-view_data.csv")
        print(f"Writing `dms-view` input file for {set_name} mapped to PDB {pdb_name} to {csv_file}")
        df.to_csv(csv_file, index=False, float_format='%.4g')

Writing `dms-view` input file for MAP_paper_antibodies mapped to PDB 5KVD to results/supp_data/ZIKV/MAP_paper_antibodies_5KVD_dms-view_data.csv


In [44]:
# Write one `dms-view` CSV per condition set straight from `raw_data`, with
# each condition re-labelled with the name the set maps it to.
for set_name, condition_set in condition_sets.items():
    df = raw_data.query('condition in @condition_set')
    df = df.assign(condition=df['condition'].map(condition_set))

    csv_file = os.path.join(config['supp_data_dir_ZIKV'], f"{set_name}_dms-view_data.csv")
    print(f"Writing `dms-view` input file for {set_name} to {csv_file}")
    df.to_csv(csv_file, index=False, float_format='%.4g')

Writing `dms-view` input file for MAP_paper_antibodies to results/supp_data/ZIKV/MAP_paper_antibodies_dms-view_data.csv
