In [42]:
import copy
import os
import pickle
from datetime import date
from math import log10, floor

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
from Bio import SeqIO
from Bio.Seq import Seq
from scipy.stats import spearmanr
from sklearn.metrics import precision_recall_curve, auc, roc_auc_score, roc_curve

try:
    import simulation as simu
    import analysis as ana
    import HA_analysis as hana
except ModuleNotFoundError:
    from fitnessinference import simulation as simu
    from fitnessinference import analysis as ana
    from fitnessinference import HA_analysis as hana

In [18]:
# Read the amino-acid preference table (a csv file) into a pandas dataframe.
# NOTE: data_folder is an absolute, machine-specific location.
data_folder = os.path.normpath(
    'C:/Users/julia/Documents/Resources/InfluenzaFitnessLandscape'
    '/NewApproachFromMarch2021/InfluenzaFitnessInference/figures/Perth_16_2009_G78D_T212I'
)
data_filename = 'github_jbloomlab_Perth2009-DMS-Manuscript_summary_avgprefs.csv'
data_path = os.path.join(data_folder, data_filename)

data = pd.read_csv(filepath_or_buffer=data_path)

In [39]:
# Fetch the reference sequence of strain Perth_16_2009_G78D_T212I from the
# pickled collection of reference sequences (the unpickled object is indexed
# by strain name below).
strain_name = 'Perth_16_2009_G78D_T212I'

strain_list_filename = 'reference_sequences.data'
strain_list_folder = os.path.normpath(
    'C:/Users/julia/Documents/Resources/InfluenzaFitnessLandscape'
    '/NewApproachFromMarch2021/InfluenzaFitnessInference/figures'
)
strain_list_filepath = os.path.join(strain_list_folder, strain_list_filename)

# unpickling executes whatever the file encodes, so only point this at trusted files
with open(strain_list_filepath, 'rb') as f:
    seq_ref_dict = pickle.loads(f.read())

seq_ref = seq_ref_dict[strain_name]

In [43]:
# Epitope sites (in my numbering) that were used for the inference.
# hana.def_res_epitope_list() is consumed as a list of lists here; the inner
# lists are concatenated, in order, into one flat list of sites.
res_epitope_list = hana.def_res_epitope_list()
res_allepitopes_list = []
for res_list in res_epitope_list:
    res_allepitopes_list.extend(res_list)

In [48]:
## extract preferences and aa_list as list/array (sequence position in array has my numbering)

# list of amino acids: every column header except the first (non-preference) column
aa_list = list(data.columns)[1:]

# transform preference table into a float array of shape N_site rows * num_aa cols.
# The preference columns are selected BEFORE the conversion so that the dtype of the
# first column cannot leak into the array (a non-numeric first column would otherwise
# produce an object array, on which np.log fails in the later cells).
aa_pref_arr = data.iloc[:, 1:].to_numpy(dtype=float)

# extract preference array and ref sequence for epitope sites only (for which I did the inference)
aa_pref_epi = aa_pref_arr[res_allepitopes_list, :]
# list(...) splits the sequence into single residues first, so a plain string works too
# (np.array('ACD') would be a 0-d array that cannot be indexed by site)
seq_ref_epi = np.array(list(seq_ref))[res_allepitopes_list]

In [79]:
## Measured mutational effects at each epitope site i, derived from the aa preferences p:
##   max effect = log(max(p_mut(i)) / p_ref(i)), the intrinsic effect of the easiest
##                mutation at site i away from the aa of the reference seq
##   avg effect = avg(log(p_mut(i) / p_ref(i))), the average mutational effect

max_mut_effect_list = []
avg_mut_effect_list = []
for site_prefs, ref_aa in zip(aa_pref_epi, seq_ref_epi):
    ref_col = aa_list.index(ref_aa)  # column of the reference state
    pref_ref = site_prefs[ref_col]  # preference for the reference state
    prefs_mut = np.delete(site_prefs, ref_col)  # preferences for all other (mutated) states
    # largest preference among the mutated states, relative to the reference state
    max_mut_effect_list.append(np.log(np.amax(prefs_mut) / pref_ref))
    # mean of the log preference ratios over the mutated states
    avg_mut_effect_list.append(np.mean(np.log(prefs_mut / pref_ref)))

In [86]:
## Shannon entropy of the aa preferences at each epitope site: -sum(p * log(p)), natural log.
## A preference of exactly 0 would give NaN here, because 0 * log(0) evaluates to 0 * -inf.
shannon_e_list = [
    -np.sum(np.log(site_prefs) * site_prefs)
    for site_prefs in aa_pref_epi[:len(seq_ref_epi)]
]

In [66]:
## Load the inferred fitness coefficients for this reference sequence
sigma_h = 1
D0 = 5

# the result file sits in the same folder as the preference table
seqref_results_folder = data_folder
result_filename = ''.join(
    ['HA_Inference_noCouplings', 'sigma_h_', str(sigma_h), '_D0_', str(D0), '.data']
)
result_filepath = os.path.join(seqref_results_folder, result_filename)

# unpickling executes whatever the file encodes, so only point this at trusted files
with open(result_filepath, 'rb') as f:
    ana_result_dict = pickle.loads(f.read())

# inferred fitness params and their std (in same order as mut_effect_list)
h_inf_list, h_inf_std_list = ana_result_dict['h_inf_list'], ana_result_dict['h_inf_std_list']

In [88]:
## calculate the rank correlation between inferred and measured mutational effects and with measured shannon entropy
## Each print shows the Spearman coefficient followed by its p-value.

# spearmanr is imported explicitly (from scipy.stats import spearmanr): a bare `import scipy`
# does not guarantee that the scipy.stats submodule is loaded on older SciPy releases, so
# `scipy.stats.spearmanr` only worked when some other import had already pulled it in.
rhoMaxEffect, prho_MaxEffect = spearmanr(max_mut_effect_list, h_inf_list)
print(rhoMaxEffect, prho_MaxEffect)
rhoAvgEffect, prho_AvgEffect = spearmanr(avg_mut_effect_list, h_inf_list)
print(rhoAvgEffect, prho_AvgEffect)
rho_shannon, prho_shannon = spearmanr(shannon_e_list, h_inf_list)
print(rho_shannon, prho_shannon)

0.11555543692948274 0.18873717237619975
0.12722468371323334 0.14759231430384054
0.11969785939251587 0.17326869102572026


In [81]:
## plot the inferred (y-axis) vs measured (x-axis) mutational effects
## One figure per measured quantity is written into data_folder; both figures share the
## same layout, so they are produced by a single loop instead of two copy-pasted sections.
plt_set = ana.set_plot_settings()

# (file-name prefix, measured x-values, x-axis label) for each figure
plot_specs = (
    ('hInferred_vs_hMaxMeasured_', max_mut_effect_list, 'measured max log aa preference ratios'),
    ('hInferred_vs_hAvgMeasured_', avg_mut_effect_list, 'measured avg. log aa preference ratios'),
)

for name_prefix, measured_effects, x_label in plot_specs:
    fig_name = name_prefix + 'sigma_h_' + str(sigma_h) + '_D0_' + str(D0) + plt_set['file_extension']
    this_plot_filepath = os.path.join(data_folder, fig_name)
    fig = plt.figure(figsize=(plt_set['full_page_width']/2, 3))
    ax1 = fig.add_axes(plt_set['plot_dim_1pan'][0])
    # error bars: std of the inferred h, drawn along the y-axis
    ax1.errorbar(measured_effects, h_inf_list, yerr=h_inf_std_list,
                 marker='o', linestyle='none', zorder=1)
    ax1.set_xlabel(x_label)
    ax1.set_ylabel('inferred $h$')
    ax1.set_ylim(-1.5, 1.5)

    # save and close this specific figure rather than relying on pyplot's "current figure"
    fig.savefig(this_plot_filepath, bbox_inches='tight')
    plt.close(fig)

In [37]:
## 

[0.04007939 0.05493957 0.02124495 0.03012344 0.03527437 0.0118324
 0.02453517 0.07592211 0.02110395 0.23884933 0.05486379 0.02444667
 0.03557482 0.0235555  0.04055357 0.02232434 0.05319938 0.13476075
 0.03710657 0.01970993]
