## Conformational buffering

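This notebook generates the data for Fig. 5a–d: for each family of IDR orthologs, it computes the Pearson correlation and the linear slope of `nu` and `SPR` against sequence length, together with the corresponding reference values from IDRome sequences in the same length range.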

Authors: Frederik E. Knudsen and Giulio Tesei

Contact: giulio.tesei@bio.ku.dk

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import pearsonr, spearmanr
from mpl_toolkits.axes_grid1 import make_axes_locatable
import itertools

# Analysis
***

In [4]:
# Read the IDRome table; the first CSV column is used as the index
idrome_csv = 'IDRome_DB.csv'
df_idrome = pd.read_csv(idrome_csv, index_col=0)

In [5]:
# Read the four gzipped CSV chunks (suffixes 0-3) and stack them into one table
dfs = [
    pd.read_csv(f'idr_orthologs/data/idr_orthologs_and_human_{i:d}.csv.gz', index_col=0)
    for i in range(4)
]
data = pd.concat(dfs)
# N is the length of each entry's `fasta` string
data['N'] = data['fasta'].map(len)

#### Group by ortholog families

In [6]:
# One row per unique `human` identifier; every cell holds a NumPy array with
# the values of all entries that share that identifier.
by_human = data.groupby('human')
families = pd.DataFrame(index=data['human'].unique())
families['nu'] = by_human['nu_svr'].apply(np.array)
families['SPR'] = by_human['SPR_svr'].apply(np.array)
families['N'] = by_human['N'].apply(np.array)

#### Filter data

In [7]:
# Keep only families with more than `unique_entry_cutoff` distinct sequence lengths
unique_entry_cutoff = 10
print(families.shape)
n_distinct_lengths = families['N'].apply(lambda lengths: len(np.unique(lengths)))
families = families[n_distinct_lengths > unique_entry_cutoff]
print(families.shape)

(26839, 3)
(21336, 3)


In [8]:
# Keep only families whose length interval [min(N), max(N)] contains more than
# `unique_entry_cutoff` distinct sequence lengths in the IDRome table
unique_entry_cutoff = 10
print(families.shape)
has_reference_coverage = families['N'].apply(
    lambda lengths: df_idrome.query(
        f'N >= {lengths.min():d} and N <= {lengths.max():d}'
    )['N'].unique().size > unique_entry_cutoff
)
families = families[has_reference_coverage]
print(families.shape)

(21336, 3)
(21335, 3)


In [9]:
# Keep only families whose lengths span (max - min) more than `range_cutoff`
range_cutoff = 50
print(families.shape)
length_span = families['N'].apply(lambda lengths: lengths.max() - lengths.min())
families = families[length_span > range_cutoff]
print(families.shape)

(21335, 3)
(15235, 3)


In [10]:
def calc_corr_params(row, df_idrome):
    """Correlate nu and SPR with sequence length for one family and for IDRome.

    For the family stored in `row`, and for the IDRome entries whose length
    lies in the family's inclusive range [min(N), max(N)], compute the Pearson
    correlation (with p-value) and the slope of a first-degree polynomial fit
    of `nu` and `SPR` against `N`.

    Parameters
    ----------
    row : pandas.Series
        Must provide the entries 'N', 'nu' and 'SPR', each an array of equal
        length ('N' must support `.min()` and `.max()`).
    df_idrome : pandas.DataFrame
        Reference table with the columns 'N', 'nu_svr' and 'SPR_svr'.

    Returns
    -------
    pandas.Series
        A copy of `row` extended, in this order, with
        'nu_pcorr', 'nu_pcorr_p', 'SPR_pcorr', 'SPR_pcorr_p',
        'nu_pcorr_ref', 'nu_pcorr_p_ref', 'SPR_pcorr_ref', 'SPR_pcorr_p_ref',
        'nu_alpha', 'SPR_alpha', 'nu_alpha_ref', 'SPR_alpha_ref'.
        The `*_ref` entries are computed from the IDRome reference subset.

    Notes
    -----
    The input `row` is not modified: `DataFrame.apply` does not support
    functions that mutate the object they are given.
    `pearsonr` and `np.polyfit` need at least two data points, so both the
    family and the reference subset must contain at least two entries.
    """
    # Reference values from IDRome within the family's inclusive length range
    in_range = df_idrome['N'].between(row['N'].min(), row['N'].max())
    df_range = df_idrome.loc[in_range, ['N', 'nu_svr', 'SPR_svr']]
    reference = {
        'N': df_range['N'].to_numpy(),
        'nu': df_range['nu_svr'].to_numpy(),
        'SPR': df_range['SPR_svr'].to_numpy(),
    }
    family = {'N': row['N'], 'nu': row['nu'], 'SPR': row['SPR']}

    # Work on a copy so the caller's Series is left untouched
    row = row.copy()

    # (key suffix, data source); family first, then reference, so that the
    # order of the added entries stays stable for downstream consumers
    sources = (('', family), ('_ref', reference))
    observables = ('nu', 'SPR')

    # Pearson correlation coefficient and p-value versus sequence length
    for suffix, values in sources:
        for name in observables:
            corr, p = pearsonr(values['N'], values[name])
            row[f'{name}_pcorr{suffix}'] = corr
            row[f'{name}_pcorr_p{suffix}'] = p

    # Slope of the linear least-squares fit versus sequence length
    for suffix, values in sources:
        for name in observables:
            alpha, _ = np.polyfit(values['N'], values[name], 1)
            row[f'{name}_alpha{suffix}'] = alpha

    return row

In [11]:
# Row-wise: extend every family with its correlation and slope statistics
families = families.apply(calc_corr_params, axis=1, args=(df_idrome,))

In [12]:
# Ratio of each family's fitted slope to the slope of its IDRome reference
families['nu_alpha_ratio'] = families['nu_alpha'].div(families['nu_alpha_ref'])
families['SPR_alpha_ratio'] = families['SPR_alpha'].div(families['SPR_alpha_ref'])

In [13]:
# Serialise the `families` table to disk
families_pickle = 'idr_orthologs/data/conf_buffering_families.pkl'
families.to_pickle(families_pickle)

In [16]:
# Write the entries of four selected families, restricted to a few columns, to CSV
selected_families = ['O75037_811_918', 'P56470_152_183',
                     'P53667_251_339', 'P98077_303_484']
exported_columns = ['human', 'N', 'fasta', 'first', 'last', 'N_FL']
is_selected = data['human'].isin(selected_families)
data.loc[is_selected, exported_columns].to_csv(
    'md_simulations/data/conf_buffering/conf_buffering_seq.csv')