## Conformational buffering

This notebook generates data for Fig. 5a–d

Authors: Frederik E. Knudsen and Giulio Tesei

Contact: giulio.tesei@bio.ku.dk

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval
from scipy.stats import pearsonr, spearmanr
from mpl_toolkits.axes_grid1 import make_axes_locatable
import itertools

# Analysis
***

In [2]:
# Load the IDRome database, using the first CSV column as the index.
# Later cells read its N, nu_svr and SPR_svr columns as reference values
# and look up the selected families by index label.
df_idrome = pd.read_csv('IDRome_DB.csv',index_col=0)

In [3]:
# Read the four gzipped ortholog tables (chunks 0-3) and stack them into one frame
dfs = [pd.read_csv(f'idr_orthologs/data/idr_orthologs_and_human_{i:d}.csv.gz',index_col=0)
       for i in range(4)]
data = pd.concat(dfs)
# Sequence length of every entry
data['N'] = data.fasta.map(len)
# Keep the row labels available as an ordinary column as well
data['OMA_ID'] = data.index

#### Group by ortholog families

In [4]:
# One row per distinct `human` label; every cell holds, as a NumPy array, the
# values of all rows of `data` that share that label
by_human = data.groupby('human')
families = pd.DataFrame(index=data.human.unique())
for family_col, data_col in [('nu','nu_svr'), ('SPR','SPR_svr'), ('N','N'), ('OMA_ID','OMA_ID')]:
    families[family_col] = by_human[data_col].apply(np.array)

#### Filter data

In [5]:
# Keep only the families that contain more than `unique_entry_cutoff`
# distinct sequence lengths
unique_entry_cutoff = 10
print(families.shape)
n_distinct_lengths = families.N.apply(lambda lengths: np.unique(lengths).size)
families = families[n_distinct_lengths > unique_entry_cutoff]
print(families.shape)

(26839, 4)
(21336, 4)


In [6]:
# Keep only the families whose length interval [N.min(), N.max()] contains more
# than `unique_entry_cutoff` distinct sequence lengths (unique N values) in IDRome
unique_entry_cutoff = 10
print(families.shape)
n_ref_lengths = families.N.apply(
    lambda N: df_idrome.query(f'N >= {N.min():d} and N <= {N.max():d}').N.unique().size)
families = families[n_ref_lengths > unique_entry_cutoff]
print(families.shape)

(21336, 4)
(21335, 4)


In [7]:
# Keep only the families whose sequence lengths span more than `range_cutoff`
# (difference between the longest and the shortest member)
range_cutoff = 50
print(families.shape)
length_span = families.N.apply(lambda lengths: np.ptp(lengths))
families = families[length_span > range_cutoff]
print(families.shape)

(21335, 4)
(15235, 4)


In [8]:
def calc_corr_params(row,df_idrome):
    """Relate sequence length to nu and SPR for one family and for IDRome.

    Two data sets are analysed: the family itself (`row.N` against `row.nu`
    and `row.SPR`) and the reference entries of `df_idrome` whose `N` lies
    in the closed interval [row.N.min(), row.N.max()] (`N` against `nu_svr`
    and `SPR_svr`). For each of them the Pearson correlation coefficient,
    its p-value and the slope of a first-degree least-squares fit are
    computed.

    Parameters
    ----------
    row : pandas.Series
        Must provide `N`, `nu` and `SPR` as equally long 1-D arrays. `N`
        has to hold integers because its extrema are formatted with `:d`.
    df_idrome : pandas.DataFrame
        Reference table with the columns `N`, `nu_svr` and `SPR_svr`.

    Returns
    -------
    pandas.Series
        A copy of `row` extended, in this order, with `nu_pcorr`,
        `nu_pcorr_p`, `SPR_pcorr`, `SPR_pcorr_p`, the same four keys with a
        `_ref` suffix, and `nu_alpha`, `SPR_alpha`, `nu_alpha_ref`,
        `SPR_alpha_ref`. The `row` passed in is not modified.
    """
    # DataFrame.apply does not support functions that mutate the object they
    # are handed, so the statistics are written to a copy of the row.
    row = row.copy()
    # Reference values from IDRome within the family's length interval
    df_range = df_idrome.query(f'N >= {row.N.min():d} and N <= {row.N.max():d}')[['N','nu_svr','SPR_svr']]
    # (key suffix, x values, {quantity: y values}); built before any key is
    # added so that only the original contents of the row are read
    datasets = (
        ('', row.N, {'nu': row.nu, 'SPR': row.SPR}),
        ('_ref', df_range.N.values, {'nu': df_range.nu_svr.values,
                                     'SPR': df_range.SPR_svr.values}),
    )
    # Pearson correlation of length against each quantity
    for suffix, lengths, quantities in datasets:
        for quantity, values in quantities.items():
            corr, p = pearsonr(lengths, values)
            row[f'{quantity}_pcorr{suffix}'] = corr
            row[f'{quantity}_pcorr_p{suffix}'] = p
    # Slope of the straight-line fit of each quantity against length
    for suffix, lengths, quantities in datasets:
        for quantity, values in quantities.items():
            alpha, _ = np.polyfit(lengths, values, 1)
            row[f'{quantity}_alpha{suffix}'] = alpha
    return row

In [9]:
# Row-wise: extend every family with the correlation and slope statistics
# returned by calc_corr_params (df_idrome is forwarded as second argument)
families = families.apply(calc_corr_params, axis=1, args=(df_idrome,))

In [10]:
# Slope of each family divided by the slope of its IDRome reference
families['nu_alpha_ratio'] = families['nu_alpha'].div(families['nu_alpha_ref'])
families['SPR_alpha_ratio'] = families['SPR_alpha'].div(families['SPR_alpha_ref'])

In [11]:
# Persist the per-family table, including its array-valued columns, as a pickle
families.to_pickle('idr_orthologs/data/conf_buffering_families.pkl')

In [12]:
# Export length, sequence and position columns for the entries of four
# selected families
selected_humans = ['O75037_811_918','P56470_152_183',
                   'P53667_251_339','P98077_303_484']
exported_columns = ['human','N','fasta', 'first', 'last','N_FL']
data.loc[data.human.isin(selected_humans), exported_columns].to_csv(
    'md_simulations/data/conf_buffering/conf_buffering_seq.csv')

Table containing sequences for human IDRs belonging to protein families displaying conformational buffering

In [13]:
# Families whose N-nu Pearson p-value is below 3.282e-6 and whose slope is more
# than 1.5 times the slope of the IDRome reference.
# NOTE(review): 3.282e-6 looks like 0.05 divided by the 15235 families left
# after filtering - confirm, and recompute it if the filters change.
selection_criteria = 'nu_pcorr_p<3.282e-6 and nu_alpha_ratio>1.5'
sel_families = families.query(selection_criteria).index

In [14]:
# IDRome records of the selected families, restricted to the exported columns
idrome_columns = ['N', 'nu', 'SPR', 'Rg/nm',
                  'Ree/nm', 'fasta', 'is_btw_folded', 'is_nterm',
                  'is_cterm', 'is_idp', 'first', 'last', 'N_FL', 'UniProt_ID']
conf_buf = df_idrome.loc[sel_families, idrome_columns]

In [15]:
# Attach the ortholog data of every selected family. List-like values are
# stored as the text of a Python list literal so that they can be written to
# CSV and parsed again afterwards.
for name in conf_buf.index:
    family = families.loc[name]
    ortho_seqs = data.query(f'human == "{name:s}"').fasta.values
    annotations = {
        'fasta_ortho': "['"+"', '".join(ortho_seqs)+"']",
        'N_ortho': '['+', '.join(map(str, family.N))+']',
        'nu_ortho': '['+', '.join(f'{value:.4f}' for value in family.nu)+']',
        'SPR_ortho': '['+', '.join(f'{value:.4f}' for value in family.SPR)+']',
        'OMA_IDs': "['"+"', '".join(family.OMA_ID)+"']",
        'nu_alpha_ratio': family.nu_alpha_ratio,
        'nu_alpha': family.nu_alpha,
        'nu_pcorr_p': family.nu_pcorr_p,
    }
    # dicts keep insertion order, so the columns are created in the order above
    for column, value in annotations.items():
        conf_buf.loc[name,column] = value

In [16]:
# Write the annotated table of all selected families
conf_buf.to_csv('idr_orthologs/data/conf_buffering_all.csv')

In [17]:
# Write the subset of rows for which neither is_nterm nor is_cterm is set
conf_buf.query('not is_nterm and not is_cterm').to_csv('idr_orthologs/data/conf_buffering_non_term.csv')

In [18]:
# (rows, columns) of the full table and of the subset without is_nterm / is_cterm rows
conf_buf.shape,conf_buf.query('not is_nterm and not is_cterm').shape

((1137, 22), (398, 22))

Load the CSV file with all selected families (`idr_orthologs/data/conf_buffering_all.csv`)

In [19]:
# Read the exported table back, using the first CSV column as the index.
# The list-valued columns were written as text and are parsed in the next cell.
conf_buf = pd.read_csv('idr_orthologs/data/conf_buffering_all.csv',index_col=0)

In [20]:
# Turn the text-encoded list columns back into Python lists
for list_column in ('fasta_ortho', 'N_ortho', 'nu_ortho', 'SPR_ortho', 'OMA_IDs'):
    conf_buf[list_column] = conf_buf[list_column].apply(literal_eval)

In [21]:
# Display the reloaded table with its parsed list columns
conf_buf

Unnamed: 0,N,nu,SPR,Rg/nm,Ree/nm,fasta,is_btw_folded,is_nterm,is_cterm,is_idp,...,N_FL,UniProt_ID,fasta_ortho,N_ortho,nu_ortho,SPR_ortho,OMA_IDs,nu_alpha_ratio,nu_alpha,nu_pcorr_p
P19622_1_241,241,0.513,9.856015,4.290894,9.917744,MEENDPKPGEAAAAVEGQRQPESSPGGGSGGGGGSSPGEADTGRRR...,False,True,False,False,...,333,P19622,[AAAAERPRQPDSSPGGGGSPGDSDTGRRRALMLPAELQAPGNHQH...,"[223, 242, 232, 231, 239, 244, 214, 232, 175, ...","[0.496, 0.487, 0.493, 0.492, 0.488, 0.479, 0.4...","[9.956, 9.948, 9.966, 9.971, 9.953, 9.941, 9.9...","[OCTDE06575_1_223, MANJA46555_1_242, NANGA0576...",2.066869,-0.000187,3.268011e-07
P26599_1_63,63,0.542,10.212306,2.198996,5.201352,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,False,True,False,False,...,531,P26599,[LHSIVPDIAVGTKRGSDELFSTCVSNGPFIMSSSASAANGNDSKK...,"[62, 33, 63, 60, 62, 71, 66, 61, 59, 32, 66, 3...","[0.544, 0.579, 0.543, 0.545, 0.543, 0.537, 0.5...","[10.185, 10.338, 10.182, 10.179, 10.183, 10.13...","[RATNO39646_32_93, FICAL10791_1_33, MANJA21663...",1.571458,-0.000803,7.870696e-13
Q96JA4_1_455,455,0.506,10.022119,6.339878,14.645407,MESTSQDRRATHVITIKPNETVLTAFPYRPHSSLLDFLKGEPRVLG...,False,True,False,False,...,679,Q96JA4,[MESPSQDKRPTHVITIKPNETVWTAFPYRPHSSLLDFLKGEPRVL...,"[490, 407, 467, 403, 255, 494, 425, 492, 441, ...","[0.503, 0.51, 0.509, 0.5, 0.523, 0.5, 0.5, 0.5...","[9.947, 9.969, 9.957, 9.934, 10.038, 9.929, 9....","[RHIBE25024_1_490, SAIBB00842_1_407, CEBIM2627...",2.866771,-0.000095,1.506015e-06
Q96L91_2469_3159,691,0.535,9.984543,8.870963,20.978855,KMTAGKRSPPIKPLLGMNPFQKNPKHASVLAESGINYDKPLPPIQV...,False,False,True,False,...,3159,Q96L91,[KMTAGKRSPPIKPLLGMNPFQKNPKHASVLAESGINYDKPLPPIQ...,"[666, 685, 664, 674, 419, 648, 675, 659, 669, ...","[0.52, 0.518, 0.518, 0.519, 0.54, 0.52, 0.518,...","[9.972, 9.968, 9.972, 9.968, 10.002, 9.969, 9....","[CANLF10214_2445_3110, RATNO05232_2468_3152, C...",2.102158,-0.000064,6.113485e-20
Q9UHB6_1_381,381,0.530,10.011638,6.023770,14.286446,MESSPFNRRQWTSLSLRVTAKELSLVNKNKSSAIVEIFSKYQKAAE...,False,True,False,False,...,759,Q9UHB6,[MESTPFNRRQWTSLSLRVTAKELSLVNKNKSSAIVEIFSKYQKAA...,"[382, 382, 220, 381, 381, 381, 390, 219, 378, ...","[0.508, 0.508, 0.521, 0.51, 0.508, 0.506, 0.50...","[10.01, 10.013, 10.056, 10.017, 10.012, 10.005...","[PROCO09250_1_382, TURTR11477_1_382, HETGA0095...",1.512287,-0.000100,4.752026e-11
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q92563_1_141,141,0.490,10.061512,3.158404,7.464973,MRAPGCGRLVLPLLLLAAAALAEGDAKGLKEGETPGNFMEDEQWLS...,False,True,False,False,...,424,Q92563,[MRAPGSCLVPLLLAIVLAEGDGKGGKEGENPGNFMEDEQWLSSIS...,"[134, 139, 73, 145, 136, 137, 134, 60, 90, 99,...","[0.5, 0.492, 0.549, 0.488, 0.502, 0.482, 0.484...","[10.07, 10.04, 10.2, 10.059, 10.073, 10.07, 10...","[VARKO08165_1_134, ORNAN18115_1_139, URSMA0360...",1.930569,-0.000619,5.052486e-14
P35659_184_319,136,0.506,10.216948,3.192747,7.608550,MHPKPSGKPLPKSKKTCSKGSKKERNSSGMARKAKRTKCPEILSDE...,False,False,False,False,...,375,P35659,[MHPKPSGKPLPKSKKSSSKGSKKERNSSGTTRKSKQTKCPEILSD...,"[137, 134, 137, 136, 135, 137, 135, 133, 135, ...","[0.53, 0.529, 0.524, 0.529, 0.529, 0.515, 0.52...","[10.251, 10.266, 10.228, 10.258, 10.248, 10.21...","[RATNO12942_186_322, MONDO16711_183_316, NANGA...",12.493346,-0.002180,1.047817e-09
Q13017_1451_1502,52,0.557,10.279700,2.036589,4.882123,NGEIVETTNIVAPPPPSNPGQLVEPMVPLQLPPPLQPQLIQPQLQT...,False,False,True,False,...,1502,Q13017,[NGEIVETTNPVAPQPPSLNPGQMAEPMVPLQLPPPLQPQLIQPQL...,"[53, 52, 53, 53, 53, 54, 53, 49, 45, 52, 54, 4...","[0.575, 0.575, 0.574, 0.577, 0.578, 0.577, 0.5...","[10.256, 10.255, 10.254, 10.267, 10.272, 10.26...","[ORNAN06591_1449_1501, DIPOR19334_1452_1503, M...",1.535831,-0.000809,9.556704e-08
Q6IQ49_176_390,215,0.534,10.095657,4.597398,10.824966,QAASSKMVSAEISENRKRQWPTKSQTDRGASAGKRRCFWLGMEGLE...,False,False,False,False,...,451,Q6IQ49,[QAASSKMVSAEISENRKRQWPTKSKTDRGASMGKKRCFWLGMEGL...,"[215, 219, 217, 216, 217, 217, 217, 208, 217, ...","[0.521, 0.518, 0.522, 0.522, 0.526, 0.539, 0.5...","[10.123, 10.077, 10.107, 10.124, 10.113, 10.11...","[AOTNA01007_176_390, CARSF28468_176_394, VICPA...",9.173286,-0.000427,1.194233e-06
