## Conformational buffering

This notebook generates data for Fig. 5a–d

Authors: Frederik E. Knudsen and Giulio Tesei

Contact: giulio.tesei@bio.ku.dk

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from ast import literal_eval
from scipy.stats import pearsonr, spearmanr
from mpl_toolkits.axes_grid1 import make_axes_locatable
import itertools

# Analysis
***

In [2]:
# load IDRome database
df_idrome = pd.read_csv('IDRome_DB.csv',index_col=0)

In [3]:
# load data for the orthologs
dfs = []
for i in range(4):  
    dfs.append(pd.read_csv(f'idr_orthologs/data/idr_orthologs_and_human_{i:d}.csv.gz',index_col=0))    
data = pd.concat(dfs)
data['N'] = data.fasta.apply(len)
data['OMA_ID'] = data.index

#### Group by ortholog families

In [4]:
families = pd.DataFrame(index=data.human.unique())
families['nu'] = data.groupby('human').nu_svr.apply(np.array)
families['SPR'] = data.groupby('human').SPR_svr.apply(np.array)
families['N'] = data.groupby('human').N.apply(np.array)
families['OMA_ID'] = data.groupby('human').OMA_ID.apply(np.array)

#### Filter data

In [5]:
# Filtering family by number of unique linker lengths
unique_entry_cutoff = 10
print(families.shape)
families = families[families.N.apply(lambda N: len(np.unique(N))) > unique_entry_cutoff]
print(families.shape)

(26839, 4)
(21336, 4)


In [6]:
# Filtering families in N range with more than 10 IDRs in IDRome
unique_entry_cutoff = 10
print(families.shape)
families = families[families.N.apply(lambda N: df_idrome.query(
    f'N >= {N.min():d} and N <= {N.max():d}').N.unique().size > unique_entry_cutoff)]
print(families.shape)

(21336, 4)
(21335, 4)


In [7]:
# Filtering families by length range
range_cutoff = 50
print(families.shape)
families = families[families.N.apply(lambda N: N.max() - N.min()) > range_cutoff]
print(families.shape)

(21335, 4)
(15235, 4)


In [8]:
def calc_corr_params(row,df_idrome):
    # Reference values from IDRome
    df_range = df_idrome.query(f'N >= {row.N.min():d} and N <= {row.N.max():d}')[['N','nu_svr','SPR_svr']]
    N = df_range.N.values
    N_lognormalised = np.log(N/N.mean())
    nu = df_range.nu_svr
    SPR = df_range.SPR_svr
    #######
    corr, p = pearsonr(row.N, row.nu)
    row['nu_pcorr'] = corr
    row['nu_pcorr_p'] = p
    corr, p = pearsonr(row.N, row.SPR)
    row['SPR_pcorr'] = corr
    row['SPR_pcorr_p'] = p
    corr, p = pearsonr(N, nu)
    row['nu_pcorr_ref'] = corr
    row['nu_pcorr_p_ref'] = p
    corr, p = pearsonr(N, SPR)
    row['SPR_pcorr_ref'] = corr
    row['SPR_pcorr_p_ref'] = p
    #######
    alpha, _ = np.polyfit(row.N, row.nu, 1)
    row['nu_alpha'] = alpha
    alpha, _ = np.polyfit(row.N, row.SPR, 1)
    row['SPR_alpha'] = alpha
    alpha, _ = np.polyfit(N, nu, 1)
    row['nu_alpha_ref'] = alpha
    alpha, _ = np.polyfit(N, SPR, 1)
    row['SPR_alpha_ref'] = alpha
    return row

In [9]:
families = families.apply(lambda x : calc_corr_params(x,df_idrome), axis=1)

In [10]:
families['nu_alpha_ratio'] = families.nu_alpha/families.nu_alpha_ref
families['SPR_alpha_ratio'] = families.SPR_alpha/families.SPR_alpha_ref

In [11]:
families.to_pickle('idr_orthologs/data/conf_buffering_families.pkl')

In [12]:
data[data.human.isin(['O75037_811_918','P56470_152_183',
                      'P53667_251_339','P98077_303_484'])][['human','N',
                        'fasta', 'first', 'last','N_FL']].to_csv(
    'md_simulations/data/conf_buffering/conf_buffering_seq.csv')

Table containing sequences for human IDRs belonging to protein families displaying conformational buffering

In [13]:
sel_families = families.query(f'nu_pcorr_p<3.282e-6 and nu_alpha_ratio>1.5').index

In [14]:
conf_buf = df_idrome.loc[sel_families,['N', 'nu', 'SPR', 'Rg/nm',
       'Ree/nm', 'fasta', 'is_btw_folded', 'is_nterm',
       'is_cterm', 'is_idp', 'first', 'last', 'N_FL', 'UniProt_ID']]

In [18]:
for name in conf_buf.index:
    conf_buf['fasta_ortho'] = "['"+"', '".join(data.query(f'human == "{name:s}"').fasta.values)+"']"
    conf_buf['N_ortho'] = '['+', '.join([str(i) for i in families.loc[name].N])+']'
    conf_buf['nu_ortho'] = '['+', '.join([f'{i:.4f}' for i in families.loc[name].nu])+']'
    conf_buf['SPR_ortho'] = '['+', '.join([f'{i:.4f}' for i in families.loc[name].SPR])+']'
    conf_buf['OMA_IDs'] = "['"+"', '".join(families.loc[name].OMA_ID)+"']"

In [19]:
conf_buf.to_csv('idr_orthologs/data/conf_buffering_all.csv')

In [20]:
conf_buf.query('not is_nterm and not is_cterm').to_csv('idr_orthologs/data/conf_buffering_non_term.csv')

In [21]:
conf_buf.shape,conf_buf.query('not is_nterm and not is_cterm').shape

((1137, 19), (398, 19))

Load CSV file

In [22]:
conf_buf = pd.read_csv('idr_orthologs/data/conf_buffering_all.csv',index_col=0)

In [23]:
conf_buf['fasta_ortho'] = conf_buf.fasta_ortho.apply(lambda x : literal_eval(x))
conf_buf['N_ortho'] = conf_buf.N_ortho.apply(lambda x : literal_eval(x))
conf_buf['nu_ortho'] = conf_buf.nu_ortho.apply(lambda x : literal_eval(x))
conf_buf['SPR_ortho'] = conf_buf.SPR_ortho.apply(lambda x : literal_eval(x))
conf_buf['OMA_IDs'] = conf_buf.OMA_IDs.apply(lambda x : literal_eval(x))

In [24]:
conf_buf

Unnamed: 0,N,nu,SPR,Rg/nm,Ree/nm,fasta,is_btw_folded,is_nterm,is_cterm,is_idp,first,last,N_FL,UniProt_ID,fasta_ortho,N_ortho,nu_ortho,SPR_ortho,OMA_IDs
P19622_1_241,241,0.513,9.856015,4.289309,9.908980,MEENDPKPGEAAAAVEGQRQPESSPGGGSGGGGGSSPGEADTGRRR...,False,True,False,False,1,241,333,P19622,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
P26599_1_63,63,0.542,10.212306,2.201224,5.206513,MDGIVPDIAVGTKRGSDELFSTCVTNGPFIMSSNSASAANGNDSKK...,False,True,False,False,1,63,531,P26599,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
Q96JA4_1_455,455,0.506,10.022119,6.329568,14.609507,MESTSQDRRATHVITIKPNETVLTAFPYRPHSSLLDFLKGEPRVLG...,False,True,False,False,1,455,679,Q96JA4,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
Q96L91_2469_3159,691,0.535,9.984543,8.862114,20.946098,KMTAGKRSPPIKPLLGMNPFQKNPKHASVLAESGINYDKPLPPIQV...,False,False,True,False,2469,3159,3159,Q96L91,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
Q9UHB6_1_381,381,0.530,10.011638,6.018598,14.272049,MESSPFNRRQWTSLSLRVTAKELSLVNKNKSSAIVEIFSKYQKAAE...,False,True,False,False,1,381,759,Q9UHB6,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Q92563_1_141,141,0.490,10.061512,3.154634,7.453064,MRAPGCGRLVLPLLLLAAAALAEGDAKGLKEGETPGNFMEDEQWLS...,False,True,False,False,1,141,424,Q92563,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
P35659_184_319,136,0.506,10.216948,3.195932,7.617078,MHPKPSGKPLPKSKKTCSKGSKKERNSSGMARKAKRTKCPEILSDE...,False,False,False,False,184,319,375,P35659,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
Q13017_1451_1502,52,0.557,10.279700,2.037794,4.883550,NGEIVETTNIVAPPPPSNPGQLVEPMVPLQLPPPLQPQLIQPQLQT...,False,False,True,False,1451,1502,1502,Q13017,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
Q6IQ49_176_390,215,0.534,10.095657,4.599826,10.838666,QAASSKMVSAEISENRKRQWPTKSQTDRGASAGKRRCFWLGMEGLE...,False,False,False,False,176,390,451,Q6IQ49,"[EDEDEDHRPDDYDEEDEDEVEEEETNRLSG, MKVLRALLLALLL...","[30, 56, 55, 52, 74, 48, 47, 54, 43, 46, 52, 5...","[0.602, 0.535, 0.51, 0.536, 0.524, 0.552, 0.54...","[10.859, 10.413, 10.291, 10.329, 10.313, 10.38...","[RHIBE43560_1_30, MANLE24486_1_56, CEBIM25705_..."
