<a href="https://colab.research.google.com/github/KULL-Centre/ColabCALVADOS/blob/main/nu_SVR_predictor/nu_SVR_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This Colab notebook shows how to train and test a machine-learning model that predicts the compaction of intrinsically disordered regions (IDRs) based on selected sequence features [1].

The training data consist of apparent scaling exponents for human IDRs, estimated from simulations using the CALVADOS model [2].

The notebook allows exploration of how the choice of hyperparameters affects the accuracy and transferability of the support vector regression (SVR) model.

1. G. Tesei, A. I. Trolle, N. Jonsson, J. Betz, F. Pesce, K. E. Johansson, K. Lindorff-Larsen __Conformational ensembles of the human intrinsically disordered proteome__ _Nature_ 2024 626(8000):897-904 DOI: https://doi.org/10.1038/s41586-023-07004-5
2. G. Tesei and K. Lindorff-Larsen __Improved predictions of phase behaviour of intrinsically disordered proteins by tuning the interaction range [version 2; peer review: 2 approved]__ _Open Research Europe_ 2023 2(94) DOI: https://doi.org/10.12688/openreseurope.14967.2

In [None]:
# @title 1. Set the environment
!pip install wget
!pip install fastprogress
!pip install pandas
!pip install -U scikit-learn

In [None]:
# @title 2. Load the IDRome database

#@markdown The IDRome database contains the apparent scaling exponents of 28,058 IDRs obtained from CALVADOS simulations.

#@markdown We load the database, discard entries with duplicated sequences, and split the remaining entries into training, validation, and test sets by randomly assigning 30, 60, and 10 % of them, respectively.

import os
import random
import shutil
import subprocess
import warnings

import ipywidgets as widgets
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import wget
from google.colab import files
from matplotlib.colors import LogNorm
from mpl_toolkits.axes_grid1 import make_axes_locatable
from scipy.stats import binned_statistic, binned_statistic_2d
from scipy.stats import pearsonr
from sklearn import svm
from sklearn.inspection import permutation_importance
def se(x):
    """Return np.std(x) divided by the square root of the number of elements of array x."""
    return np.std(x)/np.sqrt(x.size)
# short name reported for this function wherever its __name__ is displayed
se.__name__ = 'SE'

# colour palette, addressable by name through pt_colors
hexcolors = ['#EE7733', '#0077BB', '#33BBEE', '#EE3377', '#CC3311', '#009988', '#BBBBBB']
colornames = ['orange','blue','cyan','magenta','red','teal','grey']
pt_colors = dict(zip(colornames,hexcolors))

# global matplotlib settings (fonts for text and math, embeddable fonts in PDFs)
plt.rcParams['font.family'] = 'sans-serif'
plt.rcParams['font.sans-serif'] = ['Arial'] + plt.rcParams['font.sans-serif']
mpl.rcParams['mathtext.fontset'] = 'custom'
mpl.rcParams['mathtext.rm'] = 'Times New Roman'
mpl.rcParams['mathtext.it'] = 'Times New Roman:italic'
mpl.rc('pdf', fonttype=42)

# input features
features_nu = ['scd','shd','kappa','fcr','mean_lambda']
# axis labels for the features; raw strings keep the LaTeX backslashes intact
feature_dict = {'scd':'SCD','shd':'SHD',
                'mean_lambda':r'$\langle \lambda \rangle$',
                'fcr':'FCR','kappa':r'$\kappa$'}

# load IDRome database
# Download only when the file is not there yet: re-running the cell would
# otherwise leave additional copies of the CSV in the working directory.
idrome_csv = 'IDRome_DB.csv'
if not os.path.exists(idrome_csv):
    wget.download("https://raw.githubusercontent.com/KULL-Centre/_2023_Tesei_IDRome/refs/heads/main/IDRome_DB.csv")
df_idrome = pd.read_csv(idrome_csv,index_col=0)

# randomly draw indices for 10%, 30%, and 60% of dataset
# Rows with a duplicated `fasta` value are dropped first; the fixed
# random_state makes the shuffle, and therefore the split, reproducible.
all_indices = df_idrome[~df_idrome.fasta.duplicated()].sample(frac=1,axis='index',random_state=17).index
n_first10 = int(all_indices.size*.1)
n_first40 = int(all_indices.size*.4)
inds10 = all_indices[:n_first10]
inds30 = all_indices[n_first10:n_first40]
inds60 = all_indices[n_first40:]

# the three subsets must cover the shuffled index exactly once
assert inds10.size + inds30.size + inds60.size == all_indices.size

print('Number of IDRs in the training set:',inds30.size)
print('Number of IDRs in the validation set:',inds60.size)
print('Number of IDRs in the test set:',inds10.size)

In [None]:
# @title 3. Set the hyperparameters of the SVR model

# @markdown Set the $\epsilon$ parameter. A good starting value is 0.02 which corresponds to the error on the $\nu$ values from simulations.
epsilon = 0.02  #@param {type:"float"}
# @markdown Set the value for the regularization parameter $C$.
C = 100 #@param {type:"float"}

# @markdown This cell trains and tests an SVR model with the selected hyperparameters. Tip: Try C=1, 100, and 1000. Using the Pearson correlation coefficient as a performance metric, how does this choice affect the accuracy and transferability of the model?


def plot_nu_correlation(ax, nu_sim, nu_svr, axis_range, interval=.01, decimal=2):
    """Draw predicted vs. simulated nu on `ax` as a log-scaled 2D histogram.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    nu_sim : pandas Series of nu values from simulations (horizontal axis).
    nu_svr : pandas Series of nu values predicted by the SVR model (vertical axis).
    axis_range : (min, max) pair spanned by the identity line.
    interval : width of the square histogram bins.
    decimal : number of decimals the bin edges are rounded to.

    Returns
    -------
    The Pearson correlation coefficient between `nu_sim` and `nu_svr`.
    """
    pr = pearsonr(nu_sim,nu_svr)[0]

    # identity line; its legend entry carries the Pearson correlation coefficient
    diagonal = np.linspace(axis_range[0],axis_range[1],100)
    ax.plot(diagonal,diagonal,lw=.5,zorder=4,color='k',label='$r={:.2f}$'.format(pr))

    # the same bin edges, spanning the range of nu_sim, are used on both axes
    bins = np.round(np.arange(nu_sim.min(),nu_sim.max(),interval),decimal)
    h,_,_ = np.histogram2d(nu_sim,nu_svr,bins=bins)
    # histogram2d bins its first argument along the rows of h, whereas imshow
    # draws rows along the vertical axis: transpose so that nu_sim runs along x.
    # extent takes the outer edges of the image, i.e. the first and last bin edge.
    im = ax.imshow(h.T, cmap=plt.cm.viridis_r,extent=[bins[0],bins[-1],bins[0],bins[-1]],
                   origin='lower',alpha=1,norm=LogNorm(vmin=1,vmax=h.max()),
                   aspect='auto',interpolation='nearest')
    # colorbar in a narrow axes appended to the right of the panel
    divider = make_axes_locatable(ax)
    cax = divider.new_horizontal(size="{:.1f}%".format(5), pad=.1)
    ax.figure.add_axes(cax)
    ax.figure.colorbar(im, cax=cax, orientation='vertical',
                       label='Number of IDRs')
    return pr


# font settings are applied before the figure is created so that a first run
# and a re-run of this cell produce the same figure
plt.rc('font', **{'family':'sans-serif','sans-serif':['Arial'], 'size': 12})
f, (ax1,ax2,ax3) = plt.subplots(1, 3, sharex=False, sharey=False, figsize=(9,3.5))

#############

# fit the SVR model on the training set
X = np.array(df_idrome.loc[inds30,list(features_nu)])
y = np.array(df_idrome.loc[inds30,'nu'])
model_nu = svm.SVR(epsilon=epsilon,C=C,verbose=True)
model_nu.fit(X,y)

# predict nu for every entry of the database in a single call, rounded to 3 decimals
df_idrome['nu_svr'] = np.round(model_nu.predict(np.array(df_idrome[list(features_nu)])),3)

#############

# all panels share the nu range of the training set for identity line and axis limits
axis_range = (df_idrome.loc[inds30,'nu'].min(),df_idrome.loc[inds30,'nu'].max())

panels = ((ax1,inds30,'Training set'),
          (ax2,inds60,'Validation set'),
          (ax3,inds10,'Test set'))

for ax,inds,title in panels:
    plot_nu_correlation(ax,df_idrome.loc[inds,'nu'],df_idrome.loc[inds,'nu_svr'],axis_range)

    ax.set_aspect('equal')
    ax.set_xlim(*axis_range)
    ax.set_ylim(*axis_range)
    # handlelength=0 hides the line sample so that only the r value is shown
    ax.legend(frameon=False,loc='upper left',handlelength=0,borderpad=0,handletextpad=.4)

    ax.set_xlabel('$\\nu$ from CALVADOS')
    ax.set_ylabel('$\\nu$ from SVR model')
    ax.set_title(title)

plt.tight_layout()
plt.savefig(f'corr_{epsilon}_{C}.jpg',dpi=600)
files.download(f'corr_{epsilon}_{C}.jpg')

In [None]:
# @title 4.  Permutation feature importance testing

# @markdown This cell uses permutation feature importance testing to rank the sequence descriptors on the basis of their effect on model performance.
# @markdown How does the regularization strength influence the relative importance of the five sequence features?

In [None]:
# Rank the input features of the trained SVR model by permutation importance
# (first panel) and show, for each feature, its binned average as a function
# of nu from simulations (grey) and of nu predicted by the SVR model (red).


def binned_mean_se(nu_values, feature_values, bins):
    """Bin `feature_values` by `nu_values` and return (mean, SE, count) per bin.

    `bins` are the bin edges; the SE is computed with the `se` function.
    """
    average = binned_statistic(nu_values,feature_values,bins=bins)[0]
    standard_error = binned_statistic(nu_values,feature_values,statistic=se,
                                      bins=bins)[0]
    counts = binned_statistic(nu_values,feature_values,statistic='count',
                              bins=bins)[0]
    return average, standard_error, counts


# font settings are applied before the figure is created so that a first run
# and a re-run of this cell produce the same figure
plt.rc('font', **{'family':'sans-serif','sans-serif':['Arial'], 'size': 12})
f, axes = plt.subplots(2, 3, sharex=False, sharey=False, figsize=(8,4))

axes = axes.flatten()

# array form allows indexing with the sorted positions below
features_nu = np.array(features_nu)

# Permutation importance of each feature for the trained model, evaluated on
# the test set (entries not used for fitting) with the model's default score.
# The fixed random_state makes the shuffles, and hence the ranking, reproducible.
per_imp_nu = permutation_importance(model_nu,
                                    np.array(df_idrome.loc[inds10,list(features_nu)]),
                                    np.array(df_idrome.loc[inds10,'nu']),
                                    n_repeats=10,random_state=17)

# horizontal bars sorted by increasing importance (most important feature on top)
sorted_idx = per_imp_nu.importances_mean.argsort()
axes[0].barh(features_nu[sorted_idx],per_imp_nu.importances_mean[sorted_idx],color=pt_colors['grey'],height=.5)
feature_labels = [feature_dict[name] for name in features_nu[sorted_idx]]
axes[0].set_yticks(range(len(feature_labels)))
axes[0].set_yticklabels(feature_labels)
axes[0].set_xlabel('Permutation Importance for $\\nu_\\mathrm{SVR}$')

bin_width = .015
bins = np.arange(0.14, .701, bin_width)
x = bins[:-1]+bin_width/2

# one panel per feature, in order of decreasing importance
for ax,feature,label in zip(axes[1:6],features_nu[sorted_idx][::-1],feature_labels[::-1]):

    # feature binned by nu from simulations; only bins with more than 2 entries are drawn
    average, standard_error, counts = binned_mean_se(df_idrome.nu,df_idrome[feature],bins)
    shown = counts>2
    ax.errorbar(x[shown],average[shown],yerr=standard_error[shown],ms=2,marker='o',lw=0,zorder=0,
             elinewidth=1,capsize=2,capthick=1,label=label,color=pt_colors['grey'])

    # feature binned by nu predicted by the SVR model
    average, standard_error, counts = binned_mean_se(df_idrome.nu_svr,df_idrome[feature],bins)
    shown = counts>2
    ax.errorbar(x[shown],average[shown],yerr=standard_error[shown],ms=2,marker='o',lw=0,zorder=0,
             elinewidth=1,capsize=2,capthick=1,label=label,color=pt_colors['red'])

    ax.set_ylabel(label)
    # y limits follow the bin averages of the SVR-binned series, with some padding
    ymin = np.nanmin(average)
    ymin = ymin*1.3 if ymin <0 else ymin*.8
    ymax = np.nanmax(average)*1.05
    # dotted guide lines at nu = 0.45 and 0.55, clipped by the y limits set below
    ax.vlines([.45,.55],ymin=-40,ymax=80,color='k',ls=':',lw=.7)
    ax.set_ylim(ymin,ymax)
    ax.set_xlim(.125,.708)
    ax.set_xticks(np.arange(.2,.71,.2))
    ax.set_xlabel('$\\nu$')

plt.tight_layout()
plt.savefig(f'per_imp_{epsilon}_{C}.jpg',dpi=600)
files.download(f'per_imp_{epsilon}_{C}.jpg')