<a href="https://colab.research.google.com/github/KULL-Centre/_2023_Tesei_IDRome/blob/main/IDR_SVR_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Preliminary information:**

This Colab notebook predicts the scaling exponent, $\nu$, and the conformational entropy per residue, $S_\text{conf}/N$, of an intrinsically disordered protein (IDP) or protein region (IDR) from its amino acid sequence.

Predictions are generated by a support vector regression (SVR) model [1], which was trained on simulations of all the IDPs and IDRs of the human proteome performed using the CALVADOS model [2].

Amino acid sequences can be provided by (i) uploading a single fasta file containing one or several entries, (ii) uploading multiple fasta files, or (iii) pasting sequences one at a time into the input text box (the input cell can be executed repeatedly).

<b><font color='#FA003F'>How to cite this notebook:</font></b> If you use the $\nu$ values generated by the SVR model, we ask you to cite Tesei, Trolle et al. [1]. 
1. G. Tesei, A. I. Trolle, N. Jonsson, J. Betz, F. Pesce, K. E. Johansson, K. Lindorff-Larsen __Conformational ensembles of the human intrinsically disordered proteome: Bridging chain compaction with function and sequence conservation__ _bioRxiv_ 2023 2023.05.08.539815 DOI: https://doi.org/10.1101/2023.05.08.539815
2. G. Tesei and K. Lindorff-Larsen __Improved predictions of phase behaviour of intrinsically disordered proteins by tuning the interaction range [version 2; peer review: 2 approved]__ _Open Research Europe_ 2023 2(94) DOI: https://doi.org/10.12688/openreseurope.14967.2
---


In [None]:
#@title <b>Preliminary operations</b>
import subprocess
import sys

# Install through the interpreter that runs this kernel ("python -m pip") so the
# packages end up in the environment the notebook imports from, and use
# check=True so a failed install stops here instead of surfacing later as an
# import or unpickling error.
subprocess.run([sys.executable, '-m', 'pip', 'install', 'wget', 'localcider==0.1.18'], check=True)
# scikit-learn is pinned to 1.0.2 — presumably the version the SVR models were
# serialized with (TODO confirm); any preinstalled version is removed first.
subprocess.run([sys.executable, '-m', 'pip', 'uninstall', 'scikit-learn', '-y'], check=True)
subprocess.run([sys.executable, '-m', 'pip', 'install', 'scikit-learn==1.0.2'], check=True)
import numpy as np
import itertools
from localcider.sequenceParameters import SequenceParameters
import wget
import sys
import os
from joblib import dump, load
import pandas as pd
from google.colab import files
from ipywidgets import IntProgress
from IPython.display import display
from IPython.display import clear_output

def calc_seq_prop(seq,residues,Nc,Cc,Hc):
    """Calculate the sequence descriptors of a single amino acid sequence.

    Parameters
    ----------
    seq : str or sequence of str
        One-letter amino acid sequence; must contain at least two residues.
    residues : pandas.DataFrame
        Per-residue parameters indexed by one-letter code. The columns
        ``lambdas`` and ``q`` are read. The frame is copied, not modified.
    Nc : int
        If 1, the charge of the first residue is increased by 1.
    Cc : int
        If 1, the charge of the last residue is decreased by 1.
    Hc : float
        Histidine charge selector: below 0.5 histidine gets q = 0,
        otherwise q = 1.

    Returns
    -------
    pandas.Series
        Values indexed by ``'fK','fR','fE','fD','fARO','SCD','SHD','kappa',
        'FCR','mean_lambda','NCPR'``.

    Raises
    ------
    ValueError
        If ``seq`` has fewer than two residues (no residue pairs exist).
    """
    seq = list(seq)
    N = len(seq)
    if N < 2:
        raise ValueError('calc_seq_prop requires a sequence of at least two residues')
    # one-letter string handed to localcider for kappa; residues whose charge is
    # modified below are mapped onto K (positive), D (negative) or A (neutral)
    fasta_kappa = np.array(seq)
    r = residues.copy()

    # calculate properties that do not depend on charges
    fK = seq.count('K')/N
    fR = seq.count('R')/N
    fE = seq.count('E')/N
    fD = seq.count('D')/N
    fARO = sum(seq.count(a) for a in ('W','Y','F'))/N
    mean_lambda = np.mean(r.loc[seq].lambdas)
    pairs = np.array(list(itertools.combinations(seq,2)))
    pairs_indices = np.array(list(itertools.combinations(range(N),2)))
    # calculate sequence separations
    ij_dist = np.diff(pairs_indices,axis=1).flatten().astype(float)
    # calculate lambda sums
    ll = r.lambdas.loc[pairs[:,0]].values+r.lambdas.loc[pairs[:,1]].values
    # calculate SHD
    beta = -1
    shd = np.sum(ll*np.power(np.abs(ij_dist),beta))/N

    # fix charges: terminal residues become pseudo-residues X / Z that carry
    # the original parameters plus the extra terminal charge
    if Nc == 1:
        r.loc['X'] = r.loc[seq[0]]
        r.loc['X','q'] = r.loc[seq[0],'q'] + 1.
        seq[0] = 'X'
        fasta_kappa[0] = 'K' if r.loc['X','q'] > 0 else 'A'
    if Cc == 1:
        r.loc['Z'] = r.loc[seq[-1]]
        r.loc['Z','q'] = r.loc[seq[-1],'q'] - 1.
        seq[-1] = 'Z'
        fasta_kappa[-1] = 'D' if r.loc['Z','q'] < 0 else 'A'
    # a terminal histidine was already renamed to X / Z and is not matched here
    is_his = np.array(seq) == 'H'
    if Hc < 0.5:
        r.loc['H', 'q'] = 0
        fasta_kappa[is_his] = 'A'
    elif Hc >= 0.5:
        r.loc['H', 'q'] = 1
        fasta_kappa[is_his] = 'K'

    # calculate properties that depend on charges
    pairs = np.array(list(itertools.combinations(seq,2)))
    # calculate charge products
    qq = r.q.loc[pairs[:,0]].values*r.q.loc[pairs[:,1]].values
    # calculate SCD
    scd = np.sum(qq*np.sqrt(ij_dist))/N
    SeqOb = SequenceParameters(''.join(fasta_kappa))
    kappa = SeqOb.get_kappa()
    fcr = r.q.loc[seq].abs().mean()
    ncpr = r.q.loc[seq].mean()

    # a Series indexed by feature name, so callers can select features with .loc
    return pd.Series(data=[fK,fR,fE,fD,fARO,scd,shd,kappa,fcr,mean_lambda,ncpr],
                     index=['fK','fR','fE','fD','fARO','SCD','SHD','kappa','FCR','mean_lambda','NCPR'])

# one-letter codes accepted by the sequence checks in the input cells
aa = ['A','C','D','E','F','G','H','I','K','L','M','N','P','Q','R','S','T','V','W','Y']

url = 'https://github.com/KULL-Centre/_2023_Tesei_IDRome/blob/main'

# Download each file only if it is not already in the working directory.
# "?raw=true" is required on every GitHub /blob/ URL: without it GitHub serves
# the HTML page of the file instead of the file content.
if not os.path.exists('svr_model_nu.joblib'):
    wget.download(url+'/svr_models/svr_model_nu.joblib?raw=true')
if not os.path.exists('svr_model_SPR.joblib'):
    wget.download(url+'/svr_models/svr_model_SPR.joblib?raw=true')
if not os.path.exists('residues.csv'):
    wget.download(url+'/md_simulations/data/residues.csv?raw=true')

# NOTE(review): joblib.load unpickles the file and can execute arbitrary code;
# only load model files obtained from the trusted repository above.
model_nu = load('svr_model_nu.joblib')
model_spr = load('svr_model_SPR.joblib')
# names of the calc_seq_prop outputs passed to each model, in this order
features_nu = ['SCD','SHD','kappa','FCR','mean_lambda']
features_spr = ['SCD','SHD','mean_lambda']

# per-residue parameters, indexed by the one-letter code in column 'one'
residues = pd.read_csv('residues.csv',index_col='one')

# sequences registered by the input cells: name -> one-letter sequence
fasta_dict = {}
# results table filled by the prediction cell, one row per sequence name
df = pd.DataFrame(columns=['nu_svr','SPR_svr','mean_lambda','SHD','SCD','kappa','FCR','NCPR',
                           'fK','fR','fE','fD','fARO'])

In [None]:
#@title <b>Upload sequence(s)</b>
#@markdown Upload fasta file. File with multiple sequences and multiple files upload is supported.
current_upload = []
fasta_file = files.upload()
for fn in fasta_file.keys():
    # read the uploaded file and close the handle right away
    with open(fn) as fasta_handle:
        fasta = fasta_handle.readlines()
    # name of the entry currently being read; None until the first header so
    # sequence text is never attached to an entry from another file or run
    name = None
    orphan_warned = False
    for l in fasta:
        l = l.strip()
        if not l:
            # skip blank lines
            continue
        if l.startswith('>'):
            name = l[1:].strip()
            fasta_dict[name] = ''
            current_upload.append(name)
        elif name is None:
            if not orphan_warned:
                print('WARNING: {} contains sequence data before the first header line. These lines will be ignored.'.format(fn))
                orphan_warned = True
        else:
            fasta_dict[name] += l

#check sequence
# dict.fromkeys removes repeated names (keeping order) so an entry that was
# already discarded is not deleted a second time
for x in dict.fromkeys(current_upload):
    if len(fasta_dict[x]) < 2:
        print('WARNING: {} sequence is shorter than two residues. This sequence will be ignored.'.format(x))
        del fasta_dict[x]
        continue
    for a in fasta_dict[x]:
        if a not in aa:
            print('WARNING: {} sequence contains a character ({}) not recognized as an aminoacid. This sequence will be ignored.'.format(x,a))
            del fasta_dict[x]
            break

In [None]:
#@title <b>Input sequence(s)</b>
#@markdown Or paste a sequence and provide a name. This cell can be executed multiple times to register more sequences.
NAME = "" #@param {type:"string"}
SEQUENCE = "" #@param {type:"string"}

# drop every kind of whitespace (spaces, tabs, line breaks) from the pasted
# text before deciding whether a sequence was provided
SEQUENCE = ''.join(SEQUENCE.split())

if NAME != "" and SEQUENCE != "":
    fasta_dict[NAME] = SEQUENCE

    #check sequence
    for a in fasta_dict[NAME]:
        if a not in aa:
            print('WARNING: {} sequence contains a character not recognized as an aminoacid. This sequence will be ignored'.format(NAME))
            del fasta_dict[NAME]
            # stop at the first invalid character: the entry is already removed
            break

else:
    print('No NAME and/or SEQUENCE provided. Upload fasta files with the cell above or paste a sequence at the time here.')

In [None]:
#@title <b>Predict $\nu$</b>
#@markdown Use this cell to calculate sequence features and predict the scaling exponent $\nu$. Results will be downloaded as a csv file.

f = IntProgress(min=0, max=len(fasta_dict), description='Progress:', bar_style='warning')
display(f)

for k in fasta_dict.keys():
    # res is expected to be indexed by feature name; Nc=1, Cc=1 request the
    # terminal charge adjustments and Hc=0 a neutral histidine
    res = calc_seq_prop(fasta_dict[k],residues,1,1,0)
    # each model receives a single sample, i.e. an array of shape (1, n_features)
    nu = model_nu.predict(res.loc[features_nu].values.reshape(1, -1))
    spr = model_spr.predict(res.loc[features_spr].values.reshape(1, -1))
    # predict returns a length-1 array; store the scalar rounded to 3 decimals
    df.loc[k,'nu_svr'] = np.around(nu[0],3)
    df.loc[k,'SPR_svr'] = np.around(spr[0],3)
    # write every calculated descriptor into the column of the same name
    df.loc[k,res.index.values] = np.around(res.values.astype(float),3)
    f.value += 1

clear_output()
df.to_csv('svr_pred.csv',index_label='name')
files.download('svr_pred.csv')

df