# Phase separation prediction

This Colab notebook enables prediction of IDR transfer free energies and saturation concentrations from sequence.
- The models have been trained on CALVADOS 2 slab simulation data.
- Conditions are fixed at a temperature of T = 293 K and an ionic strength of I = 150 mM.

<b>How to cite this notebook:</b>
- [PREPRINT]

Further references:
- Use of $\nu_\mathrm{SVR}$: \\
G. Tesei, A. I. Trolle, N. Jonsson, J. Betz, F. Pesce, K. E. Johansson, K. Lindorff-Larsen __Conformational ensembles of the human intrinsically disordered proteome__ _Nature_ 2024 626, 897–904 DOI: https://doi.org/10.1038/s41586-023-07004-5
- CALVADOS 2 model: \\
G. Tesei and K. Lindorff-Larsen __Improved predictions of phase behaviour of intrinsically disordered proteins by tuning the interaction range [version 2; peer review: 2 approved]__ _Open Research Europe_ 2023 2(94) DOI: https://doi.org/10.12688/openreseurope.14967.2

Author: Sören von Bülow (soren.bulow@bio.ku.dk)

In [None]:
#@title <b>Preliminary operations</b>

import os
import warnings
warnings.simplefilter("ignore")

print('Setting up the environment...')
# try:
#     os.rmdir('sample_data')
#     os.rmdir('sequence.*')
#     os.rmdir('svr_model_nu*')
#     os.rmdir('residues*')
# except:
#     pass

!rm -r sample_data &> dump
!rm sequence.* &> dump
!rm svr_model_nu* &> dump
!rm residues* &> dump
!rm example* &> dump

github_folder = 'https://raw.githubusercontent.com/KULL-Centre/_2024_buelow_PSpred/main/models'

print(f'Downloading files from {github_folder}')

os.system(f'wget {github_folder}/sequence.py')
os.system(f'wget {github_folder}/residues.csv')
os.system(f'wget {github_folder}/svr_model_nu.joblib')
os.system(f'wget {github_folder}/example.fasta')

!pip install 'scikit-learn==1.3' MDAnalysis biopython numba &> dump

import joblib
import matplotlib.pyplot as plt
import MDAnalysis as mda
import numpy as np
import pandas as pd
import sklearn
from google.colab import files
from scipy.stats import linregress, spearmanr
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from tqdm import tqdm

import sequence

ncrossval = 50
print('Environment set up.')
print('Loading models...')

# Fetch the trained models into ./mlmodels. `-O` writes to a fixed path, so
# re-running this cell overwrites the previous download; with `-P` alone wget
# would keep the old file and store the new copy as `<name>.1`, and the stale
# file would be the one loaded later.
os.makedirs('mlmodels', exist_ok=True)
for model_file in ('model_dG.joblib', 'model_logcdil_mgml.joblib'):
    if os.system(f'wget -O mlmodels/{model_file} {github_folder}/{model_file}') != 0:
        raise RuntimeError(f'Download of {model_file} from {github_folder} failed.')

# Per-residue parameter table, indexed by the one-letter code column 'one'.
residues = pd.read_csv('residues.csv').set_index('one')
nu_file = 'svr_model_nu.joblib'

# Attribute names read from sequence.SeqFeatures, in the column order used
# to build the feature matrix passed to the models.
features = ['mean_lambda', 'faro', 'shd', 'ncpr', 'fcr', 'scd', 'ah_ij','nu_svr']

# Human-readable labels for the features above.
features_clean = {
    'mean_lambda' : 'lambda',
    'faro' : 'f(aromatics)',
    'shd' : 'SHD',
    'ncpr' : 'NCPR',
    'fcr' : 'FCR',
    'scd' : 'SCD',
    'ah_ij' : 'LJ pairs',
    'nu_svr' : 'nu(SVR)'
}

print('Input features are:')
print('>>>>> '+ ', '.join(features_clean[fe] for fe in features))

def predict_single(X,model):
    """Return the output of ``model.predict(X)`` unchanged."""
    return model.predict(X)

def predict_multimodels(X,models):
    """Predict a single sample with every model and collect the scalar results.

    Parameters
    ----------
    X : feature matrix holding exactly one sample.
    models : sequence of objects exposing ``predict(X)``.

    Returns
    -------
    numpy.ndarray of shape ``(len(models),)`` with one prediction per model.

    Raises
    ------
    ValueError
        If a model returns anything other than exactly one value.
    """
    ys = np.zeros(len(models))
    for idx, model in enumerate(models):
        # Extract the scalar explicitly: assigning a size-1 array to a scalar
        # slot relies on an implicit array-to-scalar conversion that NumPy
        # deprecated in 1.25. `.item()` raises ValueError for size != 1.
        ys[idx] = np.asarray(predict_single(X,model)).item()
    return ys

def X_from_seq(seq,feats,residues=[],charge_termini=True,nu_file=None,ah_intgrl_map=None):
    """Build a one-row feature matrix for a single sequence.

    A ``sequence.SeqFeatures`` object is created from the arguments and the
    attributes named in ``feats`` are read from it, in order.

    Returns a numpy array of shape ``(1, len(feats))``.
    """
    seqfeats = sequence.SeqFeatures(
        seq,
        residues=residues,
        charge_termini=charge_termini,
        nu_file=nu_file,
        ah_intgrl_map=ah_intgrl_map,
    )
    values = [getattr(seqfeats, feat) for feat in feats]
    return np.array(values).reshape(1, -1)

def makeXy(df,feats,target=None):
    """Turn a dataframe into feature/target arrays.

    Parameters
    ----------
    df : dataframe with one sample per row.
    feats : list of column names used as features, in output column order.
    target : optional column name of the target value.

    Returns
    -------
    ``(X, X_keys)`` when ``target`` is None, otherwise ``(X, y, X_keys)``.
    ``X`` and ``y`` are numpy arrays; ``X_keys`` is the list of row labels.
    """
    rows = list(df.iterrows())

    X_keys = [label for label, _ in rows]
    X = np.array([[row[feat] for feat in feats] for _, row in rows])

    if target is None:
        return X, X_keys

    y = np.array([row[target] for _, row in rows])
    return X, y, X_keys

def predict_df(df, model, features, pred_column='dG_pred'):
    """Predict every row of ``df`` and store the ensemble mean in a column.

    Parameters
    ----------
    df : dataframe containing the columns listed in ``features``.
    model : object whose ``predict(X)`` returns an array with one row of
        predictions per ensemble member (as ``Model.predict`` does).
    features : list of feature column names.
    pred_column : name of the column receiving the mean prediction.
        Defaults to ``'dG_pred'`` (the previously hard-coded name), so
        existing calls are unaffected; pass another name when the model
        predicts a different target.

    Returns
    -------
    The same ``df`` object; it is modified in place.
    """
    X_full, X_full_keys = makeXy(df, features)
    ypred_full = model.predict(X_full)
    # Average over the ensemble axis -> one value per row of df.
    ypred_full_m = np.mean(ypred_full,axis=0)
    df.loc[X_full_keys,pred_column] = ypred_full_m
    return df

class AttrSetter:
    """Plain container: every keyword argument becomes an instance attribute."""

    def __init__(self,**kwargs):
        vars(self).update(kwargs)

class Model:
    """Ensemble regressor built from repeated random train/test splits.

    ``train`` fits one scikit-learn pipeline per random split and stores it,
    together with its data and test statistics, in ``self.crossvals``.
    ``predict`` returns one row of predictions per stored pipeline.

    Keyword arguments (all optional):
        mltype    : 'svr' or 'mlp' (default 'svr').
        layers    : hidden layer sizes of the MLP (default (10, 10)).
        alpha     : ``alpha`` passed to MLPRegressor (default 5).
        C         : ``C`` passed to SVR (default 10).
        epsilon   : ``epsilon`` passed to SVR (default 1e-2).
        ptrain    : fraction of samples used for training per split (default 0.8).
        ncrossval : number of random splits / fitted pipelines (default 50).
    """

    def __init__(self,**kwargs):
        self.mltype = kwargs.get('mltype','svr')
        self.layers = kwargs.get('layers',(10,10))
        self.alpha = kwargs.get('alpha',5)
        self.C = kwargs.get('C',10)
        self.epsilon = kwargs.get('epsilon',1e-2)
        self.ptrain = kwargs.get('ptrain',0.8)
        self.ncrossval = kwargs.get('ncrossval',50)

    @staticmethod
    def split_data(X,y,X_keys,ptrain):
        """Randomly split samples into a train and a test set.

        ``X`` and ``y`` must be numpy arrays of equal length; ``X_keys`` is a
        sequence of sample labels. The first ``int(len(X) * ptrain)`` entries
        of a random permutation form the training set, the rest the test set.

        Returns ``X_train, X_test, y_train, y_test, X_train_keys, X_test_keys``.
        Raises ValueError if ``X`` and ``y`` differ in length.
        """
        nsamp = len(X)
        if nsamp != len(y):
            raise ValueError("X and y size is not equal!")

        # Sampling all indices without replacement yields a random permutation.
        random_idx = np.random.choice(nsamp, size=nsamp, replace=False)
        ntrain = int(nsamp * ptrain)
        train_idx = random_idx[:ntrain]
        test_idx = random_idx[ntrain:]
        X_train, y_train = X[train_idx], y[train_idx]
        X_test, y_test = X[test_idx], y[test_idx]
        X_train_keys = [X_keys[idx] for idx in train_idx]
        X_test_keys = [X_keys[idx] for idx in test_idx]

        return X_train, X_test, y_train, y_test, X_train_keys, X_test_keys

    @staticmethod
    def calc_statistics(y, ypred, verbose=True):
        """Return ``(pearson_r, spearman_r, rmsd)`` between ``y`` and ``ypred``.

        When ``verbose`` is true the three values are also printed.
        """
        # Pearson correlation, taken from the linear regression fit.
        rp = linregress(y, ypred).rvalue

        # Spearman rank correlation. Index 0 of the result is the coefficient;
        # indexing works across SciPy versions, whereas the `.statistic`
        # attribute only exists in newer releases.
        rs = spearmanr(y, ypred)[0]

        # Root mean squared deviation
        rmsd = np.sqrt(np.mean((y - ypred)**2))

        if verbose:
            print(f'Pearson: {rp:.3f}, Spearman: {rs:.3f}, RMSD: {rmsd:.3f}')
        return rp, rs, rmsd

    @staticmethod
    def calc_statistics_multimodel(y, ypred, verbose=True):
        """Apply ``calc_statistics`` to each row of ``ypred``.

        ``ypred`` holds one prediction vector per model. Returns three arrays
        (Pearson, Spearman, RMSD), each of length ``len(ypred)``.
        """
        nmodels = len(ypred)

        rp = np.zeros((nmodels))
        rs = np.zeros((nmodels))
        rmsd = np.zeros((nmodels))

        for idx, yp in enumerate(ypred):
            rp[idx], rs[idx], rmsd[idx] = Model.calc_statistics(y, yp, verbose=verbose)
        return rp, rs, rmsd

    def predict(self,X):
        """Predict ``X`` with every fitted pipeline.

        Returns an array of shape ``(len(self.crossvals), len(X))``; average
        over axis 0 to obtain the ensemble prediction.
        """
        # Size the output by the pipelines actually stored rather than by
        # self.ncrossval: if the two ever disagree, the old sizing either
        # left all-zero rows (biasing the mean) or raised IndexError.
        ypred = np.zeros((len(self.crossvals), len(X)))
        for idx, crossval in enumerate(self.crossvals):
            ypred[idx] = crossval.mlmodel.predict(X)
        return ypred

    def train(self,X,y,X_keys,**kwargs):
        """Fit ``self.ncrossval`` pipelines, each on a fresh random split.

        Parameters
        ----------
        X, y : numpy arrays of features and targets.
        X_keys : sample labels, aligned with ``X``.
        verbose : keyword only, default True; print test statistics per split.

        Populates ``self.crossvals`` (one AttrSetter per split) and the mean
        test statistics ``self.rp_mean``, ``self.rs_mean``, ``self.rmsd_mean``.

        Raises
        ------
        ValueError
            If ``self.mltype`` is neither 'svr' nor 'mlp'.
        """
        # Fail early with a clear message; previously an unknown mltype only
        # surfaced as an UnboundLocalError at `mlmodel.fit`.
        if self.mltype not in ('svr', 'mlp'):
            raise ValueError(f"Unknown mltype {self.mltype!r}; expected 'svr' or 'mlp'.")

        self.models = []
        self.crossvals = []
        verbose = kwargs.get('verbose',True)

        for idx in range(self.ncrossval):
            X_train, X_test, y_train, y_test, X_train_keys, X_test_keys = self.split_data(X,y,X_keys,self.ptrain)

            # Features are standardised inside the pipeline, so the scaler is
            # fitted on the training part of each split only.
            if self.mltype == 'svr':
                mlmodel = make_pipeline(StandardScaler(), SVR(C=self.C, epsilon=self.epsilon))
            else:  # 'mlp'
                mlmodel = make_pipeline(
                    StandardScaler(),
                    MLPRegressor(
                        hidden_layer_sizes=self.layers,activation='tanh',
                        solver='lbfgs',max_iter=10000,alpha=self.alpha),
                )
            mlmodel.fit(X_train, y_train)

            ypred_train = mlmodel.predict(X_train)
            ypred_test = mlmodel.predict(X_test)

            # Statistics are computed on the held-out part only.
            rp, rs, rmsd = self.calc_statistics(y_test, ypred_test, verbose=verbose)

            self.crossvals.append(AttrSetter(
                X_train = X_train,
                X_test = X_test,
                y_train = y_train,
                y_test = y_test,
                X_train_keys = X_train_keys,
                X_test_keys = X_test_keys,
                mlmodel = mlmodel,
                ypred_train = ypred_train,
                ypred_test = ypred_test,
                rp = rp,
                rs = rs,
                rmsd = rmsd
            ))
        self.rp_mean = np.mean([cval.rp for cval in self.crossvals])
        self.rs_mean = np.mean([cval.rs for cval in self.crossvals])
        self.rmsd_mean = np.mean([cval.rmsd for cval in self.crossvals])

!touch calvados.py

models = {}
models['dG'] = joblib.load(f'mlmodels/model_dG.joblib')
models['logcdil_mgml'] = joblib.load(f'mlmodels/model_logcdil_mgml.joblib')

mltype = 'mlp'
alpha = 5
layers = (10,10)

targets = ['dG','logcdil_mgml']
targets_clean = {
    'dG' : 'Delta G',
    'logcdil_mgml' : 'Saturation concentration',
}

print('Models loaded.')

In [None]:
#@title <b>Predict single IDR sequence</b>

# Best-effort removal of Colab's bundled sample_data folder. os.rmdir raises
# OSError when the folder is missing or not empty; that is fine to ignore.
try:
    os.rmdir('sample_data')
except OSError:
    pass

NAME = "LAF1" #@param {type:"string"}
SEQUENCE = "MESNQSNNGGSGNAALNRGGRYVPPHLRGGDGGAAAAASAGGDDRRGGAGGGGYRRGGGNSGGGGGGGYDRGYNDNRDDRDNRGGSGGYGRDRNYEDRGYNGGGGGGGNRGYNNNRGGGGGGYNRQDRGDGGSSNFSRGGYNNRDEGSDNRGSGRSYNNDRRDNGGDGLEHHHHHH" #@param {type:"string"}
CHARGE_TERMINI = True # @param {type:'boolean'}
# Display-only fields: the models were trained at fixed conditions.
TEMPERATURE = "293 K (fixed)" # @param ['293 K (fixed)']
IONIC_STRENGTH = "150 mM (fixed)" # @param ['150 mM (fixed)']

# Remove every kind of whitespace (spaces, tabs, line breaks from copy/paste)
# and tell the user whenever the input had to be altered.
seq = ''.join(SEQUENCE.split())
if seq != SEQUENCE:
    print('Blank character(s) found in the provided sequence. Sequence has been corrected, but check for integrity.')
if not seq:
    raise ValueError('SEQUENCE is empty. Please provide an amino-acid sequence.')

print('='*80)
print(f'NAME: {NAME}')
print(f'SEQUENCE: {seq}')

# seqfeats is only used for the molecular weight below; the model input X is
# built separately (with the nu model) by X_from_seq.
seqfeats = sequence.SeqFeatures(seq,residues=residues,charge_termini=CHARGE_TERMINI)
X = X_from_seq(seq,features,residues=residues,charge_termini=CHARGE_TERMINI,nu_file=nu_file)

for target in targets:
  print('-'*80)
  # One prediction per ensemble member; report the ensemble mean.
  ys = models[target].predict(X)
  ys_m = np.mean(ys)

  # NOTE(review): the +/-1 and +/-0.82 half-widths are presumably the models'
  # typical prediction errors -- confirm against the reference publication.
  if target == 'dG':
    output = ys_m
    unit = 'kT'
    lower = ys_m - 1
    upper = ys_m + 1
  elif target == 'logcdil_mgml':
    # The model predicts the natural log of the concentration in mg/mL.
    output = np.exp(ys_m)
    lower = np.exp(ys_m-0.82)
    upper = np.exp(ys_m+0.82)
    unit = 'mg/mL'

  print(f'{targets_clean[target]:25s} = {output:5.1f} {unit:6s} ({lower:.1f} -- {upper:.1f} {unit})')
  if target == 'logcdil_mgml':
    # mg/mL -> uM; assumes seqfeats.mw is in g/mol -- TODO confirm.
    output_uM = output / seqfeats.mw * 1e6
    lower_uM = lower / seqfeats.mw * 1e6
    upper_uM = upper / seqfeats.mw * 1e6
    print(f'{"":25s} = {output_uM:5.1f} {"uM":6s} ({lower_uM:.1f} -- {upper_uM:.1f} {"uM"})')
print('='*80)

In [None]:
#@title <b>Run batch prediction</b>

#@markdown File name
FASTA_FILE = "example.fasta" #@param {type:"string"}
CHARGE_TERMINI = True # @param {type:'boolean'}
# Display-only fields: the models were trained at fixed conditions.
TEMPERATURE = "293 K (fixed)" # @param ['293 K (fixed)']
IONIC_STRENGTH = "150 mM (fixed)" # @param ['150 mM (fixed)']

# Ask for an upload when the FASTA file is not in the working directory yet.
if not os.path.isfile(FASTA_FILE):
  print(f'Please upload file {FASTA_FILE}')
  uploaded = files.upload()
  if FASTA_FILE not in uploaded.keys():
    raise FileNotFoundError(f'Could not find file {FASTA_FILE}')

records = sequence.read_fasta(FASTA_FILE)

print('-'*80)
print(f'FASTA FILE: {FASTA_FILE}')
print(f'NUMBER OF SEQUENCES: {len(records)}')

# Collect one dict per sequence and build the dataframe once at the end;
# growing a dataframe cell by cell with .loc is slow and forces object dtype.
names, rows = [], []

for name, record in tqdm(records.items(),total=len(records)):
  seq = str(record.seq)
  row = {'Sequence': seq}
  seqfeats = sequence.SeqFeatures(seq,residues=residues,
                                  charge_termini=CHARGE_TERMINI,nu_file=nu_file)
  for feat in features:
    row[feat] = getattr(seqfeats,feat)
  # NOTE(review): X_from_seq builds a second SeqFeatures object for the same
  # sequence; kept as is because it passes ah_intgrl_map explicitly.
  X = X_from_seq(seq,features,residues=residues,
                 charge_termini=CHARGE_TERMINI,nu_file=nu_file)
  for target in targets:
    # One prediction per ensemble member; keep the ensemble mean.
    ys_m = np.mean(models[target].predict(X))
    if target == 'dG':
      row['Delta G [kT]'] = ys_m
    elif target == 'logcdil_mgml':
      # The model predicts the natural log of the concentration in mg/mL.
      cdil_mgml = np.exp(ys_m)
      row['Saturation concentration [mg/mL]'] = cdil_mgml
      # mg/mL -> uM; assumes seqfeats.mw is in g/mol -- TODO confirm.
      row['Saturation concentration [uM]'] = cdil_mgml / seqfeats.mw * 1e6
  names.append(name)
  rows.append(row)

df_records = pd.DataFrame(rows, index=pd.Index(names, name='Name'))
df_records.to_csv('df_PSprediction.csv')

print('\n')
print('='*114)
print(f'{"Name":20s} {"Sequence":33s} {"Delta G":>10s} {"Saturation":>16s} {"Saturation":>16s}')
print(f'{"":20s} {"":33s} {"":>10s} {"concentration":>16s} {"concentration":>16s}')
print(f'{"":20s} {"":33s} {"[kT]":>10s} {"[mg/mL]":>16s} {"[uM]":>16s}')

print('='*114)
for key, val in df_records.iterrows():
  # Show at most the first 30 residues; '...' marks a truncated sequence.
  seq_full = val["Sequence"]
  seqpr = f'{seq_full[:30]}...' if len(seq_full) > 30 else seq_full
  print(f'{key:20s} {seqpr:33s} {val["Delta G [kT]"]:10.1f} {val["Saturation concentration [mg/mL]"]:16.1f} {val["Saturation concentration [uM]"]:16.1f}')


In [None]:
#@title <b>Analysis</b>

# Histograms of the batch predictions stored in df_records.
fcolor = plt.cm.summer

fig, ax = plt.subplots(1,2,figsize=(9,4))

for axij, target in zip(ax, ['Delta G [kT]','Saturation concentration [mg/mL]']):
  # The columns can have object dtype depending on how df_records was built;
  # cast to float so the histogram always receives numeric data.
  axij.hist(df_records[target].astype(float),bins=20,color=fcolor(0))

  axij.set_xlabel(f'{target}')
  axij.set_ylabel('Counts')
  axij.grid(alpha=0.3)
fig.tight_layout()

In [None]:
#@title <b>Download dataframe</b>

# The CSV is written by the batch prediction cell; give a clear hint when
# this cell is run before it.
if not os.path.isfile('df_PSprediction.csv'):
  raise FileNotFoundError('df_PSprediction.csv not found. Run the batch prediction cell first.')

files.download('df_PSprediction.csv')