# Tuning and training the models for standard-pHLA-score

The inputs are the per-peptide-position ref2015 energy features of each complex, already generated in ../Featurization/rosettaPPPEnergies.csv.

We tune the hyperparameters in a 5-fold cross-validation setting.

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle
import statistics

## Load the standard ref2015 features

### Full dataset

In [7]:
## 1 - load the energies
def pppene_to_array(tmp):
    """Decode one stringified energy matrix into a (9, 20) float array.

    The text is cleaned in a fixed order: parentheses are removed, leading and
    trailing square brackets are stripped, then spaces are dropped and
    newlines are turned into commas, leaving a flat comma-separated list of
    numbers that numpy can parse.

    Parameters
    ----------
    tmp : str
        Raw text of one "energies" cell.

    Returns
    -------
    numpy.ndarray
        The parsed values reshaped to (9, 20); the reshape raises
        ``ValueError`` when the text does not hold exactly 180 numbers.
    """
    without_parens = tmp.translate(str.maketrans("", "", "()"))
    unbracketed = without_parens.strip("[]")
    # Spaces vanish and line breaks become separators in a single pass.
    flat_csv = unbracketed.translate(str.maketrans({" ": None, "\n": ","}))
    values = np.fromstring(flat_csv, dtype=float, sep=", ")
    return values.reshape(9, 20)

# Read the pre-computed energies, keep only the columns used downstream and
# decode the stringified energy matrix of every row into a numpy array.
ppp_ene = pd.read_csv("../Featurization/rosettaPPPEnergies.csv")
ppp_ene = ppp_ene.loc[:, ["allele", "peptide", "binder", "ba", "energies", "total_energy"]]
ppp_ene = ppp_ene.assign(energies=ppp_ene["energies"].apply(pppene_to_array))
ppp_ene

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,YLEQLHQLY,1,0.574375,"[[-9.37669305, 4.67802037, 8.54111444, 10.3367...",112.623867
1,A0101,HSERHVLLY,1,0.574375,"[[-7.89190954, 0.93707113, 10.06233605, 2.7158...",91.902185
2,A0101,MTDPEMVEV,1,0.574375,"[[-8.25236275, 10.56587939, 7.5793572, 1.14710...",146.451590
3,A0101,LTDFIREEY,1,0.574375,"[[-8.43720197, 10.21830335, 7.113905, 36.51672...",138.735082
4,A0101,LLDQRPAWY,1,0.574375,"[[-8.18944861, 12.37002534, 6.69837214, 8.0238...",142.756344
...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,0.000000,"[[-8.23468271, 4.14167228, 9.40648541, 38.0082...",128.349704
77577,C1601,QQVEQMEIP,0,0.000000,"[[-9.20362039, 22.89414572, 10.40234849, 35.70...",159.872992
77578,C1601,QQWQVFSAE,0,0.000000,"[[-8.46025926, 3.82365938, 10.25920191, 61.414...",97.152888
77579,C1601,QRCVVLRFL,0,0.000000,"[[-7.11160825, 1.35151526, 9.62650699, 20.2230...",116.714004


### Map the full dataset to the training set

In [8]:
#Load split
# Fold assignment of every (allele, peptide) pair in the training split.
train_set = pd.read_csv("../Datasets/train_set.csv")
train_set = train_set.loc[:, ["allele", "peptide", "fileloc", "allele_type", "fold_num"]]

# Inner join: keep only the energy rows whose (allele, peptide) pair is in the split.
train_dataset = ppp_ene.merge(
    train_set,
    how="inner",
    on=["allele", "peptide"],
    suffixes=("", "_y"),
)
train_dataset

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy,fileloc,allele_type,fold_num
0,A0101,YLEQLHQLY,1,0.574375,"[[-9.37669305, 4.67802037, 8.54111444, 10.3367...",112.623867,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,2.0
1,A0101,HSERHVLLY,1,0.574375,"[[-7.89190954, 0.93707113, 10.06233605, 2.7158...",91.902185,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
2,A0101,MTDPEMVEV,1,0.574375,"[[-8.25236275, 10.56587939, 7.5793572, 1.14710...",146.451590,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
3,A0101,LTDFIREEY,1,0.574375,"[[-8.43720197, 10.21830335, 7.113905, 36.51672...",138.735082,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
4,A0101,LLDQRPAWY,1,0.574375,"[[-8.18944861, 12.37002534, 6.69837214, 8.0238...",142.756344,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
...,...,...,...,...,...,...,...,...,...
69793,C1601,QQTTTSFQN,0,0.000000,"[[-8.23468271, 4.14167228, 9.40648541, 38.0082...",128.349704,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69794,C1601,QQVEQMEIP,0,0.000000,"[[-9.20362039, 22.89414572, 10.40234849, 35.70...",159.872992,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69795,C1601,QQWQVFSAE,0,0.000000,"[[-8.46025926, 3.82365938, 10.25920191, 61.414...",97.152888,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,1.0
69796,C1601,QRCVVLRFL,0,0.000000,"[[-7.11160825, 1.35151526, 9.62650699, 20.2230...",116.714004,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,2.0


In [9]:
# extracting features in training format
# and get the cross-validation iterator
def get_energies(X):
    """Flatten the energy matrix of one peptide into a 171-value feature vector.

    The rows are rolled down by four, the first nine rows and first nineteen
    columns are kept, and the rows are rolled back up by four.  For a matrix
    with exactly nine rows the two rolls cancel, so the result is every row
    with its last column dropped, flattened row by row.

    Parameters
    ----------
    X : array-like
        2-D energy matrix; the final reshape needs at least 9 rows and
        19 columns.

    Returns
    -------
    numpy.ndarray
        1-D array of length 9 * 19.
    """
    n_rows, n_cols, shift = 9, 19, 4
    window = np.roll(X, shift, axis=0)[:n_rows, :n_cols]
    restored = np.roll(window, -shift, axis=0)
    return restored.reshape(n_rows * n_cols)

def extract_features_Xy_cv(merged_df, allele, feature_fn=None):
    """Build the feature matrix, targets and CV splits for one allele.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Table with at least the columns "allele", "energies", "ba", "binder"
        and "fold_num".
    allele : str
        Value of the "allele" column whose rows are used.
    feature_fn : callable, optional
        Maps one "energies" entry to a 1-D feature vector.  When omitted, the
        module-level ``get_energies`` that exists at call time is used, so
        redefining ``get_energies`` in a later cell still takes effect.

    Returns
    -------
    tuple
        ``(X, y, y_l, cv_iter)``: ``X`` is a 2-D array with one row per
        selected table row, ``y`` holds the "ba" values, ``y_l`` the "binder"
        values, and ``cv_iter`` is a list of five
        ``(train_indices, test_indices)`` pairs, one per "fold_num" value
        0-4, indexing rows of ``X``.

    Raises
    ------
    ValueError
        If ``merged_df`` contains no rows for ``allele``.
    """
    if feature_fn is None:
        # Looked up lazily on purpose: the notebook swaps this function.
        feature_fn = get_energies

    # reset_index makes the row labels equal to the row positions in X.
    allele_data = merged_df[merged_df["allele"] == allele].reset_index(drop=True)
    if allele_data.empty:
        raise ValueError("no rows found for allele " + repr(allele))

    # Stack once instead of growing X row by row; this also keeps X 2-D
    # when the allele has a single row.
    X = np.vstack([feature_fn(energies) for energies in allele_data["energies"]])

    # extract binding energies and binder labels
    y = allele_data["ba"].to_numpy()
    y_l = allele_data["binder"].to_numpy()

    cv_iter = []
    for split in range(5):
        in_fold = allele_data["fold_num"] == split
        test_ind = allele_data.index[in_fold].tolist()
        train_ind = allele_data.index[~in_fold].tolist()
        cv_iter.append((train_ind, test_ind))

    return (X, y, y_l, cv_iter)
    


In [10]:
def param_tune_allele(allele, train_dataset, n_iter=100, random_state=None):
    """Random-search random-forest hyperparameters for one allele.

    Every sampled parameter set is scored twice: by the out-of-bag score of a
    forest fitted on a shuffled copy of the data, and by the mean score of a
    cross-validation that uses the predefined folds returned by
    ``extract_features_Xy_cv``.

    Parameters
    ----------
    allele : str
        Allele to tune for.
    train_dataset : pandas.DataFrame
        Training table; it is filtered to ``allele`` before feature extraction.
    n_iter : int, optional
        Number of parameter sets to sample (default 100).
    random_state : int or None, optional
        Seed passed to the parameter sampler and to both forests.  The default
        ``None`` leaves them unseeded, so runs are not reproducible.

    Returns
    -------
    tuple
        ``(best_oob_params, best_oob_score, best_cv_params, best_cv_scores)``;
        ``best_cv_scores`` holds the per-fold scores of the parameter set
        with the highest mean CV score.
    """
    allele_td = train_dataset[train_dataset["allele"] == allele]
    X_train, y_train, _, cv = extract_features_Xy_cv(allele_td, allele)
    # Shuffled copy for the OOB fit only; the CV fit keeps the original row
    # order because `cv` holds row positions into it.
    X_train_s, y_train_s = shuffle(X_train, y_train, random_state=0)

    grid_params = {
        'n_estimators': list(range(100, 1100, 100)),
        'max_depth': list(range(10, 100, 10)),
        'min_samples_leaf': list(range(10, 100, 10)),
        'bootstrap': [True],
        'oob_score': [True],
        # 1.0 means "all features", which is what "auto" meant for regressors;
        # the "auto" spelling was removed in scikit-learn 1.3.
        'max_features': [1.0, "sqrt", "log2"],
    }

    regr_oob = RandomForestRegressor(n_jobs=-1, random_state=random_state)
    regr_cv = RandomForestRegressor(n_jobs=-1, random_state=random_state)

    # Scores can be negative, so start below any reachable value; starting at
    # 0 would leave the best parameters as None when no score exceeds 0.
    best_oob_score = float("-inf")
    best_oob_params = None
    best_cv_mscore = float("-inf")
    best_cv_scores = None
    best_cv_params = None
    sampler = ParameterSampler(grid_params, n_iter=n_iter, random_state=random_state)
    for i, g in enumerate(sampler):
        print("Parameter iteration: "+str(i))
        print("OOB")
        # out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s, y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_

        print("CV")
        # cross-validation on the predefined folds
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs=-1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore

    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)



## Crossvalidation

In [None]:
# Tune every allele in the training set and collect the outcome column-wise.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print(
        "------------------------------------------------------------------------",
        "ALLELE",
        allele,
        sep="\n",
    )
    res = param_tune_allele(allele, train_dataset)
    # The dict keys are in the same order as (allele, *res).
    for column, value in zip(results, (allele, *res)):
        results[column].append(value)

## Train the best model

In [12]:
def train_best_model(allele, params, exp_name):
    """Fit a random-forest regressor on all training rows of one allele and pickle it.

    The rows come from the module-level ``train_dataset``; the features are
    built by ``extract_features_Xy_cv``.

    Parameters
    ----------
    allele : str
        Allele whose rows are used for training.
    params : dict
        Keyword parameters applied to the ``RandomForestRegressor``.
    exp_name : str
        Suffix appended directly to the allele in the output file name.

    Side effects
    ------------
    Writes the fitted model to ``./final_REGRmodels/<allele><exp_name>.pkl``,
    creating the directory when it does not exist.
    """
    import os  # local import: not part of the notebook's import cell

    model_path = './final_REGRmodels/' + allele + exp_name + '.pkl'
    # Create the output folder before fitting so a missing directory cannot
    # discard a finished fit.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    allele_train_dataset = train_dataset[train_dataset["allele"] == allele]
    X_train, y_train, _, _ = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = RandomForestRegressor(n_jobs=-1)
    regr_best.set_params(**params)
    regr_best.fit(X_train, y_train)
    with open(model_path, 'wb') as fid:
        cPickle.dump(regr_best, fid)

In [None]:
# `results_df` is never defined in this notebook, so the tuned parameters are
# looked up directly in the `results` dict filled by the preceding tuning loop.
best_cv_params = dict(zip(results["allele"], results["best_cv_param"]))
for allele in alleles:
    params = best_cv_params[allele]
    train_best_model(allele, params, "ppp")

## Middle position

In [13]:
# extract features for the middle peptide positions only
# (rows 3-6, 0-based, of the 9-row energy matrix)
def get_energies(X):
    """Flatten four consecutive rows of the energy matrix into 76 features.

    The rows are rolled up by three and the first four rows and first
    nineteen columns are kept, i.e. rows 3-6 (0-based) of the input when it
    has at least seven rows.

    Parameters
    ----------
    X : array-like
        2-D energy matrix with at least 19 columns.

    Returns
    -------
    numpy.ndarray
        1-D array of length 4 * 19.
    """
    n_rows, n_cols = 4, 19
    shifted = np.roll(X, -3, axis=0)
    return shifted[:n_rows, :n_cols].reshape(n_rows * n_cols)

In [None]:
# Tune every allele in the training set and collect the outcome column-wise.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print(
        "------------------------------------------------------------------------",
        "ALLELE",
        allele,
        sep="\n",
    )
    res = param_tune_allele(allele, train_dataset)
    # The dict keys are in the same order as (allele, *res).
    for column, value in zip(results, (allele, *res)):
        results[column].append(value)

In [None]:
def train_best_model(allele, params, exp_name):
    """Fit a random-forest regressor on all training rows of one allele and pickle it.

    The rows come from the module-level ``train_dataset``; the features are
    built by ``extract_features_Xy_cv``.

    Parameters
    ----------
    allele : str
        Allele whose rows are used for training.
    params : dict
        Keyword parameters applied to the ``RandomForestRegressor``.
    exp_name : str
        Suffix appended directly to the allele in the output file name.

    Side effects
    ------------
    Writes the fitted model to ``./final_REGRmodels/<allele><exp_name>.pkl``,
    creating the directory when it does not exist.
    """
    import os  # local import: not part of the notebook's import cell

    model_path = './final_REGRmodels/' + allele + exp_name + '.pkl'
    # Create the output folder before fitting so a missing directory cannot
    # discard a finished fit.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    allele_train_dataset = train_dataset[train_dataset["allele"] == allele]
    X_train, y_train, _, _ = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = RandomForestRegressor(n_jobs=-1)
    regr_best.set_params(**params)
    regr_best.fit(X_train, y_train)
    with open(model_path, 'wb') as fid:
        cPickle.dump(regr_best, fid)

In [None]:
# `results_df` is never defined in this notebook, so the tuned parameters are
# looked up directly in the `results` dict filled by the preceding tuning loop.
best_cv_params = dict(zip(results["allele"], results["best_cv_param"]))
for allele in alleles:
    params = best_cv_params[allele]
    train_best_model(allele, params, "ppp-middle")

## Anchor positions

In [14]:
# extract features for the anchor positions only
# (rows 0-2 and 7-8, 0-based, of the 9-row energy matrix)
def get_energies(X):
    """Flatten the first three and last two rows of the energy matrix into 95 features.

    The rows are rolled down by two so the last two rows wrap to the front,
    the first five rows and first nineteen columns are kept, and the kept
    rows are rolled back up by two.  The output order is therefore the first
    three rows followed by the last two rows of the input.

    Parameters
    ----------
    X : array-like
        2-D energy matrix; the final reshape needs at least 5 rows and
        19 columns.

    Returns
    -------
    numpy.ndarray
        1-D array of length 5 * 19.
    """
    n_rows, n_cols, shift = 5, 19, 2
    window = np.roll(X, shift, axis=0)[:n_rows, :n_cols]
    restored = np.roll(window, -shift, axis=0)
    return restored.reshape(n_rows * n_cols)

In [None]:
# Tune every allele in the training set and collect the outcome column-wise.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print(
        "------------------------------------------------------------------------",
        "ALLELE",
        allele,
        sep="\n",
    )
    res = param_tune_allele(allele, train_dataset)
    # The dict keys are in the same order as (allele, *res).
    for column, value in zip(results, (allele, *res)):
        results[column].append(value)

In [None]:
def train_best_model(allele, params, exp_name):
    """Fit a random-forest regressor on all training rows of one allele and pickle it.

    The rows come from the module-level ``train_dataset``; the features are
    built by ``extract_features_Xy_cv``.

    Parameters
    ----------
    allele : str
        Allele whose rows are used for training.
    params : dict
        Keyword parameters applied to the ``RandomForestRegressor``.
    exp_name : str
        Suffix appended directly to the allele in the output file name.

    Side effects
    ------------
    Writes the fitted model to ``./final_REGRmodels/<allele><exp_name>.pkl``,
    creating the directory when it does not exist.
    """
    import os  # local import: not part of the notebook's import cell

    model_path = './final_REGRmodels/' + allele + exp_name + '.pkl'
    # Create the output folder before fitting so a missing directory cannot
    # discard a finished fit.
    os.makedirs(os.path.dirname(model_path), exist_ok=True)

    allele_train_dataset = train_dataset[train_dataset["allele"] == allele]
    X_train, y_train, _, _ = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = RandomForestRegressor(n_jobs=-1)
    regr_best.set_params(**params)
    regr_best.fit(X_train, y_train)
    with open(model_path, 'wb') as fid:
        cPickle.dump(regr_best, fid)

In [None]:
# `results_df` is never defined in this notebook, so the tuned parameters are
# looked up directly in the `results` dict filled by the preceding tuning loop.
best_cv_params = dict(zip(results["allele"], results["best_cv_param"]))
for allele in alleles:
    params = best_cv_params[allele]
    train_best_model(allele, params, "ppp-anchor")