# Tuning and training the models for standard-pHLA-score

Input are the standard ref2015 features for complex, already generated in ../Featurization/rosettaComplexEnergies.csv

We tune the parameters in the 5-fold-crossvalidation setting.


In [2]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import sklearn.linear_model as linear_model
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle
import statistics
from sklearn.cross_decomposition import PLSRegression

## Load the standard ref2015 features

### Full dataset

In [3]:
## 1 - load the energies
def ene_to_array(ene_str):
    ene_str = ene_str.strip("[]")
    ene_str = ene_str.strip("\(\)")
    return np.fromstring(ene_str, dtype=float, count = 20, sep=", ")

complex_ene = pd.read_csv("../Featurization/rosettaComplexEnergies.csv")
complex_ene = complex_ene[["allele", "peptide", "binder", "ba", "energies", "total_energy"]]
complex_ene["energies"] = complex_ene["energies"].apply(ene_to_array)
complex_ene

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148
...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209
77577,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525
77578,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591
77579,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734


### Map the full dataset to the training set

In [4]:
#Load split
train_set = pd.read_csv("../Datasets/train_set.csv")
train_set = train_set[["allele", "peptide", "fileloc", "allele_type", "fold_num"]]

#Merge to form the training set
train_dataset = pd.merge(complex_ene, train_set, on=["allele", "peptide"], suffixes=["", "_y"], how="inner")
train_dataset

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy,fileloc,allele_type,fold_num
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,2.0
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
...,...,...,...,...,...,...,...,...,...
69793,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69794,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69795,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,1.0
69796,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,2.0


In [5]:
# extracting features in training format
# and get the cross-validation iterator
def extract_features_Xy_cv(merged_df, allele):
    allele_data = merged_df[merged_df["allele"]==allele]
    allele_data["enefeat"] = allele_data["energies"].apply(lambda x: x[:-1])
    allele_data = allele_data.reset_index(drop=True)
    flag = 0
    for index, row in allele_data.iterrows():
        if flag == 0:
            X = np.array(row['enefeat'])
            flag = 1
        else: 
            X = np.vstack((X, row['enefeat']))
    #extract binding energies        
    y = np.array(list(allele_data["ba"]))
    y_l = np.array(list(allele_data["binder"]))
    
    cv_iter = []
    for split in range(5):
        test_ind = allele_data.index[(allele_data['fold_num'] == split)].tolist()
        train_ind = allele_data.index[~(allele_data['fold_num'] == split)].tolist()
        cv_iter.append((train_ind, test_ind))
        
    return (X, y, y_l, cv_iter)
    

In [6]:

def param_tune_allele(allele, train_dataset):
    
    allele_td = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_td, allele)

    
    grid_params = {'n_components': [2, 5, 10, 100]
            }
    
    regr_results = {}
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    regr_pls = PLSRegression()
    
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=4)):
        print("CV")
        print(g)
        #cross-validation
        regr_pls.set_params(**g)
        cv_scores = cross_val_score(regr_pls, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
  
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (None, None, best_cv_params, best_cv_scores)
            
    
'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    best_oob_score = 0 
    best_oob_params = None
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)

'''   

'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    regr_results = {}
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)
'''

'\n    regr_oob = RandomForestRegressor(n_jobs=-1)\n    regr_cv = RandomForestRegressor(n_jobs=-1)\n\n    regr_results = {}\n    best_cv_mscore = 0\n    best_cv_scores = None\n    best_cv_params = None\n    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):\n        print("Parameter iteration: "+str(i))\n        print("OOB")\n        #out of bag\n        print(g)\n        regr_oob.set_params(**g)\n        regr_oob.fit(X_train_s,y_train_s)\n        print(regr_oob.oob_score_)\n        if regr_oob.oob_score_ > best_oob_score:\n            best_oob_params = g\n            best_oob_score = regr_oob.oob_score_\n        \n        print("CV")\n        #cross-validation\n        regr_cv.set_params(**g)\n        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)\n        cv_mscore = statistics.mean(cv_scores)\n        print(cv_scores)\n        print(cv_mscore)\n        if cv_mscore > best_cv_mscore:\n            best_cv_params = g\n            best_cv_score

### Parameter tuning

In [11]:
alleles = train_dataset["allele"].unique()
results = {"allele":[], "best_oob_param":[], "best_oob_score":[], "best_cv_param":[], "best_cv_scores":[]}
exp_name="complex"

for allele in alleles:
    print("------------------------------------------------------------------------")
    print("ALLELE")
    print(allele)
    res = param_tune_allele(allele, train_dataset)
    results["allele"].append(allele)
    results["best_oob_param"].append(res[0])
    results["best_oob_score"].append(res[1])
    results["best_cv_param"].append(res[2])
    results["best_cv_scores"].append(res[3])
    allele_train_dataset = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = PLSRegression()
    regr_best.set_params(**res[2])
    regr_best.fit(X_train, y_train)
    with open('./final_PLS_REGRmodels/'+allele+exp_name+'.pkl', 'wb') as fid:
        cPickle.dump(regr_best, fid)  


------------------------------------------------------------------------
ALLELE
A0101
CV
{'n_components': 2}
[0.29437926 0.46754521 0.41570904 0.33357811 0.3843718 ]
0.3791166851439601
CV
{'n_components': 5}
[0.29875237 0.47069913 0.40457892 0.33178773 0.3843784 ]
0.3780393072739937
CV
{'n_components': 10}
[0.29823355 0.47095647 0.40520307 0.33383682 0.38624442]
0.3788948663232843
CV
{'n_components': 100}
[nan nan nan nan nan]
nan
Best CV 0
{'n_components': 10}
------------------------------------------------------------------------
ALLELE
A0201
CV
{'n_components': 2}
[0.13170884 0.21997978 0.17590732 0.20146114 0.18518103]
0.1828476201252574
CV
{'n_components': 5}
[0.12942373 0.24243139 0.21166217 0.2351891  0.21756898]
0.20725507370712035
CV
{'n_components': 10}
[0.13456297 0.24310415 0.21049996 0.2354911  0.21794846]
0.2083213272105461
CV
{'n_components': 100}
[nan nan nan nan nan]
nan
Best CV 0
{'n_components': 10}
-------------------------------------------------------------------

------------------------------------------------------------------------
ALLELE
B3501
CV
{'n_components': 2}
[0.2517586  0.24371807 0.23212892 0.24533434 0.27269111]
0.2491262074666939
CV
{'n_components': 5}
[0.31360252 0.29648458 0.28640299 0.29191493 0.33063035]
0.303807073753792
CV
{'n_components': 10}
[0.31203041 0.29862894 0.28874943 0.29459608 0.32739006]
0.30427898386642194
CV
{'n_components': 100}
[nan nan nan nan nan]
nan
Best CV 0
{'n_components': 10}
------------------------------------------------------------------------
ALLELE
B3901
CV
{'n_components': 2}
[0.17770328 0.08827867 0.11348925 0.14081198 0.21522271]
0.1471011789735119
CV
{'n_components': 5}
[0.18101497 0.09684326 0.11350597 0.12518057 0.20939296]
0.14518754787247679
CV
{'n_components': 10}
[0.18144251 0.09693389 0.11401421 0.12962453 0.21039346]
0.14648172010809263
CV
{'n_components': 100}
[nan nan nan nan nan]
nan
Best CV 0
{'n_components': 10}
------------------------------------------------------------------

In [12]:
results_df = pd.DataFrame(results)
results_df.to_pickle("crossval_complex_PLS.pkl")
results_df.to_csv("crossval_complex_PLS.csv")
results_df

Unnamed: 0,allele,best_oob_param,best_oob_score,best_cv_param,best_cv_scores
0,A0101,,,{'n_components': 10},"[0.29823355484034797, 0.4709564693741055, 0.40..."
1,A0201,,,{'n_components': 10},"[0.1345629673762896, 0.2431041451520208, 0.210..."
2,A0203,,,{'n_components': 10},"[0.31038474147067363, 0.336825062070823, 0.293..."
3,A0206,,,{'n_components': 10},"[0.2811135565657714, 0.24292974288526437, 0.16..."
4,A0301,,,{'n_components': 10},"[0.10176973047420723, 0.1194203934737248, 0.10..."
5,A1101,,,{'n_components': 10},"[0.15743102715432944, 0.12421395736565764, 0.1..."
6,A2301,,,{'n_components': 10},"[0.30945377252010975, 0.20403272889404922, 0.2..."
7,A2402,,,{'n_components': 10},"[0.20440289483222798, 0.14923300897091607, 0.1..."
8,A2601,,,{'n_components': 10},"[0.20558045662762126, 0.18682049082374552, 0.1..."
9,A2902,,,{'n_components': 10},"[0.38079573134476175, 0.41281264043546034, 0.4..."


### Training the models with best parameters on the full training dataset

In [18]:
def train_best_model(allele, params, exp_name):
    allele_train_dataset = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = PLSRegression()
    regr_best.set_params(**params)
    regr_best.fit(X_train, y_train)
    with open('./final_PLS_REGRmodels/'+allele+exp_name+'.pkl', 'wb') as fid:
        cPickle.dump(regr_best, fid)  

In [19]:
for allele in alleles:
    params = list(results_df[results_df["allele"]==allele]["best_cv_param"])[0]
    train_best_model(allele, params, "complex") 