# Tuning and training the models for standard-pHLA-score

The inputs are the standard ref2015 features for the complex, already generated in ../Featurization/rosettaComplexEnergies.csv

We tune the parameters in a 5-fold cross-validation setting.


In [10]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import sklearn.linear_model as linear_model
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle
import statistics

## Load the standard ref2015 features

### Full dataset

In [2]:
## 1 - load the energies
def ene_to_array(ene_str):
    """Parse a serialized energy vector into a 1-D float array.

    Enclosing square brackets are stripped first, then any enclosing
    backslashes/parentheses, and the remaining comma-separated text is
    handed to ``np.fromstring``.

    Parameters
    ----------
    ene_str : str
        Text such as ``"[1.0, 2.0, ...]"``.

    Returns
    -------
    numpy.ndarray
        Float array parsed from the text; 20 values are requested from the
        parser (``count=20``).
    """
    ene_str = ene_str.strip("[]")
    # "\\()" is exactly the character set the former "\(\)" literal produced
    # (backslash, "(" and ")"), without relying on an invalid escape sequence.
    ene_str = ene_str.strip("\\()")
    return np.fromstring(ene_str, dtype=float, count=20, sep=", ")

# Read the precomputed energies table and keep only the six listed columns.
complex_ene = pd.read_csv("../Featurization/rosettaComplexEnergies.csv")[
    ["allele", "peptide", "binder", "ba", "energies", "total_energy"]
]
# Each "energies" cell is stored as text; convert it to a float array.
complex_ene["energies"] = complex_ene["energies"].apply(ene_to_array)
# Bare expression: shows the table as the cell output.
complex_ene

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148
...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209
77577,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525
77578,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591
77579,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734


### Map the full dataset to the training set

In [3]:
# Load the training split, keeping the listed columns (including fold_num).
train_set = pd.read_csv("../Datasets/train_set.csv")[
    ["allele", "peptide", "fileloc", "allele_type", "fold_num"]
]

# Inner join on (allele, peptide): only energy rows that also occur in the
# split file end up in the training set.
train_dataset = complex_ene.merge(
    train_set, how="inner", on=["allele", "peptide"], suffixes=["", "_y"]
)
train_dataset

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy,fileloc,allele_type,fold_num
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,2.0
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
...,...,...,...,...,...,...,...,...,...
69793,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69794,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69795,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,1.0
69796,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,2.0


In [4]:
# extracting features in training format
# and get the cross-validation iterator
def extract_features_Xy_cv(merged_df, allele, n_folds=5):
    """Build the feature matrix, targets and CV splits for one allele.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must provide the columns ``allele``, ``energies`` (one numeric
        sequence per row), ``ba``, ``binder`` and ``fold_num``.
    allele : str
        Only rows whose ``allele`` equals this value are used.
    n_folds : int, optional
        Number of folds; fold ids ``0 .. n_folds - 1`` are looked up in
        ``fold_num``. Defaults to 5.

    Returns
    -------
    tuple
        ``(X, y, y_l, cv_iter)``: ``X`` is a 2-D array with one row per
        sample (each ``energies`` vector without its last entry), ``y`` holds
        the ``ba`` values, ``y_l`` the ``binder`` values, and ``cv_iter`` is a
        list of ``(train_indices, test_indices)`` pairs of positional row
        indices, one pair per fold.

    Raises
    ------
    ValueError
        If ``merged_df`` contains no rows for ``allele``.
    """
    # reset_index makes the index positional, so it lines up with rows of X.
    allele_data = merged_df[merged_df["allele"] == allele].reset_index(drop=True)
    if allele_data.empty:
        raise ValueError("no rows found for allele {!r}".format(allele))

    # Drop the last entry of every energy vector and stack once; vstack over
    # a list always yields a 2-D matrix, even for a single sample.
    # NOTE(review): the dropped entry is presumably an aggregate term - confirm.
    X = np.vstack([np.asarray(ene)[:-1] for ene in allele_data["energies"]])

    # Regression target and binary label, in the same row order as X.
    y = np.array(allele_data["ba"].tolist())
    y_l = np.array(allele_data["binder"].tolist())

    # Predefined folds: rows tagged with the fold id form the test part, all
    # remaining rows (including any with a missing fold id) the train part.
    fold_ids = allele_data["fold_num"]
    cv_iter = []
    for fold in range(n_folds):
        in_fold = fold_ids == fold
        train_ind = allele_data.index[~in_fold].tolist()
        test_ind = allele_data.index[in_fold].tolist()
        cv_iter.append((train_ind, test_ind))

    return (X, y, y_l, cv_iter)
    

In [8]:

def param_tune_allele(allele, train_dataset, alphas=None, regr_types=None):
    """Pick the best linear model for one allele by cross-validation.

    Every requested model type is scored with ``cross_val_score`` on the
    allele's predefined folds (default estimator score, i.e. R^2 for these
    regressors); Lasso and Ridge are scored once per alpha. Progress is
    printed along the way.

    Parameters
    ----------
    allele : str
        Allele whose rows are selected from ``train_dataset``.
    train_dataset : pandas.DataFrame
        Training table accepted by ``extract_features_Xy_cv``.
    alphas : sequence of float, optional
        Regularisation strengths tried for Lasso and Ridge. Defaults to
        ``[1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]``.
    regr_types : sequence of str, optional
        Model types to try, out of ``"lr"``, ``"lasso"`` and ``"ridge"``.
        Defaults to all three, in that order.

    Returns
    -------
    tuple
        ``(None, None, best_cv_params, best_cv_scores)``. The two leading
        slots are always ``None`` here; the caller stores them as the
        out-of-bag parameter/score columns. ``best_cv_params`` is a dict
        ``{"type": ..., "alpha": ...}`` (``"alpha"`` is the placeholder
        ``"x"`` for plain linear regression) and ``best_cv_scores`` holds the
        per-fold scores of that model.

    Raises
    ------
    ValueError
        If ``regr_types`` contains an unknown model type.
    """
    if alphas is None:
        alphas = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    if regr_types is None:
        regr_types = ["lr", "lasso", "ridge"]

    allele_td = train_dataset[train_dataset["allele"] == allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_td, allele)

    # Start below every attainable score: the mean score can be negative, and
    # a baseline of 0 would then leave best_cv_params as None for the caller.
    best_cv_mscore = float("-inf")
    best_cv_scores = None
    best_cv_params = None

    for rt in regr_types:
        if rt == "lr":
            # No regularisation strength to tune; "x" is the stored placeholder.
            candidates = [(linear_model.LinearRegression(), "x")]
        elif rt == "lasso":
            candidates = [(linear_model.Lasso(alpha=a), a) for a in alphas]
        elif rt == "ridge":
            candidates = [(linear_model.Ridge(alpha=a), a) for a in alphas]
        else:
            raise ValueError("unknown regression type: {!r}".format(rt))

        print("CV")
        #cross-validation
        for regr_cv, alpha in candidates:
            cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs=-1)
            cv_mscore = statistics.mean(cv_scores)
            if rt != "lr":
                print("alpha: "+str(alpha))
            print("scores: ")
            print(cv_scores)
            print("mean score:")
            print(cv_mscore)
            # Strict ">" keeps the earliest candidate on ties.
            if cv_mscore > best_cv_mscore:
                best_cv_params = {"type": rt, "alpha": alpha}
                best_cv_scores = cv_scores
                best_cv_mscore = cv_mscore

    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (None, None, best_cv_params, best_cv_scores)
            
    
# Disabled code kept as a bare string literal (never executed): a randomized
# search over RandomForestRegressor settings that tracks both the best
# out-of-bag score and the best cross-validation score.
# It refers to grid_params, X_train_s and y_train_s, which in the visible
# source only occur inside another disabled string in param_tune_allele.
'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    best_oob_score = 0 
    best_oob_params = None
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)

'''   

# Second disabled copy of the random-forest search, again a bare string
# literal that is never executed. Being the last expression of the cell, its
# value is what the notebook echoes as the cell output.
# NOTE(review): this copy initialises regr_results instead of
# best_oob_score/best_oob_params, yet still reads best_oob_score in the
# comparison below; it would fail there if re-enabled as-is, unless those
# names are defined elsewhere.
'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    regr_results = {}
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)
'''

'\n    regr_oob = RandomForestRegressor(n_jobs=-1)\n    regr_cv = RandomForestRegressor(n_jobs=-1)\n\n    regr_results = {}\n    best_cv_mscore = 0\n    best_cv_scores = None\n    best_cv_params = None\n    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):\n        print("Parameter iteration: "+str(i))\n        print("OOB")\n        #out of bag\n        print(g)\n        regr_oob.set_params(**g)\n        regr_oob.fit(X_train_s,y_train_s)\n        print(regr_oob.oob_score_)\n        if regr_oob.oob_score_ > best_oob_score:\n            best_oob_params = g\n            best_oob_score = regr_oob.oob_score_\n        \n        print("CV")\n        #cross-validation\n        regr_cv.set_params(**g)\n        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)\n        cv_mscore = statistics.mean(cv_scores)\n        print(cv_scores)\n        print(cv_mscore)\n        if cv_mscore > best_cv_mscore:\n            best_cv_params = g\n            best_cv_score

### Parameter tuning

In [11]:
# One tuning run per distinct allele; each run contributes one entry to every
# list in `results`.
alleles = train_dataset["allele"].unique()
results = {"allele":[], "best_oob_param":[], "best_oob_score":[], "best_cv_param":[], "best_cv_scores":[]}

for allele in alleles:
    # Banner, printed as three separate lines.
    print("------------------------------------------------------------------------", "ALLELE", allele, sep="\n")
    res = param_tune_allele(allele, train_dataset)
    # The dict keeps insertion order, so its keys line up with this row.
    row = (allele, res[0], res[1], res[2], res[3])
    for column, value in zip(results, row):
        results[column].append(value)

------------------------------------------------------------------------
ALLELE
A0101
CV
scores: 
[0.2982224  0.47096091 0.40519824 0.33383093 0.38623554]
mean score:
0.37888960451880016
CV
alpha: 1e-15
scores: 
[0.2982224  0.47096091 0.40519824 0.33383096 0.38623554]
mean score:
0.3788896098674581
alpha: 1e-10
scores: 
[0.2982224  0.47096091 0.40519824 0.33383096 0.38623554]
mean score:
0.3788896098680554
alpha: 1e-08
scores: 
[0.29822241 0.47096091 0.40519826 0.33383095 0.38623553]
mean score:
0.3788896099271871
alpha: 0.0001
scores: 
[0.29826466 0.47089101 0.40540782 0.33374833 0.38612796]
mean score:
0.37888795637549794
alpha: 0.001
scores: 
[0.29843989 0.47002188 0.40710832 0.33283091 0.38574372]
mean score:
0.3788289455968112
alpha: 0.01
scores: 
[0.29161938 0.45908654 0.40686856 0.32681651 0.38185655]
mean score:
0.3732495091412688
alpha: 1
scores: 
[-0.01963633  0.02827773  0.03056814  0.03089224  0.0278551 ]
mean score:
0.019591375871754347
alpha: 5
scores: 
[-2.17247373e-05 -

alpha: 1
scores: 
[ 0.02051506  0.02024698 -0.01384754  0.01891048  0.01030658]
mean score:
0.011226312265339277
alpha: 5
scores: 
[-1.16853688e-07 -8.81720255e-05 -8.84772424e-05 -2.40254347e-04
 -1.78608243e-05]
mean score:
-8.697625867353232e-05
alpha: 10
scores: 
[-1.16853688e-07 -8.81720255e-05 -8.84772424e-05 -2.40254347e-04
 -1.78608243e-05]
mean score:
-8.697625867353232e-05
alpha: 20
scores: 
[-1.16853688e-07 -8.81720255e-05 -8.84772424e-05 -2.40254347e-04
 -1.78608243e-05]
mean score:
-8.697625867353232e-05
CV
alpha: 1e-15
scores: 
[0.28112227 0.24292334 0.16911049 0.31267887 0.26804734]
mean score:
0.254776459848749
alpha: 1e-10
scores: 
[0.28112227 0.24292334 0.16911049 0.31267887 0.26804734]
mean score:
0.2547764598487532
alpha: 1e-08
scores: 
[0.28112227 0.24292334 0.16911049 0.31267887 0.26804734]
mean score:
0.2547764598491191
alpha: 0.0001
scores: 
[0.28112226 0.24292334 0.1691105  0.31267888 0.26804734]
mean score:
0.25477646354658257
alpha: 0.001
scores: 
[0.28112226

alpha: 1e-10
scores: 
[0.30945218 0.20403459 0.29285942 0.26811353 0.34816797]
mean score:
0.284525540335575
alpha: 1e-08
scores: 
[0.30945218 0.20403459 0.29285942 0.26811353 0.34816797]
mean score:
0.284525540335706
alpha: 0.0001
scores: 
[0.30945218 0.20403459 0.29285942 0.26811353 0.34816797]
mean score:
0.2845255416572463
alpha: 0.001
scores: 
[0.30945218 0.20403463 0.29285943 0.26811355 0.34816798]
mean score:
0.28452555355223347
alpha: 0.01
scores: 
[0.30945216 0.20403497 0.29285949 0.26811374 0.348168  ]
mean score:
0.28452567249504973
alpha: 1
scores: 
[0.30944958 0.20407218 0.29286621 0.26813462 0.3481708 ]
mean score:
0.28453867852224973
alpha: 5
scores: 
[0.30943895 0.20421801 0.29289211 0.26821799 0.3481815 ]
mean score:
0.2845897112060723
alpha: 10
scores: 
[0.30942522 0.20439075 0.29292185 0.26832013 0.34819352]
mean score:
0.284650295478685
alpha: 20
scores: 
[0.30939634 0.20470827 0.29297351 0.26851809 0.34821356]
mean score:
0.2847619537252214
Best CV 0.28502955541721

alpha: 10
scores: 
[0.38075876 0.41272251 0.40829179 0.39043436 0.34621377]
mean score:
0.3876842370025266
alpha: 20
scores: 
[0.3807198  0.41263557 0.40845361 0.39062208 0.34626057]
mean score:
0.38773832345462467
Best CV 0.38773832345462467
{'type': 'ridge', 'alpha': 20}
------------------------------------------------------------------------
ALLELE
A3101
CV
scores: 
[0.31743513 0.20749668 0.25081092 0.33216788 0.22639306]
mean score:
0.266860735173453
CV
alpha: 1e-15
scores: 
[0.31743513 0.20749668 0.25081092 0.33216788 0.22639306]
mean score:
0.2668607351734554
alpha: 1e-10
scores: 
[0.31743513 0.20749668 0.25081092 0.33216788 0.22639306]
mean score:
0.26686073527238274
alpha: 1e-08
scores: 
[0.31743513 0.20749671 0.25081091 0.33216789 0.22639308]
mean score:
0.26686074506719243
alpha: 0.0001
scores: 
[0.31743205 0.20783159 0.25070341 0.33226701 0.22655317]
mean score:
0.26695744712697556
alpha: 0.001
scores: 
[0.31722719 0.21065722 0.25150518 0.33296114 0.22774303]
mean score:
0.2

alpha: 0.0001
scores: 
[0.18517416 0.15895175 0.17366694 0.19496254 0.21722658]
mean score:
0.185996392620639
alpha: 0.001
scores: 
[0.18760792 0.16083373 0.17381515 0.19587718 0.2162051 ]
mean score:
0.18686781609149203
alpha: 0.01
scores: 
[0.18776646 0.15265505 0.17145563 0.19103878 0.20100751]
mean score:
0.18078468651034202
alpha: 1
scores: 
[0.04644322 0.03821431 0.05302493 0.06128646 0.05983335]
mean score:
0.051760453169205856
alpha: 5
scores: 
[-8.88187141e-07 -5.51323523e-06 -1.94385776e-05 -9.20856896e-06
 -2.26544823e-05]
mean score:
-1.1540610245930338e-05
alpha: 10
scores: 
[-8.88187141e-07 -5.51323523e-06 -1.94385776e-05 -9.20856896e-06
 -2.26544823e-05]
mean score:
-1.1540610245930338e-05
alpha: 20
scores: 
[-8.88187141e-07 -5.51323523e-06 -1.94385776e-05 -9.20856896e-06
 -2.26544823e-05]
mean score:
-1.1540610245930338e-05
CV
alpha: 1e-15
scores: 
[0.18483245 0.15871013 0.17361629 0.19475832 0.21729382]
mean score:
0.18584220157186565
alpha: 1e-10
scores: 
[0.18483245 

alpha: 20
scores: 
[-4.87555364e-05 -2.68278811e-05 -5.11295821e-06 -6.16073920e-06
 -3.45559504e-05]
mean score:
-2.4282613049031453e-05
CV
alpha: 1e-15
scores: 
[0.31267096 0.28708735 0.36544733 0.34433995 0.324741  ]
mean score:
0.3268573191281945
alpha: 1e-10
scores: 
[0.31267096 0.28708735 0.36544733 0.34433995 0.324741  ]
mean score:
0.3268573191099285
alpha: 1e-08
scores: 
[0.31267096 0.28708735 0.36544733 0.34433995 0.324741  ]
mean score:
0.3268573191095724
alpha: 0.0001
scores: 
[0.31267097 0.28708734 0.36544733 0.34433995 0.324741  ]
mean score:
0.3268573155098646
alpha: 0.001
scores: 
[0.312671   0.28708722 0.36544731 0.34433988 0.324741  ]
mean score:
0.3268572833340582
alpha: 0.01
scores: 
[0.31267137 0.28708616 0.36544712 0.34433923 0.32474104]
mean score:
0.3268569825918293
alpha: 1
scores: 
[0.31267874 0.28706913 0.36545272 0.3443427  0.32474531]
mean score:
0.3268577202373869
alpha: 5
scores: 
[0.31268767 0.28706651 0.36549251 0.34440511 0.32476226]
mean score:
0.3268

CV
scores: 
[0.3075436  0.32607718 0.33191839 0.37741869 0.36508299]
mean score:
0.34160816964746865
CV
alpha: 1e-15
scores: 
[0.3075436  0.32607711 0.33191839 0.37741867 0.36508299]
mean score:
0.3416081531814345
alpha: 1e-10
scores: 
[0.3075436  0.32607711 0.33191839 0.37741867 0.36508299]
mean score:
0.34160815335108474
alpha: 1e-08
scores: 
[0.30754361 0.32607713 0.33191841 0.37741869 0.36508302]
mean score:
0.3416081701466887
alpha: 0.0001
scores: 
[0.30761391 0.32624695 0.33212414 0.37758958 0.36531627]
mean score:
0.3417781702762854
alpha: 0.001
scores: 
[0.30837314 0.32764478 0.33372263 0.37795723 0.36733208]
mean score:
0.3430059709845171
alpha: 0.01
scores: 
[0.31216288 0.32820865 0.32720816 0.37530413 0.3644158 ]
mean score:
0.3414599225890057
alpha: 1
scores: 
[0.0180723  0.0486601  0.04559266 0.03705783 0.05335033]
mean score:
0.040546642337836294
alpha: 5
scores: 
[-1.03310898e-07 -3.74868588e-07 -5.10519590e-07 -4.86447664e-07
 -1.05961684e-06]
mean score:
-5.06952716028

alpha: 20
scores: 
[-2.92151124e-06 -4.82956528e-05 -4.26492992e-05 -2.38572496e-05
 -3.17591053e-06]
mean score:
-2.4179924678602303e-05
CV
alpha: 1e-15
scores: 
[ 0.09639235  0.09235666 -0.07000084  0.10716263  0.10519765]
mean score:
0.06622168922131474
alpha: 1e-10
scores: 
[ 0.09639235  0.09235666 -0.07000084  0.10716263  0.10519765]
mean score:
0.06622168921277213
alpha: 1e-08
scores: 
[ 0.09639235  0.09235666 -0.07000084  0.10716263  0.10519765]
mean score:
0.06622168923796531
alpha: 0.0001
scores: 
[ 0.09639235  0.09235669 -0.07000002  0.10716265  0.10519805]
mean score:
0.06622194308278662
alpha: 0.001
scores: 
[ 0.09639235  0.09235691 -0.06999288  0.10716288  0.10520161]
mean score:
0.06622417247761035
alpha: 0.01
scores: 
[ 0.09639238  0.09235867 -0.06993551  0.10716467  0.10523016]
mean score:
0.06624207209273777
alpha: 1
scores: 
[ 0.09639535  0.0923723  -0.06968098  0.10717361  0.10535379]
mean score:
0.06632281444374175
alpha: 5
scores: 
[ 0.09640721  0.09239982 -0.06965

alpha: 0.01
scores: 
[0.26359305 0.2711903  0.18838907 0.21677257 0.27299106]
mean score:
0.24258720905685388
alpha: 1
scores: 
[0.26363489 0.2711836  0.18840442 0.21680745 0.272979  ]
mean score:
0.24260187157589794
alpha: 5
scores: 
[0.26380126 0.27115677 0.18846542 0.21694601 0.27293066]
mean score:
0.2426600242112091
alpha: 10
scores: 
[0.26400332 0.27112384 0.18853945 0.21711405 0.27287102]
mean score:
0.24273033717042417
alpha: 20
scores: 
[0.26438884 0.27105992 0.18868058 0.21743388 0.27275433]
mean score:
0.2428635096364556
Best CV 0.24317312239649125
{'type': 'lasso', 'alpha': 0.01}
------------------------------------------------------------------------
ALLELE
C1601
CV
scores: 
[0.25229201 0.30155515 0.28002156 0.29207355 0.34815107]
mean score:
0.29481867003950585
CV
alpha: 1e-15
scores: 
[0.25229201 0.30155515 0.28002156 0.29207355 0.34815107]
mean score:
0.2948186700395081
alpha: 1e-10
scores: 
[0.25229201 0.30155516 0.28002156 0.29207355 0.34815107]
mean score:
0.29481867

In [12]:
# Turn the per-allele tuning results (dict of equally long lists) into a table
# and write it out twice: once as a pickle and once as a CSV file.
results_df = pd.DataFrame(results)
results_df.to_pickle("crossval_complex_LR.pkl")
results_df.to_csv("crossval_complex_LR.csv")
# Bare expression: shows the table as the cell output.
results_df

Unnamed: 0,allele,best_oob_param,best_oob_score,best_cv_param,best_cv_scores
0,A0101,,,"{'type': 'ridge', 'alpha': 20}","[0.2983118750472664, 0.47080164862708956, 0.40..."
1,A0201,,,"{'type': 'ridge', 'alpha': 20}","[0.13450900686900025, 0.2430719038517034, 0.21..."
2,A0203,,,"{'type': 'ridge', 'alpha': 20}","[0.31018903668743536, 0.33699359546249275, 0.2..."
3,A0206,,,"{'type': 'ridge', 'alpha': 20}","[0.2810446351934809, 0.2425511363888344, 0.170..."
4,A0301,,,"{'type': 'lasso', 'alpha': 0.001}","[0.10126526918326229, 0.12033199500797997, 0.1..."
5,A1101,,,"{'type': 'lasso', 'alpha': 0.001}","[0.158015338797851, 0.12491787995466196, 0.158..."
6,A2301,,,"{'type': 'lasso', 'alpha': 0.001}","[0.30905182930375996, 0.204975248491207, 0.292..."
7,A2402,,,"{'type': 'lasso', 'alpha': 0.001}","[0.2031982865952364, 0.1509861517732577, 0.191..."
8,A2601,,,"{'type': 'lasso', 'alpha': 0.001}","[0.20517995749678386, 0.18663554881277267, 0.1..."
9,A2902,,,"{'type': 'ridge', 'alpha': 20}","[0.38071979520508853, 0.41263556672031704, 0.4..."


### Training the models with best parameters on the full training dataset

In [16]:
def train_best_model(allele, params, exp_name):
    """Fit the selected linear model on all training rows of one allele and pickle it.

    Reads the module-level ``train_dataset``; the fitted estimator is written
    to ``./final_LR_REGRmodels/<allele><exp_name>.pkl``.

    Parameters
    ----------
    allele : str
        Allele whose rows of ``train_dataset`` are used for fitting.
    params : dict
        Must contain ``"type"`` (``"lr"``, ``"lasso"`` or ``"ridge"``); for
        Lasso and Ridge, ``"alpha"`` is the regularisation strength.
    exp_name : str
        Suffix appended to the allele name in the output file name.

    Raises
    ------
    ValueError
        If ``params`` is ``None`` or names an unknown model type.
    """
    import os  # local import: not part of the file's top-level import block

    if params is None:
        raise ValueError(
            "no tuned parameters available for allele {!r}".format(allele)
        )

    rt = params["type"]
    alpha = params.get("alpha")
    if rt == "lr":
        regr_best = linear_model.LinearRegression()
    elif rt == "lasso":
        regr_best = linear_model.Lasso(alpha=alpha)
    elif rt == "ridge":
        regr_best = linear_model.Ridge(alpha=alpha)
    else:
        # Previously an unknown type surfaced as an UnboundLocalError at fit time.
        raise ValueError("unknown regression type: {!r}".format(rt))

    allele_train_dataset = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best.fit(X_train, y_train)

    # Create the output directory on first use instead of failing on open().
    out_dir = './final_LR_REGRmodels/'
    os.makedirs(out_dir, exist_ok=True)
    with open(out_dir+allele+exp_name+'.pkl', 'wb') as fid:
        cPickle.dump(regr_best, fid)

In [17]:
for allele in alleles:
    # Take the first stored "best_cv_param" entry among this allele's rows.
    allele_rows = results_df["allele"] == allele
    params = results_df.loc[allele_rows, "best_cv_param"].iloc[0]
    train_best_model(allele, params, "complex")