# Tuning and training the models for standard-pHLA-score

Input are the standard ref2015 features for complex, already generated in ../Featurization/rosettaComplexEnergies.csv

We tune the parameters in the 5-fold-crossvalidation setting.


In [5]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import sklearn.linear_model as linear_model
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle
import statistics
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

## Load the standard ref2015 features

### Full dataset

In [6]:
## 1 - load the energies
def ene_to_array(ene_str):
    ene_str = ene_str.strip("[]")
    ene_str = ene_str.strip("\(\)")
    return np.fromstring(ene_str, dtype=float, count = 20, sep=", ")

complex_ene = pd.read_csv("../Featurization/rosettaComplexEnergies.csv")
complex_ene = complex_ene[["allele", "peptide", "binder", "ba", "energies", "total_energy"]]
complex_ene["energies"] = complex_ene["energies"].apply(ene_to_array)
complex_ene

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148
...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209
77577,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525
77578,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591
77579,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734


### Map the full dataset to the training set

In [7]:
#Load split
train_set = pd.read_csv("../Datasets/train_set.csv")
train_set = train_set[["allele", "peptide", "fileloc", "allele_type", "fold_num"]]

#Merge to form the training set
train_dataset = pd.merge(complex_ene, train_set, on=["allele", "peptide"], suffixes=["", "_y"], how="inner")
train_dataset

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy,fileloc,allele_type,fold_num
0,A0101,YLEQLHQLY,1,0.574375,"[-2306.77360984, 811.85340549, 1461.80507136, ...",44.784793,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,2.0
1,A0101,HSERHVLLY,1,0.574375,"[-2293.38661156, 760.61067588, 1470.48909157, ...",-15.285789,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
2,A0101,MTDPEMVEV,1,0.574375,"[-2276.11339533, 793.72608921, 1465.61096698, ...",96.468759,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
3,A0101,LTDFIREEY,1,0.574375,"[-2295.34585307, 757.61420988, 1473.41382942, ...",59.739557,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
4,A0101,LLDQRPAWY,1,0.574375,"[-2293.03299876, 860.34732958, 1461.86885467, ...",103.222148,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
...,...,...,...,...,...,...,...,...,...
69793,C1601,QQTTTSFQN,0,0.000000,"[-2289.00765669, 2426.75217972, 1479.38435418,...",1470.688209,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69794,C1601,QQVEQMEIP,0,0.000000,"[-2303.64763419, 2393.02916873, 1484.9113067, ...",1481.597525,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69795,C1601,QQWQVFSAE,0,0.000000,"[-2296.25090083, 2349.00146681, 1475.89378426,...",1404.053591,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,1.0
69796,C1601,QRCVVLRFL,0,0.000000,"[-2308.57295453, 2368.37391254, 1478.19777993,...",1426.372734,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,2.0


In [8]:
# extracting features in training format
# and get the cross-validation iterator
def extract_features_Xy_cv(merged_df, allele):
    allele_data = merged_df[merged_df["allele"]==allele]
    allele_data["enefeat"] = allele_data["energies"].apply(lambda x: x[:-1])
    allele_data = allele_data.reset_index(drop=True)
    flag = 0
    for index, row in allele_data.iterrows():
        if flag == 0:
            X = np.array(row['enefeat'])
            flag = 1
        else: 
            X = np.vstack((X, row['enefeat']))
    #extract binding energies        
    y = np.array(list(allele_data["ba"]))
    y_l = np.array(list(allele_data["binder"]))
    
    cv_iter = []
    for split in range(5):
        test_ind = allele_data.index[(allele_data['fold_num'] == split)].tolist()
        train_ind = allele_data.index[~(allele_data['fold_num'] == split)].tolist()
        cv_iter.append((train_ind, test_ind))
        
    return (X, y, y_l, cv_iter)
    

In [9]:

def param_tune_allele(allele, train_dataset):
    
    allele_td = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_td, allele)

    
    grid_params = {'C': [1e-2, 1, 10, 100], 
            'kernel': ["linear", "rbf"], 
            'degree': [3, 5, 7],
            }
    
    
    regr_results = {}
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    regr_svr = SVR()
    
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=60)):
        print("CV")
        print(g)
        #cross-validation
        regr_svr.set_params(**g)
        regr = make_pipeline(StandardScaler(), regr_svr)
        cv_scores = cross_val_score(regr, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
  
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (None, None, best_cv_params, best_cv_scores)
            
    
'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    best_oob_score = 0 
    best_oob_params = None
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)

'''   

'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    regr_results = {}
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)
'''

'\n    regr_oob = RandomForestRegressor(n_jobs=-1)\n    regr_cv = RandomForestRegressor(n_jobs=-1)\n\n    regr_results = {}\n    best_cv_mscore = 0\n    best_cv_scores = None\n    best_cv_params = None\n    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):\n        print("Parameter iteration: "+str(i))\n        print("OOB")\n        #out of bag\n        print(g)\n        regr_oob.set_params(**g)\n        regr_oob.fit(X_train_s,y_train_s)\n        print(regr_oob.oob_score_)\n        if regr_oob.oob_score_ > best_oob_score:\n            best_oob_params = g\n            best_oob_score = regr_oob.oob_score_\n        \n        print("CV")\n        #cross-validation\n        regr_cv.set_params(**g)\n        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)\n        cv_mscore = statistics.mean(cv_scores)\n        print(cv_scores)\n        print(cv_mscore)\n        if cv_mscore > best_cv_mscore:\n            best_cv_params = g\n            best_cv_score

### Parameter tuning

In [10]:
alleles = train_dataset["allele"].unique()
results = {"allele":[], "best_oob_param":[], "best_oob_score":[], "best_cv_param":[], "best_cv_scores":[]}

for allele in alleles:
    print("------------------------------------------------------------------------")
    print("ALLELE")
    print(allele)
    res = param_tune_allele(allele, train_dataset)
    results["allele"].append(allele)
    results["best_oob_param"].append(res[0])
    results["best_oob_score"].append(res[1])
    results["best_cv_param"].append(res[2])
    results["best_cv_scores"].append(res[3])

------------------------------------------------------------------------
ALLELE
A0101
CV
{'kernel': 'linear', 'degree': 3, 'C': 0.01}




[0.24790635 0.46741333 0.40640486 0.32203194 0.38248775]
0.36524884712619443
CV
{'kernel': 'rbf', 'degree': 3, 'C': 0.01}
[0.39283482 0.50007978 0.44811126 0.36060306 0.41968023]
0.424261831211713
CV
{'kernel': 'linear', 'degree': 5, 'C': 0.01}
[0.24790635 0.46741333 0.40640486 0.32203194 0.38248775]
0.36524884712619443
CV
{'kernel': 'rbf', 'degree': 5, 'C': 0.01}
[0.39283482 0.50007978 0.44811126 0.36060306 0.41968023]
0.424261831211713
CV
{'kernel': 'linear', 'degree': 7, 'C': 0.01}
[0.24790635 0.46741333 0.40640486 0.32203194 0.38248775]
0.36524884712619443
CV
{'kernel': 'rbf', 'degree': 7, 'C': 0.01}
[0.39283482 0.50007978 0.44811126 0.36060306 0.41968023]
0.424261831211713
CV
{'kernel': 'linear', 'degree': 3, 'C': 1}
[0.24137458 0.46655218 0.40604237 0.32161389 0.38334019]
0.3637846417525729
CV
{'kernel': 'rbf', 'degree': 3, 'C': 1}
[0.36356965 0.50844315 0.43187876 0.34806598 0.40138479]
0.4106684670514769
CV
{'kernel': 'linear', 'degree': 5, 'C': 1}
[0.24137458 0.46655218 0.4060

[-0.03551504  0.04257948 -0.07595399  0.01832407 -0.04154285]
-0.01842166708323396
CV
{'kernel': 'linear', 'degree': 3, 'C': 100}
[0.2986874  0.32485742 0.28011725 0.27770057 0.23699384]
0.28367129678392
CV
{'kernel': 'rbf', 'degree': 3, 'C': 100}
[-0.25266251 -0.23504625 -0.26561839 -0.1869369  -0.22662165]
-0.23337714075041044
CV
{'kernel': 'linear', 'degree': 5, 'C': 100}
[0.2986874  0.32485742 0.28011725 0.27770057 0.23699384]
0.28367129678392
CV
{'kernel': 'rbf', 'degree': 5, 'C': 100}
[-0.25266251 -0.23504625 -0.26561839 -0.1869369  -0.22662165]
-0.23337714075041044
CV
{'kernel': 'linear', 'degree': 7, 'C': 100}
[0.2986874  0.32485742 0.28011725 0.27770057 0.23699384]
0.28367129678392
CV
{'kernel': 'rbf', 'degree': 7, 'C': 100}
[-0.25266251 -0.23504625 -0.26561839 -0.1869369  -0.22662165]
-0.23337714075041044
Best CV 0
{'kernel': 'linear', 'degree': 7, 'C': 100}
------------------------------------------------------------------------
ALLELE
A0206
CV
{'kernel': 'linear', 'degree':

[0.13396911 0.08562181 0.12284259 0.11678928 0.07371197]
0.10658695311453428
CV
{'kernel': 'rbf', 'degree': 5, 'C': 1}
[0.18670587 0.1035737  0.18572045 0.14631261 0.03314966]
0.13109245882766235
CV
{'kernel': 'linear', 'degree': 7, 'C': 1}
[0.13396911 0.08562181 0.12284259 0.11678928 0.07371197]
0.10658695311453428
CV
{'kernel': 'rbf', 'degree': 7, 'C': 1}
[0.18670587 0.1035737  0.18572045 0.14631261 0.03314966]
0.13109245882766235
CV
{'kernel': 'linear', 'degree': 3, 'C': 10}
[0.13361336 0.08439676 0.1233514  0.1166407  0.07378692]
0.1063578287124004
CV
{'kernel': 'rbf', 'degree': 3, 'C': 10}
[-0.0517889  -0.09046532 -0.05519214 -0.06763689 -0.18399791]
-0.08981623171964559
CV
{'kernel': 'linear', 'degree': 5, 'C': 10}
[0.13361336 0.08439676 0.1233514  0.1166407  0.07378692]
0.1063578287124004
CV
{'kernel': 'rbf', 'degree': 5, 'C': 10}
[-0.0517889  -0.09046532 -0.05519214 -0.06763689 -0.18399791]
-0.08981623171964559
CV
{'kernel': 'linear', 'degree': 7, 'C': 10}
[0.13361336 0.0843967

CV
{'kernel': 'linear', 'degree': 3, 'C': 0.01}
[0.17909317 0.14831959 0.09901836 0.16755522 0.12648268]
0.1440938024276995
CV
{'kernel': 'rbf', 'degree': 3, 'C': 0.01}
[0.1760618  0.16018526 0.13057891 0.18407729 0.15496803]
0.16117425652518785
CV
{'kernel': 'linear', 'degree': 5, 'C': 0.01}
[0.17909317 0.14831959 0.09901836 0.16755522 0.12648268]
0.1440938024276995
CV
{'kernel': 'rbf', 'degree': 5, 'C': 0.01}
[0.1760618  0.16018526 0.13057891 0.18407729 0.15496803]
0.16117425652518785
CV
{'kernel': 'linear', 'degree': 7, 'C': 0.01}
[0.17909317 0.14831959 0.09901836 0.16755522 0.12648268]
0.1440938024276995
CV
{'kernel': 'rbf', 'degree': 7, 'C': 0.01}
[0.1760618  0.16018526 0.13057891 0.18407729 0.15496803]
0.16117425652518785
CV
{'kernel': 'linear', 'degree': 3, 'C': 1}
[0.18184051 0.14909744 0.10025593 0.16826404 0.11898024]
0.14368763280388636
CV
{'kernel': 'rbf', 'degree': 3, 'C': 1}
[0.17644505 0.26416375 0.11684426 0.23320754 0.15829296]
0.18979071078900583
CV
{'kernel': 'linear

[ 0.03999405 -0.08864055 -0.09662807 -0.00917457  0.01573679]
-0.02774246852908
CV
{'kernel': 'linear', 'degree': 3, 'C': 100}
[0.30384394 0.18836982 0.23105994 0.31151551 0.21332741]
0.24962332453472885
CV
{'kernel': 'rbf', 'degree': 3, 'C': 100}
[-0.05938652 -0.21085484 -0.21111777 -0.10307258 -0.13564384]
-0.14401511158266586
CV
{'kernel': 'linear', 'degree': 5, 'C': 100}
[0.30384394 0.18836982 0.23105994 0.31151551 0.21332741]
0.24962332453472885
CV
{'kernel': 'rbf', 'degree': 5, 'C': 100}
[-0.05938652 -0.21085484 -0.21111777 -0.10307258 -0.13564384]
-0.14401511158266586
CV
{'kernel': 'linear', 'degree': 7, 'C': 100}
[0.30384394 0.18836982 0.23105994 0.31151551 0.21332741]
0.24962332453472885
CV
{'kernel': 'rbf', 'degree': 7, 'C': 100}
[-0.05938652 -0.21085484 -0.21111777 -0.10307258 -0.13564384]
-0.14401511158266586
Best CV 0
{'kernel': 'linear', 'degree': 7, 'C': 100}
------------------------------------------------------------------------
ALLELE
A6801
CV
{'kernel': 'linear', 'de

[0.14661501 0.13388807 0.13909899 0.15539277 0.15342917]
0.14568480218244573
CV
{'kernel': 'rbf', 'degree': 5, 'C': 1}
[0.20720682 0.27241107 0.14905249 0.19370462 0.24886643]
0.21424828637298785
CV
{'kernel': 'linear', 'degree': 7, 'C': 1}
[0.14661501 0.13388807 0.13909899 0.15539277 0.15342917]
0.14568480218244573
CV
{'kernel': 'rbf', 'degree': 7, 'C': 1}
[0.20720682 0.27241107 0.14905249 0.19370462 0.24886643]
0.21424828637298785
CV
{'kernel': 'linear', 'degree': 3, 'C': 10}
[0.14692428 0.13427996 0.13923641 0.15523975 0.15342717]
0.14582151366751145
CV
{'kernel': 'rbf', 'degree': 3, 'C': 10}
[-0.08963806  0.04597985 -0.01129904 -0.06322326  0.02544438]
-0.018547226588931598
CV
{'kernel': 'linear', 'degree': 5, 'C': 10}
[0.14692428 0.13427996 0.13923641 0.15523975 0.15342717]
0.14582151366751145
CV
{'kernel': 'rbf', 'degree': 5, 'C': 10}
[-0.08963806  0.04597985 -0.01129904 -0.06322326  0.02544438]
-0.018547226588931598
CV
{'kernel': 'linear', 'degree': 7, 'C': 10}
[0.14692428 0.134

CV
{'kernel': 'linear', 'degree': 3, 'C': 0.01}
[0.30370233 0.28349142 0.36089166 0.34020613 0.31189588]
0.32003748210386956
CV
{'kernel': 'rbf', 'degree': 3, 'C': 0.01}
[0.30885526 0.27149588 0.30948696 0.32864145 0.33323361]
0.3103426312395036
CV
{'kernel': 'linear', 'degree': 5, 'C': 0.01}
[0.30370233 0.28349142 0.36089166 0.34020613 0.31189588]
0.32003748210386956
CV
{'kernel': 'rbf', 'degree': 5, 'C': 0.01}
[0.30885526 0.27149588 0.30948696 0.32864145 0.33323361]
0.3103426312395036
CV
{'kernel': 'linear', 'degree': 7, 'C': 0.01}
[0.30370233 0.28349142 0.36089166 0.34020613 0.31189588]
0.32003748210386956
CV
{'kernel': 'rbf', 'degree': 7, 'C': 0.01}
[0.30885526 0.27149588 0.30948696 0.32864145 0.33323361]
0.3103426312395036
CV
{'kernel': 'linear', 'degree': 3, 'C': 1}
[0.30167304 0.28077434 0.3632434  0.33958383 0.31252611]
0.31956014518019754
CV
{'kernel': 'rbf', 'degree': 3, 'C': 1}
[0.31015229 0.23576469 0.29056286 0.30647786 0.31849605]
0.2922907514730506
CV
{'kernel': 'linear'

[ 0.04453964  0.01040832  0.01942767  0.03419365 -0.04169896]
0.013374063808608306
CV
{'kernel': 'linear', 'degree': 3, 'C': 100}
[0.29885734 0.2879405  0.26544601 0.27631113 0.31810545]
0.28933208684658296
CV
{'kernel': 'rbf', 'degree': 3, 'C': 100}
[-0.07171276 -0.10972233 -0.10212297 -0.05940626 -0.21276483]
-0.11114583214849069
CV
{'kernel': 'linear', 'degree': 5, 'C': 100}
[0.29885734 0.2879405  0.26544601 0.27631113 0.31810545]
0.28933208684658296
CV
{'kernel': 'rbf', 'degree': 5, 'C': 100}
[-0.07171276 -0.10972233 -0.10212297 -0.05940626 -0.21276483]
-0.11114583214849069
CV
{'kernel': 'linear', 'degree': 7, 'C': 100}
[0.29885734 0.2879405  0.26544601 0.27631113 0.31810545]
0.28933208684658296
CV
{'kernel': 'rbf', 'degree': 7, 'C': 100}
[-0.07171276 -0.10972233 -0.10212297 -0.05940626 -0.21276483]
-0.11114583214849069
Best CV 0
{'kernel': 'linear', 'degree': 7, 'C': 100}
------------------------------------------------------------------------
ALLELE
B3901
CV
{'kernel': 'linear', 

[0.00141365 0.00058769 0.00100647 0.00081174 0.00066255]
0.0008964180695690604
CV
{'kernel': 'rbf', 'degree': 5, 'C': 1}
[-0.0943918  -0.0205241  -0.05637075  0.03478154 -0.00221631]
-0.027744283547790394
CV
{'kernel': 'linear', 'degree': 7, 'C': 1}
[0.00141365 0.00058769 0.00100647 0.00081174 0.00066255]
0.0008964180695690604
CV
{'kernel': 'rbf', 'degree': 7, 'C': 1}
[-0.0943918  -0.0205241  -0.05637075  0.03478154 -0.00221631]
-0.027744283547790394
CV
{'kernel': 'linear', 'degree': 3, 'C': 10}
[0.0013342  0.00033219 0.00101496 0.00073532 0.00038471]
0.0007602742820435049
CV
{'kernel': 'rbf', 'degree': 3, 'C': 10}
[-0.36729549 -0.23850344 -0.28505831 -0.20194096 -0.2388062 ]
-0.26632087977414354
CV
{'kernel': 'linear', 'degree': 5, 'C': 10}
[0.0013342  0.00033219 0.00101496 0.00073532 0.00038471]
0.0007602742820435049
CV
{'kernel': 'rbf', 'degree': 5, 'C': 10}
[-0.36729549 -0.23850344 -0.28505831 -0.20194096 -0.2388062 ]
-0.26632087977414354
CV
{'kernel': 'linear', 'degree': 7, 'C': 1

[-0.52691349 -0.52678759 -0.46211704 -0.50396012 -0.70082972]
-0.5441215921808595
Best CV 0
{'kernel': 'rbf', 'degree': 7, 'C': 1}
------------------------------------------------------------------------
ALLELE
B5701
CV
{'kernel': 'linear', 'degree': 3, 'C': 0.01}
[0.02559032 0.06944203 0.08844078 0.04372285 0.05682567]
0.0568043302743731
CV
{'kernel': 'rbf', 'degree': 3, 'C': 0.01}
[0.09368614 0.14866006 0.15766952 0.09460536 0.1251709 ]
0.12395839509424875
CV
{'kernel': 'linear', 'degree': 5, 'C': 0.01}
[0.02559032 0.06944203 0.08844078 0.04372285 0.05682567]
0.0568043302743731
CV
{'kernel': 'rbf', 'degree': 5, 'C': 0.01}
[0.09368614 0.14866006 0.15766952 0.09460536 0.1251709 ]
0.12395839509424875
CV
{'kernel': 'linear', 'degree': 7, 'C': 0.01}
[0.02559032 0.06944203 0.08844078 0.04372285 0.05682567]
0.0568043302743731
CV
{'kernel': 'rbf', 'degree': 7, 'C': 0.01}
[0.09368614 0.14866006 0.15766952 0.09460536 0.1251709 ]
0.12395839509424875
CV
{'kernel': 'linear', 'degree': 3, 'C': 1}


[ 0.06180877  0.07555738 -0.03727402  0.07102303  0.10631441]
0.05548591576290603
CV
{'kernel': 'linear', 'degree': 7, 'C': 10}
[0.21406561 0.26262674 0.15511269 0.1760547  0.25735715]
0.2130433790587499
CV
{'kernel': 'rbf', 'degree': 7, 'C': 10}
[ 0.06180877  0.07555738 -0.03727402  0.07102303  0.10631441]
0.05548591576290603
CV
{'kernel': 'linear', 'degree': 3, 'C': 100}
[0.21349653 0.26229432 0.15443133 0.17480939 0.25740027]
0.21248636756420813
CV
{'kernel': 'rbf', 'degree': 3, 'C': 100}
[-0.17858372 -0.14943142 -0.35609494 -0.17512511 -0.08419317]
-0.1886856728404877
CV
{'kernel': 'linear', 'degree': 5, 'C': 100}
[0.21349653 0.26229432 0.15443133 0.17480939 0.25740027]
0.21248636756420813
CV
{'kernel': 'rbf', 'degree': 5, 'C': 100}
[-0.17858372 -0.14943142 -0.35609494 -0.17512511 -0.08419317]
-0.1886856728404877
CV
{'kernel': 'linear', 'degree': 7, 'C': 100}
[0.21349653 0.26229432 0.15443133 0.17480939 0.25740027]
0.21248636756420813
CV
{'kernel': 'rbf', 'degree': 7, 'C': 100}
[-0

In [11]:
results_df = pd.DataFrame(results)
results_df.to_pickle("crossval_complex_SVM.pkl")
results_df.to_csv("crossval_complex_SVM.csv")
results_df

Unnamed: 0,allele,best_oob_param,best_oob_score,best_cv_param,best_cv_scores
0,A0101,,,"{'kernel': 'rbf', 'degree': 7, 'C': 100}","[0.04372842412286093, 0.14227586252064262, 0.0..."
1,A0201,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.10504613665107998, 0.23645627785199086, 0.1..."
2,A0203,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.2986873952645648, 0.32485742439735543, 0.28..."
3,A0206,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.26094527069732487, 0.22073818103105436, 0.1..."
4,A0301,,,"{'kernel': 'rbf', 'degree': 7, 'C': 1}","[0.13437570099058238, 0.1429639914369082, 0.10..."
5,A1101,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.13405036394983838, 0.08513530118638868, 0.1..."
6,A2301,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.3010458886286296, 0.16086496625276214, 0.25..."
7,A2402,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.17106271038527573, 0.12961219660830292, 0.1..."
8,A2601,,,"{'kernel': 'linear', 'degree': 7, 'C': 100}","[0.1822133198183865, 0.14968277084343207, 0.10..."
9,A2902,,,"{'kernel': 'rbf', 'degree': 7, 'C': 100}","[0.05547365784286851, 0.050694238412803405, 0...."


### Training the models with best parameters on the full training dataset

In [14]:
def train_best_model(allele, params, exp_name):
    allele_train_dataset = train_dataset[train_dataset["allele"]==allele]
    (X_train, y_train, y_l, cv) = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best = SVR()
    regr_best.set_params(**params)
    regr = make_pipeline(StandardScaler(), regr_best)
    regr.fit(X_train, y_train)
    with open('./final_SVM_REGRmodels/'+allele+exp_name+'.pkl', 'wb') as fid:
        cPickle.dump(regr, fid)  

In [15]:
for allele in alleles:
    params = list(results_df[results_df["allele"]==allele]["best_cv_param"])[0]
    train_best_model(allele, params, "complex") 