# Tuning and training the models for standard-pHLA-score

The input is the set of per-peptide-position ref2015 energy features of each complex, already generated in ../Featurization/rosettaPPPEnergies.csv

We tune the model parameters using 5-fold cross-validation.

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import time
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import ParameterSampler, cross_val_score
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle
import statistics
from sklearn.linear_model import LinearRegression
from sklearn import linear_model

## Load the standard ref2015 features

### Full dataset

In [2]:
## 1 - load the energies
def pppene_to_array(tmp):
    """Parse the textual form of an energy table into a 9x20 float array.

    The text is cleaned in a fixed order: parentheses are dropped, the
    outermost square brackets are stripped, spaces are removed and every
    newline becomes a comma. The resulting comma-separated numbers are then
    parsed and reshaped to (9, 20).
    """
    without_parens = tmp.replace("(", "").replace(")", "")
    # strip() only touches the two ends of the string, i.e. the outer brackets.
    unwrapped = without_parens.strip("[]")
    comma_separated = unwrapped.replace(" ", "").replace("\n", ",")
    values = np.fromstring(comma_separated, dtype=float, sep=", ")
    return values.reshape(9, 20)

# Read the precomputed energies and keep only the columns used in this notebook.
ppp_columns = ["allele", "peptide", "binder", "ba", "energies", "total_energy"]
ppp_raw = pd.read_csv("../Featurization/rosettaPPPEnergies.csv")
ppp_ene = ppp_raw[ppp_columns]
# The "energies" cells are read as text; parse each one into a numeric array.
parsed_energies = ppp_ene["energies"].apply(pppene_to_array)
ppp_ene["energies"] = parsed_energies
ppp_ene

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy
0,A0101,YLEQLHQLY,1,0.574375,"[[-9.37669305, 4.67802037, 8.54111444, 10.3367...",112.623867
1,A0101,HSERHVLLY,1,0.574375,"[[-7.89190954, 0.93707113, 10.06233605, 2.7158...",91.902185
2,A0101,MTDPEMVEV,1,0.574375,"[[-8.25236275, 10.56587939, 7.5793572, 1.14710...",146.451590
3,A0101,LTDFIREEY,1,0.574375,"[[-8.43720197, 10.21830335, 7.113905, 36.51672...",138.735082
4,A0101,LLDQRPAWY,1,0.574375,"[[-8.18944861, 12.37002534, 6.69837214, 8.0238...",142.756344
...,...,...,...,...,...,...
77576,C1601,QQTTTSFQN,0,0.000000,"[[-8.23468271, 4.14167228, 9.40648541, 38.0082...",128.349704
77577,C1601,QQVEQMEIP,0,0.000000,"[[-9.20362039, 22.89414572, 10.40234849, 35.70...",159.872992
77578,C1601,QQWQVFSAE,0,0.000000,"[[-8.46025926, 3.82365938, 10.25920191, 61.414...",97.152888
77579,C1601,QRCVVLRFL,0,0.000000,"[[-7.11160825, 1.35151526, 9.62650699, 20.2230...",116.714004


### Map the full dataset to the training set

In [3]:
# Read the training split and keep only the columns needed downstream.
split_columns = ["allele", "peptide", "fileloc", "allele_type", "fold_num"]
train_set = pd.read_csv("../Datasets/train_set.csv")[split_columns]

# Inner-join the energies with the split on (allele, peptide): only rows
# present in both tables end up in the training set.
train_dataset = ppp_ene.merge(
    train_set,
    how="inner",
    on=["allele", "peptide"],
    suffixes=["", "_y"],
)
train_dataset

Unnamed: 0,allele,peptide,binder,ba,energies,total_energy,fileloc,allele_type,fold_num
0,A0101,YLEQLHQLY,1,0.574375,"[[-9.37669305, 4.67802037, 8.54111444, 10.3367...",112.623867,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,2.0
1,A0101,HSERHVLLY,1,0.574375,"[[-7.89190954, 0.93707113, 10.06233605, 2.7158...",91.902185,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
2,A0101,MTDPEMVEV,1,0.574375,"[[-8.25236275, 10.56587939, 7.5793572, 1.14710...",146.451590,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.0
3,A0101,LTDFIREEY,1,0.574375,"[[-8.43720197, 10.21830335, 7.113905, 36.51672...",138.735082,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
4,A0101,LLDQRPAWY,1,0.574375,"[[-8.18944861, 12.37002534, 6.69837214, 8.0238...",142.756344,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,1.0
...,...,...,...,...,...,...,...,...,...
69793,C1601,QQTTTSFQN,0,0.000000,"[[-8.23468271, 4.14167228, 9.40648541, 38.0082...",128.349704,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69794,C1601,QQVEQMEIP,0,0.000000,"[[-9.20362039, 22.89414572, 10.40234849, 35.70...",159.872992,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,4.0
69795,C1601,QQWQVFSAE,0,0.000000,"[[-8.46025926, 3.82365938, 10.25920191, 61.414...",97.152888,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,1.0
69796,C1601,QRCVVLRFL,0,0.000000,"[[-7.11160825, 1.35151526, 9.62650699, 20.2230...",116.714004,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,2.0


In [4]:
# extracting features in training format
# and get the cross-validation iterator
def get_energies(X):
    """Flatten a per-position energy table into a 171-element feature vector.

    Only the first 19 columns are kept. Rolling the rows by 4, cutting to 9
    rows and rolling back by 4 leaves a 9-row table unchanged; for a table
    with more than 9 rows it keeps the first 5 and the last 4 rows.
    """
    rotated = np.roll(X, 4, axis=0)
    window = rotated[:9, :19]
    restored = np.roll(window, -4, axis=0)
    return restored.reshape(9 * 19)

def extract_features_Xy_cv(merged_df, allele, feature_fn=None, n_folds=5):
    """Build the feature matrix, targets and CV splits for one allele.

    Parameters
    ----------
    merged_df : pandas.DataFrame
        Must provide the columns "allele", "energies", "ba", "binder" and
        "fold_num".
    allele
        Value matched against the "allele" column.
    feature_fn : callable, optional
        Maps one "energies" cell to a 1-D feature vector. When omitted, the
        module-level ``get_energies`` that is current at call time is used,
        so redefining ``get_energies`` in a later cell still takes effect.
    n_folds : int, optional
        Number of folds; fold ids 0 .. n_folds-1 are looked up in "fold_num".

    Returns
    -------
    tuple
        ``(X, y, y_l, cv_iter)``: ``X`` is always 2-D with one row per sample,
        ``y`` holds the "ba" values, ``y_l`` the "binder" values, and
        ``cv_iter`` is a list of ``(train_indices, test_indices)`` pairs, one
        per fold, indexing rows of ``X``.

    Raises
    ------
    ValueError
        If ``merged_df`` has no row for ``allele``.
    """
    if feature_fn is None:
        feature_fn = get_energies

    # Positional 0..n-1 index, so the fold indices below address rows of X.
    allele_data = merged_df[merged_df["allele"] == allele].reset_index(drop=True)
    if allele_data.empty:
        raise ValueError("no rows found for allele {!r}".format(allele))

    # Stack all feature vectors once; a single sample still gives shape (1, d).
    X = np.vstack([feature_fn(energies) for energies in allele_data["energies"]])

    # Regression targets and binary labels
    y = np.array(allele_data["ba"].tolist())
    y_l = np.array(allele_data["binder"].tolist())

    fold_ids = allele_data["fold_num"]
    cv_iter = []
    for split in range(n_folds):
        in_fold = fold_ids == split
        test_ind = allele_data.index[in_fold].tolist()
        # Rows whose fold id is missing never match a split and so are
        # always part of the training side.
        train_ind = allele_data.index[~in_fold].tolist()
        cv_iter.append((train_ind, test_ind))

    return (X, y, y_l, cv_iter)
    


In [6]:
def param_tune_allele(allele, train_dataset):
    """Pick the best linear model for one allele by cross-validation.

    Plain linear regression, Lasso and Ridge are evaluated (the latter two
    over a fixed list of ``alpha`` values) on the predefined folds returned by
    ``extract_features_Xy_cv``. ``cross_val_score`` is called without a
    ``scoring`` argument, so each estimator's default score is used. The
    candidate with the highest mean fold score wins; on ties the candidate
    evaluated first is kept.

    Parameters
    ----------
    allele
        Allele identifier matched against ``train_dataset["allele"]``.
    train_dataset : pandas.DataFrame
        Merged training table covering all alleles.

    Returns
    -------
    tuple
        ``(None, None, best_cv_params, best_cv_scores)``. The two leading
        ``None`` entries are placeholders for the out-of-bag results of the
        disabled random-forest search, so that callers can keep indexing a
        4-tuple. ``best_cv_params`` is ``{"type": ..., "alpha": ...}`` with
        ``alpha == "x"`` for plain linear regression.
    """
    allele_td = train_dataset[train_dataset["allele"] == allele]
    (X_train, y_train, _y_l, cv) = extract_features_Xy_cv(allele_td, allele)

    alpha_ridge = [1e-15, 1e-10, 1e-8, 1e-4, 1e-3, 1e-2, 1, 5, 10, 20]
    regr_types = ["lr", "lasso", "ridge"]

    # Start below every possible score: with a start value of 0, a data set
    # on which all candidates score <= 0 would return no parameters at all.
    best_cv_mscore = float("-inf")
    best_cv_scores = None
    best_cv_params = None
    for rt in regr_types:
        if rt == "lr":
            # "x" marks that alpha does not apply to plain linear regression.
            candidates = [("x", LinearRegression())]
        elif rt == "lasso":
            candidates = [(alpha, linear_model.Lasso(alpha=alpha)) for alpha in alpha_ridge]
        else:
            candidates = [(alpha, linear_model.Ridge(alpha=alpha)) for alpha in alpha_ridge]

        print("CV")
        #cross-validation
        for alpha, regr_cv in candidates:
            cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs=-1)
            cv_mscore = statistics.mean(cv_scores)
            print(cv_scores)
            print(cv_mscore)
            if cv_mscore > best_cv_mscore:
                best_cv_params = {"type": rt, "alpha": alpha}
                best_cv_scores = cv_scores
                best_cv_mscore = cv_mscore

    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (None, None, best_cv_params, best_cv_scores)
            
    
# Disabled random-forest parameter search, kept for reference as a bare string
# literal (it is never executed). It compares out-of-bag and cross-validated
# scores over sampled parameter sets and returns the same 4-tuple layout as
# param_tune_allele. NOTE(review): it references grid_params, X_train_s and
# y_train_s, none of which is defined in executable code in this notebook.
'''
    regr_oob = RandomForestRegressor(n_jobs=-1)
    regr_cv = RandomForestRegressor(n_jobs=-1)

    best_oob_score = 0 
    best_oob_params = None
    best_cv_mscore = 0
    best_cv_scores = None
    best_cv_params = None
    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):
        print("Parameter iteration: "+str(i))
        print("OOB")
        #out of bag
        print(g)
        regr_oob.set_params(**g)
        regr_oob.fit(X_train_s,y_train_s)
        print(regr_oob.oob_score_)
        if regr_oob.oob_score_ > best_oob_score:
            best_oob_params = g
            best_oob_score = regr_oob.oob_score_
        
        print("CV")
        #cross-validation
        regr_cv.set_params(**g)
        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)
        cv_mscore = statistics.mean(cv_scores)
        print(cv_scores)
        print(cv_mscore)
        if cv_mscore > best_cv_mscore:
            best_cv_params = g
            best_cv_scores = cv_scores
            best_cv_mscore = cv_mscore
            
    print("Best OOB "+str(best_oob_score))
    print(best_oob_params)
    print("Best CV "+str(best_cv_mscore))
    print(best_cv_params)
    return (best_oob_params, best_oob_score, best_cv_params, best_cv_scores)

'''

'\n    regr_oob = RandomForestRegressor(n_jobs=-1)\n    regr_cv = RandomForestRegressor(n_jobs=-1)\n\n    best_oob_score = 0 \n    best_oob_params = None\n    best_cv_mscore = 0\n    best_cv_scores = None\n    best_cv_params = None\n    for i, g in enumerate(ParameterSampler(grid_params, n_iter=100)):\n        print("Parameter iteration: "+str(i))\n        print("OOB")\n        #out of bag\n        print(g)\n        regr_oob.set_params(**g)\n        regr_oob.fit(X_train_s,y_train_s)\n        print(regr_oob.oob_score_)\n        if regr_oob.oob_score_ > best_oob_score:\n            best_oob_params = g\n            best_oob_score = regr_oob.oob_score_\n        \n        print("CV")\n        #cross-validation\n        regr_cv.set_params(**g)\n        cv_scores = cross_val_score(regr_cv, X_train, y_train, cv=cv, n_jobs = -1)\n        cv_mscore = statistics.mean(cv_scores)\n        print(cv_scores)\n        print(cv_mscore)\n        if cv_mscore > best_cv_mscore:\n            best_cv_params 

## Cross-validation

In [7]:
# Tune every allele found in the training set and collect the outcomes
# column-wise, ready to be turned into a DataFrame.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print("------------------------------------------------------------------------", "ALLELE", allele, sep="\n")
    oob_param, oob_score, cv_param, cv_scores = param_tune_allele(allele, train_dataset)
    results["allele"].append(allele)
    results["best_oob_param"].append(oob_param)
    results["best_oob_score"].append(oob_score)
    results["best_cv_param"].append(cv_param)
    results["best_cv_scores"].append(cv_scores)

------------------------------------------------------------------------
ALLELE
A0101
CV
[0.67896801 0.75721685 0.69464213 0.70719421 0.71021095]
0.7096464308167041
CV
[0.67897531 0.75722316 0.69584509 0.7071123  0.71205981]
0.7102431333852128
[0.67897531 0.75722317 0.69584509 0.7071123  0.71205981]
0.7102431352523949
[0.67897552 0.75722326 0.6958452  0.70711249 0.71206013]
0.7102433201059758
[0.68090699 0.75800381 0.69652675 0.70859449 0.7153534 ]
0.711877087014447
[0.68273347 0.75928947 0.69638203 0.71095457 0.72258917]
0.7143897406605515
[0.62937668 0.7038139  0.64290667 0.66821027 0.69217205]
0.6672959138736002
[0.03819065 0.038536   0.03519443 0.03889552 0.03853479]
0.03787027821161435
[-2.17247373e-05 -1.21794921e-05 -1.13312331e-05 -2.32501473e-05
 -7.65534303e-11]
-1.3697137269996772e-05
[-2.17247373e-05 -1.21794921e-05 -1.13312331e-05 -2.32501473e-05
 -7.65534303e-11]
-1.3697137269996772e-05
[-2.17247373e-05 -1.21794921e-05 -1.13312331e-05 -2.32501473e-05
 -7.65534303e-11]
-1.

[-2.86604381e-06 -2.97975289e-08 -2.18510956e-06 -2.98537151e-05
 -2.13009655e-05]
-1.1247126286528797e-05
CV
[0.65500897 0.68986839 0.64789239 0.65925206 0.68809546]
0.6680234547472431
[0.65501721 0.68989495 0.64789499 0.65925216 0.6881006 ]
0.6680319823392028
[0.65501723 0.68989496 0.647895   0.65925216 0.6881006 ]
0.668031991346992
[0.65501725 0.68989497 0.64789501 0.65925216 0.68810061]
0.668032000857541
[0.65501741 0.68989506 0.64789506 0.65925214 0.68810076]
0.6680320856249214
[0.65501905 0.68989598 0.64789553 0.65925193 0.68810219]
0.6680329330424777
[0.65519444 0.68999346 0.6479457  0.65922812 0.68825518]
0.6681233817728945
[0.65581992 0.690336   0.64811753 0.65912375 0.68880102]
0.6684396431272078
[0.65645709 0.69067056 0.64827727 0.65897745 0.68935306]
0.6687470869910832
[0.65741315 0.69112063 0.64847166 0.65864552 0.69015597]
0.6691613865494372
Best CV 0.6691613865494372
{'type': 'ridge', 'alpha': 20}
------------------------------------------------------------------------
A

CV
[0.63587956 0.66641684 0.04285446 0.65481281 0.40477054]
0.4809468420130642
CV
[0.63611056 0.66677317 0.2174565  0.65512317 0.60314639]
0.5557219548595838
[0.63611056 0.66677317 0.21745674 0.65512317 0.60314639]
0.5557220073944853
[0.63611103 0.66677344 0.21748092 0.65512351 0.60314714]
0.5557272083194894
[0.64053144 0.66899124 0.42578283 0.65823213 0.61004106]
0.6007157406732467
[0.65494038 0.67456047 0.67761489 0.67472012 0.63878068]
0.6641233074206115
[0.64496267 0.66508899 0.64942453 0.67919712 0.62228069]
0.6521907992364837
[0.00746609 0.00919834 0.00115095 0.00654196 0.00893152]
0.0066577710921471136
[-4.86511102e-05 -6.42029265e-06 -5.84190588e-05 -7.72360607e-05
 -7.42860532e-06]
-3.9631025532171904e-05
[-4.86511102e-05 -6.42029265e-06 -5.84190588e-05 -7.72360607e-05
 -7.42860532e-06]
-3.9631025532171904e-05
[-4.86511102e-05 -6.42029265e-06 -5.84190588e-05 -7.72360607e-05
 -7.42860532e-06]
-3.9631025532171904e-05
CV
[0.6361331  0.66677422 0.21937282 0.65482651 0.6031327 ]
0.

[-8.88187141e-07 -5.51323523e-06 -1.94385776e-05 -9.20856896e-06
 -2.26544823e-05]
-1.1540610245930338e-05
CV
[0.66004219 0.6471382  0.67748037 0.68651294 0.64113238]
0.6624612152698539
[0.66036397 0.6471619  0.67750486 0.68664082 0.64112513]
0.6625593346900283
[0.66036407 0.64716206 0.67750486 0.68664085 0.64112511]
0.6625593913106189
[0.66036407 0.64716208 0.67750489 0.68664086 0.64112517]
0.6625594150796602
[0.66036409 0.64716226 0.67750519 0.68664091 0.64112567]
0.6625596238955581
[0.66036425 0.64716397 0.67750818 0.68664149 0.64113065]
0.6625617087876179
[0.66038512 0.64734313 0.677762   0.68671134 0.64161158]
0.6627626354164285
[0.66048449 0.64792047 0.67820701 0.68699792 0.64292842]
0.6633076622998663
[0.66057211 0.64841894 0.67842472 0.68725965 0.64402427]
0.663739938156371
[0.66060581 0.64897179 0.67856831 0.68750713 0.64543891]
0.6642183886659769
Best CV 0.6643739390050746
{'type': 'lasso', 'alpha': 0.0001}
---------------------------------------------------------------------

[0.70284576 0.66549267 0.75777879 0.7070203  0.676312  ]
0.7018899037593365
Best CV 0.7030171918859885
{'type': 'lasso', 'alpha': 0.001}
------------------------------------------------------------------------
ALLELE
B3501
CV
[0.55577939 0.56101798 0.54515801 0.54369642 0.60886594]
0.562903548681282
CV
[0.5575596  0.56082814 0.56605169 0.55012422 0.60909648]
0.5687320268279228
[0.5575596  0.56082815 0.56605169 0.55012422 0.60909649]
0.568732029363062
[0.55755986 0.56082842 0.56605194 0.55012455 0.60909663]
0.5687322803414744
[0.56028573 0.56335211 0.5683596  0.55302579 0.61009932]
0.5710245103177328
[0.56479534 0.56839168 0.57604113 0.55796197 0.6091292 ]
0.575263864284238
[0.53338511 0.5015804  0.53477981 0.53031726 0.5290721 ]
0.5258269365102495
[0.01741705 0.02131662 0.02192632 0.02228511 0.02165948]
0.020920916706377746
[-7.34163770e-05 -2.65412669e-07 -1.86041514e-07 -2.44491787e-05
 -5.22307362e-05]
-3.0109549203727325e-05
[-7.34163770e-05 -2.65412669e-07 -1.86041514e-07 -2.44491

[-3.87526576e-06 -5.72476295e-05 -3.61255067e-05 -6.06932041e-05
 -1.67329936e-05]
-3.493491994253084e-05
[-3.87526576e-06 -5.72476295e-05 -3.61255067e-05 -6.06932041e-05
 -1.67329936e-05]
-3.493491994253084e-05
CV
[0.68168241 0.73554669 0.69908733 0.66223967 0.61920342]
0.6795519042457909
[0.68169474 0.73554729 0.6991165  0.66224954 0.6194413 ]
0.6796098751747482
[0.68169485 0.7355473  0.69911651 0.66224963 0.61944134]
0.6796099266605667
[0.6816949  0.73554731 0.69911655 0.66224968 0.61944139]
0.679609967418998
[0.68169528 0.73554741 0.69911696 0.66225016 0.61944184]
0.6796103295637183
[0.68169906 0.73554838 0.69912101 0.66225492 0.61944637]
0.6796139484431571
[0.68209919 0.73563711 0.69953769 0.66274299 0.61991104]
0.6799856049316909
[0.68345221 0.73575049 0.70079857 0.66419531 0.62131291]
0.6811018979752537
[0.6847359  0.73562578 0.70183332 0.66531982 0.62245927]
0.6819948174765247
[0.68650795 0.73510046 0.70311885 0.66648776 0.62385032]
0.6830130660553426
Best CV 0.6872927796050039

CV
[0.60110089 0.71088006 0.62009443 0.64994695 0.63975895]
0.6443562557449636
CV
[0.60024191 0.71113726 0.61820365 0.64884026 0.63560855]
0.6428063259455845
[0.60024191 0.71113726 0.61820366 0.64884026 0.63560855]
0.6428063278210466
[0.60024206 0.71113749 0.61820382 0.64884051 0.63560868]
0.642806513490707
[0.60166649 0.71301883 0.61937926 0.65057685 0.63669608]
0.6442674996650007
[0.60685675 0.71168205 0.61661766 0.6443764  0.6312875 ]
0.6421640729026173
[0.59046665 0.67761321 0.58168416 0.59163769 0.5701786 ]
0.6023160627009875
[0.10291938 0.09825725 0.01671181 0.09733549 0.09224818]
0.08149442169336259
[-1.63665831e-06 -1.63665831e-06 -1.63665831e-06 -1.63665831e-06
 -1.63665831e-06]
-1.6366583093230247e-06
[-1.63665831e-06 -1.63665831e-06 -1.63665831e-06 -1.63665831e-06
 -1.63665831e-06]
-1.6366583093230247e-06
[-1.63665831e-06 -1.63665831e-06 -1.63665831e-06 -1.63665831e-06
 -1.63665831e-06]
-1.6366583093230247e-06
CV
[0.60105694 0.71091791 0.61823208 0.64988391 0.63674822]
0.643

## Train the best model for each allele

In [12]:
def train_best_model(allele, params, exp_name):
    """Fit the selected linear model on all training rows of one allele and pickle it.

    Parameters
    ----------
    allele : str
        Allele identifier; matched against the global ``train_dataset`` and
        used as the first part of the output file name.
    params : dict
        ``{"type": "lr" | "lasso" | "ridge", "alpha": ...}``; ``alpha`` is
        only read for "lasso" and "ridge".
    exp_name : str
        Suffix appended to the allele in the output file name.

    Raises
    ------
    ValueError
        If ``params["type"]`` is not one of the supported model types.

    The fitted estimator is written to
    ``./final_LR_REGRmodels/<allele><exp_name>.pkl``.
    """
    # Resolve the estimator first so that a bad model type fails before the
    # feature extraction rather than with an unbound variable afterwards.
    rt = params["type"]
    if rt == "lr":
        regr_best = linear_model.LinearRegression()
    elif rt == "lasso":
        regr_best = linear_model.Lasso(alpha=params["alpha"])
    elif rt == "ridge":
        regr_best = linear_model.Ridge(alpha=params["alpha"])
    else:
        raise ValueError("unsupported model type {!r}".format(rt))

    allele_train_dataset = train_dataset[train_dataset["allele"] == allele]
    # Labels and CV splits are not needed for the final fit.
    (X_train, y_train, _y_l, _cv) = extract_features_Xy_cv(allele_train_dataset, allele)
    regr_best.fit(X_train, y_train)
    with open('./final_LR_REGRmodels/'+allele+exp_name+'.pkl', 'wb') as fid:
        cPickle.dump(regr_best, fid)

In [13]:
# Tabulate the tuning results, then fit and store one final model per allele
# using that allele's best cross-validated parameters.
results_df = pd.DataFrame(results)
for allele in alleles:
    is_allele = results_df["allele"] == allele
    params = results_df.loc[is_allele, "best_cv_param"].iloc[0]
    train_best_model(allele, params, "ppp")

  positive)


In [14]:
# Show the per-allele tuning summary built from `results`.
results_df

Unnamed: 0,allele,best_oob_param,best_oob_score,best_cv_param,best_cv_scores
0,A0101,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6827334650190289, 0.7592894698695327, 0.696..."
1,A0201,,,"{'type': 'lasso', 'alpha': 0.0001}","[0.6700070697873949, 0.6491007658469163, 0.633..."
2,A0203,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6509631426811306, 0.7014169903157872, 0.655..."
3,A0206,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6550589700758578, 0.6657211548538113, 0.656..."
4,A0301,,,"{'type': 'ridge', 'alpha': 20}","[0.6574131529020285, 0.6911206273258512, 0.648..."
5,A1101,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6811001502281855, 0.7778283052433044, 0.789..."
6,A2301,,,"{'type': 'ridge', 'alpha': 20}","[0.6988124604347506, 0.6803118704857174, 0.668..."
7,A2402,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6180861532553013, 0.5878024449316734, 0.528..."
8,A2601,,,"{'type': 'lasso', 'alpha': 0.001}","[0.616145208949421, 0.6319712290395507, 0.5558..."
9,A2902,,,"{'type': 'lasso', 'alpha': 0.001}","[0.6549403765745785, 0.6745604711896129, 0.677..."


## --------------------------------------------------- ##
## The middle/anchor position experiments are done for RF only 

## Middle position

In [13]:
# extracting features in training format
# and get the cross-validation iterator
def get_energies(X):
    """Flatten rows 3-6 of the energy table into a 76-element feature vector.

    Rolling the rows up by 3 brings row 3 to the front, so the first four
    rows of the rolled table are rows 3, 4, 5 and 6 of the input (for inputs
    with at least 7 rows). Only the first 19 columns are kept.
    """
    shifted = np.roll(X, -3, axis=0)
    middle = shifted[:4, :19]
    return middle.reshape(4 * 19)

In [None]:
# Tune every allele found in the training set and collect the outcomes
# column-wise, ready to be turned into a DataFrame.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print("------------------------------------------------------------------------", "ALLELE", allele, sep="\n")
    oob_param, oob_score, cv_param, cv_scores = param_tune_allele(allele, train_dataset)
    results["allele"].append(allele)
    results["best_oob_param"].append(oob_param)
    results["best_oob_score"].append(oob_score)
    results["best_cv_param"].append(cv_param)
    results["best_cv_scores"].append(cv_scores)

In [None]:
def train_best_model(allele, params, exp_name):
    """Fit a random forest with the given parameters on one allele and pickle it.

    ``params`` is applied to a ``RandomForestRegressor(n_jobs=-1)`` through
    ``set_params``. The model is trained on every row of the global
    ``train_dataset`` belonging to ``allele`` and written to
    ``./final_REGRmodels/<allele><exp_name>.pkl``.
    """
    is_target_allele = train_dataset["allele"] == allele
    features, targets, _labels, _folds = extract_features_Xy_cv(
        train_dataset[is_target_allele], allele
    )

    forest = RandomForestRegressor(n_jobs=-1)
    forest.set_params(**params)
    forest.fit(features, targets)

    model_path = './final_REGRmodels/' + allele + exp_name + '.pkl'
    with open(model_path, 'wb') as fid:
        cPickle.dump(forest, fid)

In [None]:
# Rebuild the summary from the `results` collected for this experiment;
# otherwise `results_df` would still hold the parameters tuned in an earlier
# section of the notebook.
results_df = pd.DataFrame(results)
for allele in alleles:
    params = list(results_df[results_df["allele"]==allele]["best_cv_param"])[0]
    train_best_model(allele, params, "ppp-middle") 

## Anchor positions

In [14]:
# extracting features in training format
# and get the cross-validation iterator
def get_energies(X):
    """Flatten the first 3 and last 2 rows of the energy table into 95 features.

    Rolling the rows by 2, cutting to 5 rows and rolling back by 2 selects
    rows 0, 1, 2 followed by the last two rows of the input. Only the first
    19 columns are kept.
    """
    rotated = np.roll(X, 2, axis=0)
    window = rotated[:5, :19]
    restored = np.roll(window, -2, axis=0)
    return restored.reshape(5 * 19)

In [None]:
# Tune every allele found in the training set and collect the outcomes
# column-wise, ready to be turned into a DataFrame.
alleles = train_dataset["allele"].unique()
results = {
    "allele": [],
    "best_oob_param": [],
    "best_oob_score": [],
    "best_cv_param": [],
    "best_cv_scores": [],
}

for allele in alleles:
    print("------------------------------------------------------------------------", "ALLELE", allele, sep="\n")
    oob_param, oob_score, cv_param, cv_scores = param_tune_allele(allele, train_dataset)
    results["allele"].append(allele)
    results["best_oob_param"].append(oob_param)
    results["best_oob_score"].append(oob_score)
    results["best_cv_param"].append(cv_param)
    results["best_cv_scores"].append(cv_scores)

In [None]:
def train_best_model(allele, params, exp_name):
    """Fit a random forest with the given parameters on one allele and pickle it.

    ``params`` is applied to a ``RandomForestRegressor(n_jobs=-1)`` through
    ``set_params``. The model is trained on every row of the global
    ``train_dataset`` belonging to ``allele`` and written to
    ``./final_REGRmodels/<allele><exp_name>.pkl``.
    """
    is_target_allele = train_dataset["allele"] == allele
    features, targets, _labels, _folds = extract_features_Xy_cv(
        train_dataset[is_target_allele], allele
    )

    forest = RandomForestRegressor(n_jobs=-1)
    forest.set_params(**params)
    forest.fit(features, targets)

    model_path = './final_REGRmodels/' + allele + exp_name + '.pkl'
    with open(model_path, 'wb') as fid:
        cPickle.dump(forest, fid)

In [None]:
# Rebuild the summary from the `results` collected for this experiment;
# otherwise `results_df` would still hold the parameters tuned in an earlier
# section of the notebook.
results_df = pd.DataFrame(results)
for allele in alleles:
    params = list(results_df[results_df["allele"]==allele]["best_cv_param"])[0]
    train_best_model(allele, params, "ppp-anchor") 