## Results - section 1

Comparing the performance of:

1. ref2015-score
2. standard-pHLA-score
3. 3pHLA-score

Metrics: R^2 (coefficient of determination), Pearson's correlation, Spearman's correlation

Sets: test set of dataset 1

In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
import time
from sklearn.model_selection import GridSearchCV
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from IPython.display import display
from scipy import stats
import _pickle as cPickle

## Load the test set

In [2]:
# Load the test set and keep only the identifier, label and metadata columns.
test_set_columns = ["allele", "peptide", "fileloc", "allele_type", "ba", "binder"]
test_set = pd.read_csv("../Datasets/test_set.csv").loc[:, test_set_columns]
test_set

#loading the energies
def ene_to_array(ene_str):
    """Parse a serialized energy vector into a 1-D float array.

    Parameters
    ----------
    ene_str : str
        Comma-separated numbers, optionally wrapped in square brackets
        and/or parentheses.

    Returns
    -------
    numpy.ndarray
        Float array holding the first 20 parsed values (``count=20``).
    """
    # Peel the wrappers in the same order as before: brackets first, then
    # parentheses. The raw string strips the same characters as the former
    # literal (backslash, "(" and ")") without relying on invalid escape
    # sequences, which newer Python versions warn about.
    ene_str = ene_str.strip("[]").strip(r"\()")
    return np.fromstring(ene_str, dtype=float, count=20, sep=", ")

# Whole-complex Rosetta energies: keep the columns of interest, parse the
# serialized "energies" strings with ene_to_array, and tag the energy columns
# with a "_complex" suffix.
complex_ene = (
    pd.read_csv("../Featurization/rosettaComplexEnergies.csv")
    .loc[:, ["allele", "peptide", "binder", "ba", "energies", "total_energy"]]
    .assign(energies=lambda frame: frame["energies"].apply(ene_to_array))
    .rename(columns={"energies": "energies_complex",
                     "total_energy": "total_energy_complex"})
)
complex_ene

def pppene_to_array(tmp):
    """Parse a serialized per-position energy table into a (9, 20) float array.

    Parentheses are removed, leading/trailing square brackets are stripped,
    spaces are dropped and newlines are turned into commas before the numbers
    are parsed and reshaped to 9 rows of 20 values.
    """
    without_parens = tmp.translate({ord("("): None, ord(")"): None})
    trimmed = without_parens.strip("[]")
    # Single pass: delete spaces, turn line breaks into separators.
    flat_text = trimmed.translate({ord(" "): None, ord("\n"): ","})
    values = np.fromstring(flat_text, dtype=float, sep=", ")
    return values.reshape(9, 20)

# Per-peptide-position Rosetta energies: keep the columns of interest, parse
# the serialized "energies" strings with pppene_to_array (9 x 20 arrays), and
# tag the energy columns with a "_ppp" suffix.
ppp_ene = (
    pd.read_csv("../Featurization/rosettaPPPEnergies.csv")
    .loc[:, ["allele", "peptide", "binder", "ba", "energies", "total_energy"]]
    .assign(energies=lambda frame: frame["energies"].apply(pppene_to_array))
    .rename(columns={"energies": "energies_ppp",
                     "total_energy": "total_energy_ppp"})
)
ppp_ene

# Merge to form the test dataset: join both energy tables onto the test-set
# rows, keyed on (allele, peptide). Inner joins keep only pairs present in all
# three tables. "binder" and "ba" exist in both complex_ene and test_set, so
# the first merge emits them with pandas' default _x / _y suffixes.
merge_keys = ["allele", "peptide"]
test_dataset = complex_ene.merge(test_set, on=merge_keys, how="inner")
test_dataset = ppp_ene.merge(test_dataset, on=merge_keys, how="inner")

test_dataset

Unnamed: 0,allele,peptide,binder,ba,energies_ppp,total_energy_ppp,binder_x,ba_x,energies_complex,total_energy_complex,fileloc,allele_type,ba_y,binder_y
0,A0101,WTDINVVVY,1,0.574375,"[[-11.08121476, 14.16692054, 8.63528721, 3.751...",88.191038,1,0.574375,"[-2285.10015517, 757.09563062, 1452.3429686, 3...",44.801386,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.574375,1
1,A0101,ASEDSVLLY,1,0.574375,"[[-5.36496986, 2.12380102, 6.05756007, 0.89668...",78.146488,1,0.574375,"[-2277.61907956, 728.92311147, 1459.25163947, ...",-20.506385,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.574375,1
2,A0101,VLDVVYLVY,1,0.574375,"[[-6.66330027, 10.27173546, 5.88904802, 11.922...",102.722720,1,0.574375,"[-2286.82047111, 784.0399194, 1443.39102736, 3...",1.316455,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.574375,1
3,A0101,SADPGNLKY,1,0.574375,"[[-5.39995792, 9.94366351, 8.01456455, 0.50256...",149.323707,1,0.574375,"[-2284.54072463, 776.33280538, 1475.99470576, ...",201.584772,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.574375,1
4,A0101,YADPGVSFY,1,0.574375,"[[-9.44778283, 2.41249371, 8.12634022, 7.86302...",106.151854,1,0.574375,"[-2275.77705267, 742.96904558, 1445.47514472, ...",20.430956,/home/anja/Documents/jayvee_data/singleconf/al...,HLA-A,0.574375,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7778,C1601,QLIVFGEQL,0,0.000000,"[[-6.53195111, 3.96266327, 7.63746576, 23.7161...",123.306026,0,0.000000,"[-2299.39948485, 2387.4589554, 1459.58978704, ...",1436.625712,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,0.000000,0
7779,C1601,QLSPLKGLS,0,0.000000,"[[-6.03841067, 2.68022159, 7.35152561, 31.6184...",100.506258,0,0.000000,"[-2287.65325545, 2429.69920933, 1474.58026015,...",1436.618356,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,0.000000,0
7780,C1601,QMKERLQQI,0,0.000000,"[[-8.6387949, 33.96029077, 10.17907233, 27.298...",174.672763,0,0.000000,"[-2319.92158398, 2520.69114898, 1493.9033635, ...",1477.402787,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,0.000000,0
7781,C1601,QPGHLCCVR,0,0.000000,"[[-8.49143909, 113.75451439, 9.01264723, 48.00...",327.946605,0,0.000000,"[-2295.53582639, 2583.51601822, 1468.09552272,...",1665.489060,/home/anja/Documents/COMP590P/C_decoys/confs/C...,HLA-C,0.000000,0


## Get ref2015-score

In [3]:
# Experiment 1 folder (trailing slash included); the ref2015 results and the
# per-allele model pickles are read relative to it.
root_dir = "../Experiment1 - train ref2015-score, standard-pHLA-score and 3pHLA-score/"
ref2015_results_file = "ref2015-score_testset_results.csv"
ref2015_score = pd.read_csv(root_dir + ref2015_results_file)

In [4]:
# Keep the identifier/label columns plus the ref2015 score itself.
ref2015_columns = ["allele", "peptide", "ba", "binder", "path", "allele_type", "ref2015-score"]
ref2015_score = ref2015_score.loc[:, ref2015_columns]
ref2015_score

Unnamed: 0,allele,peptide,ba,binder,path,allele_type,ref2015-score
0,A0101,WTDINVVVY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-WT...,HLA-A,-22.184743
1,A0101,ASEDSVLLY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-AS...,HLA-A,-37.244441
2,A0101,VLDVVYLVY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-VL...,HLA-A,-34.866134
3,A0101,SADPGNLKY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-SA...,HLA-A,-6.404157
4,A0101,YADPGVSFY,0.574375,1,../../jayvee_data/singleconf/all_data/A0101-YA...,HLA-A,-23.324601
...,...,...,...,...,...,...,...
7778,C1601,QLIVFGEQL,0.000000,0,../../COMP590P/C_decoys/confs/C1601-QLIVFGEQL/...,HLA-C,-9.237634
7779,C1601,QLSPLKGLS,0.000000,0,../../COMP590P/C_decoys/confs/C1601-QLSPLKGLS/...,HLA-C,-13.274659
7780,C1601,QMKERLQQI,0.000000,0,../../COMP590P/C_decoys/confs/C1601-QMKERLQQI/...,HLA-C,28.000787
7781,C1601,QPGHLCCVR,0.000000,0,../../COMP590P/C_decoys/confs/C1601-QPGHLCCVR/...,HLA-C,3.222600


In [5]:
# The score column is attached row by row, which is only correct when both
# frames list the same (allele, peptide) pairs in the same order. Verify that
# explicitly instead of silently attaching scores to the wrong complexes.
_pair_cols = ["allele", "peptide"]
if len(ref2015_score) != len(test_dataset) or not (
    ref2015_score[_pair_cols].values == test_dataset[_pair_cols].values
).all():
    raise ValueError("ref2015_score rows do not line up with test_dataset rows")
# Positional assignment, consistent with the positional check above.
test_dataset["ref2015-score"] = ref2015_score["ref2015-score"].values

## Predictions standard-pHLA-score

In [11]:
import pickle as pk


def score_elem_std(row, dir_name):
    """Score one row with the per-allele "complex" regression model.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "allele" and "energies_complex".
    dir_name : str
        Sub-directory of ``root_dir`` that contains ``<allele>complex.pkl``.

    Returns
    -------
    The first element of ``model.predict`` for this row's feature vector.

    Raises
    ------
    OSError
        If the model file cannot be opened.
    """
    allele = row["allele"]
    # The model is fed the energy vector without its last entry.
    feat = [row["energies_complex"][:-1]]

    model_name = root_dir+"/"+dir_name+"/"+allele+"complex.pkl"

    # NOTE(review): unpickling can execute arbitrary code -- only load model
    # files produced by this project. The model is re-read from disk on every
    # call; the context manager at least guarantees the handle is closed.
    with open(model_name, 'rb') as model_file:
        model = pk.load(model_file)
    return model.predict(feat)[0]

def reformat_path(path):
    """Rewrite a path under "/home/anja/Documents/" as a "../../"-relative one.

    Parameters
    ----------
    path : str
        File location as stored in the dataset.

    Returns
    -------
    str
        ``"../../" + <remainder>`` when ``path`` starts with the absolute
        prefix; otherwise ``path`` unchanged (so already-converted paths pass
        through untouched).
    """
    prefix = "/home/anja/Documents/"
    new = "../../"
    # Only a *leading* prefix may be cut off: slicing by len(prefix) would
    # mangle a path that merely contains the prefix somewhere in the middle.
    if not path.startswith(prefix):
        return path
    return new + path[len(prefix):]

In [7]:
# Rewrite each stored file location with reformat_path (absolute -> "../../"-relative).
test_dataset["fileloc"] = test_dataset["fileloc"].map(reformat_path)

In [51]:
# Model directory used for the predictions below (and in the output file name).
dir_name = "final_LRk_REGRmodels"

# Row-wise prediction; dir_name is forwarded as the second positional argument.
test_dataset["standard-3pHLA-score"] = test_dataset.apply(
    score_elem_std, axis=1, args=(dir_name,)
)

TypeError: score_elem() takes 1 positional argument but 2 were given

## Predictions 3pHLA-score

In [12]:
import pickle as pk

def get_energies(X):
    """Flatten the per-position energy table into a 9*19 feature vector.

    The rows are rotated by 4, the first 9 rows and first 19 columns are kept,
    and the rotation is undone. For a 9-row input this keeps every row; for a
    longer input it keeps the first five and the last four rows.
    """
    n_rows, n_terms = 9, 19
    rotated = np.roll(X, shift=4, axis=0)
    window = rotated[0:n_rows, 0:n_terms]
    restored = np.roll(window, shift=-4, axis=0)
    return restored.reshape(n_rows * n_terms)

def get_energies_mid(X):
    """Flatten four rows of the energy table into a 4*19 feature vector.

    The rows are rotated by -3 and the first 4 rows / first 19 columns are
    kept, i.e. rows 3..6 of a 9-row input.
    """
    n_rows, n_terms = 4, 19
    rotated = np.roll(X, shift=-3, axis=0)
    middle = rotated[0:n_rows, 0:n_terms]
    return middle.reshape(n_rows * n_terms)

def get_energies_anch(X):
    """Flatten five rows of the energy table into a 5*19 feature vector.

    The rows are rotated by 2, the first 5 rows / first 19 columns are kept,
    and the rotation is undone, which selects the first three and the last two
    rows of the input.
    """
    n_rows, n_terms = 5, 19
    rotated = np.roll(X, shift=2, axis=0)
    window = rotated[0:n_rows, 0:n_terms]
    restored = np.roll(window, shift=-2, axis=0)
    return restored.reshape(n_rows * n_terms)

def score_elem_3p(row, dir_name):
    """Score one row with the per-allele per-peptide-position ("ppp") model.

    Parameters
    ----------
    row : mapping (e.g. a DataFrame row)
        Must provide "allele" and "energies_ppp".
    dir_name : str
        Sub-directory of ``root_dir`` that contains ``<allele>ppp.pkl``.

    Returns
    -------
    tuple
        ``(prediction, None, None)``. The two ``None`` slots are placeholders
        for the anchor-only and middle-only models
        (``<allele>ppp-anchor.pkl`` / ``<allele>ppp-middle.pkl``, fed by
        ``get_energies_anch`` / ``get_energies_mid``), which are currently
        disabled; the 3-tuple shape is kept so downstream code is unaffected.

    Raises
    ------
    OSError
        If the model file cannot be opened.
    """
    allele = row["allele"]
    feat = row["energies_ppp"]

    model_9n = root_dir+"/"+dir_name+"/"+allele+"ppp.pkl"

    # NOTE(review): unpickling can execute arbitrary code -- only load model
    # files produced by this project. The context manager guarantees the file
    # handle is closed after loading.
    with open(model_9n, 'rb') as model_file:
        model_9 = pk.load(model_file)
    pred1 = model_9.predict([get_energies(feat)])[0]
    return (pred1, None, None)

In [None]:
# Row-wise 3pHLA prediction; each cell holds the tuple returned by score_elem_3p.
test_dataset["3pHLA-score"] = test_dataset.apply(
    score_elem_3p, axis=1, args=(dir_name,)
)

In [49]:
# Show the test set with the ref2015, standard and 3pHLA score columns added above.
test_dataset

Unnamed: 0,allele,peptide,binder,ba,energies_ppp,total_energy_ppp,binder_x,ba_x,energies_complex,total_energy_complex,fileloc,allele_type,ba_y,binder_y,ref2015-score,standard-3pHLA-score,3pHLA-score
0,A0101,WTDINVVVY,1,0.574375,"[[-11.08121476, 14.16692054, 8.63528721, 3.751...",88.191038,1,0.574375,"[-2285.10015517, 757.09563062, 1452.3429686, 3...",44.801386,../../jayvee_data/singleconf/all_data/A0101-WT...,HLA-A,0.574375,1,-22.184743,0.251734,"(0.5597280623677534, None, None)"
1,A0101,ASEDSVLLY,1,0.574375,"[[-5.36496986, 2.12380102, 6.05756007, 0.89668...",78.146488,1,0.574375,"[-2277.61907956, 728.92311147, 1459.25163947, ...",-20.506385,../../jayvee_data/singleconf/all_data/A0101-AS...,HLA-A,0.574375,1,-37.244441,0.516161,"(0.5880058018492491, None, None)"
2,A0101,VLDVVYLVY,1,0.574375,"[[-6.66330027, 10.27173546, 5.88904802, 11.922...",102.722720,1,0.574375,"[-2286.82047111, 784.0399194, 1443.39102736, 3...",1.316455,../../jayvee_data/singleconf/all_data/A0101-VL...,HLA-A,0.574375,1,-34.866134,0.336228,"(0.5276514859474446, None, None)"
3,A0101,SADPGNLKY,1,0.574375,"[[-5.39995792, 9.94366351, 8.01456455, 0.50256...",149.323707,1,0.574375,"[-2284.54072463, 776.33280538, 1475.99470576, ...",201.584772,../../jayvee_data/singleconf/all_data/A0101-SA...,HLA-A,0.574375,1,-6.404157,0.437956,"(0.5698939301452128, None, None)"
4,A0101,YADPGVSFY,1,0.574375,"[[-9.44778283, 2.41249371, 8.12634022, 7.86302...",106.151854,1,0.574375,"[-2275.77705267, 742.96904558, 1445.47514472, ...",20.430956,../../jayvee_data/singleconf/all_data/A0101-YA...,HLA-A,0.574375,1,-23.324601,0.306986,"(0.5322876663777834, None, None)"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7778,C1601,QLIVFGEQL,0,0.000000,"[[-6.53195111, 3.96266327, 7.63746576, 23.7161...",123.306026,0,0.000000,"[-2299.39948485, 2387.4589554, 1459.58978704, ...",1436.625712,../../COMP590P/C_decoys/confs/C1601-QLIVFGEQL/...,HLA-C,0.000000,0,-9.237634,0.265939,"(-0.011882285502338163, None, None)"
7779,C1601,QLSPLKGLS,0,0.000000,"[[-6.03841067, 2.68022159, 7.35152561, 31.6184...",100.506258,0,0.000000,"[-2287.65325545, 2429.69920933, 1474.58026015,...",1436.618356,../../COMP590P/C_decoys/confs/C1601-QLSPLKGLS/...,HLA-C,0.000000,0,-13.274659,0.234074,"(-0.08078759050374629, None, None)"
7780,C1601,QMKERLQQI,0,0.000000,"[[-8.6387949, 33.96029077, 10.17907233, 27.298...",174.672763,0,0.000000,"[-2319.92158398, 2520.69114898, 1493.9033635, ...",1477.402787,../../COMP590P/C_decoys/confs/C1601-QMKERLQQI/...,HLA-C,0.000000,0,28.000787,0.487277,"(0.4570463829438154, None, None)"
7781,C1601,QPGHLCCVR,0,0.000000,"[[-8.49143909, 113.75451439, 9.01264723, 48.00...",327.946605,0,0.000000,"[-2295.53582639, 2583.51601822, 1468.09552272,...",1665.489060,../../COMP590P/C_decoys/confs/C1601-QPGHLCCVR/...,HLA-C,0.000000,0,3.222600,0.169281,"(0.017978546448830457, None, None)"


In [50]:
# One results file per model directory: dir_name is embedded in the file name.
results_csv = "Results1.1_final_data_{}.csv".format(dir_name)
test_dataset.to_csv(results_csv)

#test_dataset.to_csv("Results1.1_final_data_MLP.csv")
#test_dataset.to_csv("Results1.1_final_data.csv")

## Other models: LRk, LR, SVM, PLS, RF

In [13]:
# Model families and, at the same position, the directory holding their
# per-allele pickles.
methods = ["LRk", "LR", "SVM", "PLS", "RF"]
dir_names = [
    "final_LRk_REGRmodels",
    "final_LR_REGRmodels",
    "final_SVM_REGRmodels",
    "final_PLS_REGRmodels",
    "final_REGRmodels",
]

In [15]:

# Score the test set with every model family; each pass adds one standard
# ("std-score-<method>") and one 3p ("3p-score-<method>") column.
for method, dir_name in zip(methods, dir_names):
    print(method)
    test_dataset["std-score-"+method] = test_dataset.apply(
        score_elem_std, axis=1, args=(dir_name,)
    )
    test_dataset["3p-score-"+method] = test_dataset.apply(
        score_elem_3p, axis=1, args=(dir_name,)
    )

LRk
LR
SVM
PLS
RF


In [21]:
#reformat PLS results
test_dataset["std-score-PLS"] = test_dataset["std-score-PLS"].apply(lambda x: x[0]) 
test_dataset["3p-score-PLS"] =  test_dataset["3p-score-PLS"].apply(lambda x: (x[0][0], x[1], x[2])) 

In [27]:
# Persist the test set together with the predictions of every model family.
all_models_csv = "Results1.1_final_data_all_models.csv"
test_dataset.to_csv(all_models_csv)