In [None]:
from pan_allele_data_helpers import *
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score
# Binder threshold on the log-transformed scale y = 1 - log(ic50) / log(5000),
# evaluated at an IC50 of 500 (presumably nM -- TODO confirm units).
log_transformed_ic50_cutoff = 1.0 - np.log(500.0) / np.log(5000.0)
from convolution_model import convolution_graph, convolution_graph_reshape
from sequence_encoding import padded_indices
from amino_acid import amino_acid_letter_indices

In [None]:
def normalize_allele_name(allele_name):
    """Return a compact, upper-case form of an HLA allele name.

    The name is upper-cased, the legacy "CW" spelling is folded into "C",
    the "HLA-" prefix is removed, and every remaining "-", "*" and ":" is
    dropped. For example "HLA-Cw*03:04" becomes "C0304".

    Parameters
    ----------
    allele_name : str
        Allele name in any of the usual spellings.

    Returns
    -------
    str
        The name with prefix and separator characters removed.
    """
    # Old-school HLA-C serotypes are written "Cw"; treat them as plain "C".
    compact = allele_name.upper().replace("CW", "C")
    # Strip the locus prefix before the separators so its hyphen goes with it.
    compact = compact.replace("HLA-", "")
    return "".join(ch for ch in compact if ch not in "-*:")
# Read the binding-data file and the allele sequence FASTA via the project helpers.
# NOTE(review): `df` is not used anywhere in the visible cells.
allele_groups, df = load_binding_data(
    'files/bdata.2009.mhci.public.1.txt'
)
allele_sequence_data, max_allele_length = load_allele_sequence_data(
    'files/trimmed-human-class1.fasta'
)

# Sorted so the allele order is the same on every run.
allele_list = sorted(create_allele_list(allele_groups, allele_sequence_data))

# Assemble the peptide / MHC inputs and the regression targets for training.
# NOTE(review): mhc_length is hard-coded to 181 while the model below is sized
# from max_allele_length -- confirm the two always agree.
peptide_train, mhc_train, Y_train = get_model_data(
    allele_list,
    allele_sequence_data,
    allele_groups,
    dense_mhc_model=None,
    peptide_length=9,
    mhc_length=181,
    mhc_dense=None
)

# Settings consumed by the model-building cell.
max_sequence_length = max_allele_length
nb_epoch = 30

In [None]:
# Build the convolutional graph sized to the MHC sequence length, then fit it
# on the peptide / MHC inputs against Y_train.
graph = convolution_graph_reshape(
    maxlen_mhc=max_sequence_length,
    nb_epoch=nb_epoch
)
graph.fit(
    {
        'peptide': peptide_train,
        'mhc': mhc_train,
        'output': Y_train
    },
    batch_size=32,
    nb_epoch=nb_epoch,
    verbose=1
)

In [41]:
# Single allele / peptide pair used as a smoke test for the trained graph.
test_allele = 'C0304'
test_peptide = 'GAVDPLLAL'

# Encode both sequences as amino-acid index arrays, without start/end symbols.
test_allele_seq = padded_indices(
    [allele_sequence_data[test_allele]],
    index_dict=amino_acid_letter_indices,
    add_start_symbol=False,
    add_end_symbol=False
)
test_peptide_seq = padded_indices(
    [test_peptide],
    index_dict=amino_acid_letter_indices,
    add_start_symbol=False,
    add_end_symbol=False
)

test_peptide_seq

array([[ 4,  0, 18,  3, 13, 10, 10,  0, 10]])

In [44]:
# Predict for the encoded test pair and keep only the graph's 'output' entry.
output = graph.predict(
    {'peptide': test_peptide_seq, 'mhc': test_allele_seq}
)['output']

In [50]:
# Map the first prediction back through ic50 = 5000 ** (1 - y), the inverse of
# the 1 - log(ic50)/log(5000) transform used for the cutoff constant.
# Written as a call so the cell runs on both Python 2 and Python 3.
print(5000 ** (1 - output[0][0]))

174.981437477


In [78]:
# Predicted values, one per entry of actual_measure.
# NOTE(review): the ten trailing zeros look like unfilled placeholders -- confirm
# before trusting any metric computed from this list.
measures = [
    257.8, 1701, 2309, 2397,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
]
# Reference measurements for the same 14 entries.
actual_measure = np.array([
    720, 588, 18, 180, 78, 2220, 378,
    30, 1080, 324, 12, 972, 720, 120,
])

In [83]:
# Binary labels: 1 where the value is below 500, 0 otherwise.
actual_bool = (np.asarray(actual_measure) < 500).astype(int)
# NOTE(review): this threshold is applied to `measures` *before* the conversion
# on the next statement, so the two uses of `measures` are on different scales --
# confirm which scale the list is meant to hold.
measures_bool = (np.asarray(measures) < 500).astype(int)
# NOTE(review): this rebinds `measures` in place, so re-running the cell converts
# already-converted values; run the cell that defines `measures` first.
measures = [5000 ** (1 - measure) for measure in measures]
# print() with a single argument behaves identically on Python 2 and Python 3.
print(actual_bool)
print(measures_bool)

[0 0 1 1 1 0 1 1 0 1 1 0 0 1]
[1 0 0 1 1 1 1 1 1 1 1 1 1 1]


In [85]:
# AUC of the converted predictions against the binary binder labels.
# NOTE(review): roc_auc_score expects larger scores for the positive class;
# confirm that `measures` is oriented that way after the conversion.
roc_auc_score(y_true=actual_bool, y_score=measures)

0.60416666666666674