In [1]:
from pan_allele_data_helpers import *
import pandas as pd
import numpy as np
np.random.seed(1)
from sklearn.metrics import roc_auc_score, accuracy_score
log_transformed_ic50_cutoff = 1 - np.log(500)/np.log(5000)
from keras.models import Graph
import theano.tensor as T
from keras.models import Sequential, Graph
from keras.layers.embeddings import Embedding
from keras.layers.core import Dense, Activation, RepeatVector, Dropout, Reshape, Flatten, Merge, Permute
import theano
from keras.layers.convolutional import Convolution1D, MaxPooling1D



Couldn't import dot_parser, loading of dot files will not be possible.


In [2]:
def normalize_allele_name(allele_name):
    allele_name = allele_name.upper()
    # old school HLA-C serotypes look like "Cw"
    allele_name = allele_name.replace("CW", "C")
    patterns = [
        "HLA-",
        "-",
        "*",
        ":"
    ]
    for pattern in patterns:
        allele_name = allele_name.replace(pattern, "")
    return allele_name
allele_groups, df = load_binding_data('files/bdata.2009.mhci.public.1.txt')
allele_sequence_data, max_allele_length = load_allele_sequence_data('files/trimmed-human-class1.fasta')
allele_list = sorted(create_allele_list(allele_groups, allele_sequence_data))


In [3]:
training_list = sorted(create_allele_list(allele_groups, allele_sequence_data))
allele = 'A0101'
training_list.remove(allele)

In [4]:

peptide_train, mhc_train, Y_train = get_model_data(training_list,
                                                            allele_sequence_data,
                                                            allele_groups,
                                                            dense_mhc_model=None,
                                                            peptide_length = 9,
                                                            mhc_length=181,
                                                            mhc_dense = None
                                                            )
peptide_test, mhc_test, Y_test = get_model_data([allele],
                                                        allele_sequence_data,
                                                        allele_groups,
                                                        dense_mhc_model=None,
                                                        peptide_length = 9,
                                                        mhc_length=181,
                                                        mhc_dense=None)
peptide_train = np.matrix(peptide_train, dtype=int)
print("train",peptide_train.shape, mhc_train.shape, Y_train.shape)
print("test", peptide_test.shape)
np.random.seed(1)
print peptide_train[1]
arr = np.arange(len(peptide_train))
np.random.shuffle(arr)
peptide_train = peptide_train[arr]
print peptide_train[1], 
##[[ 8 14  8 11  6 11  6  2 18]]

('train', (87680, 9), (87680, 181), (87680,))
('test', (3169, 9))
[[ 0  0  0  8  0  0  0  0 18]]
[[ 8 14  8 11  6 11  6  2 18]]


In [9]:

optimizer='rmsprop'
peptide_length = 9
maxlen_mhc = 181
mhc_activation = 'tanh'
peptide_activation = 'tanh'

hyperparameters = {'sizes':[128,64,128,128,128,64],'dropouts':[0.2,0.2,0.2], 'mult_size':[16,16]}
size_names = [
        'peptide_embedding_size',
        'mhc_embedding_size',
        'peptide_hidden_size',
        'mhc_hidden_size',
        'combined_hidden_size',
        'combined_hidden_final'
        ]

dropout_names = [
        'dropout_merged',
        'dropout_peptide',
        'dropout_mhc'
        ]
mult_size = {
        'mhc_m':hyperparameters['mult_size'][0],
        'mhc_n':hyperparameters['mult_size'][1]
}

size_dict = {}
dropout_dict = {}

for idx, name in enumerate(size_names):
    print idx,name
    size_dict[name] = hyperparameters['sizes'][idx]

for idx, name in enumerate(dropout_names):
    dropout_dict[name] = hyperparameters['dropouts'][idx]


print size_dict, dropout_dict

print("Building Graph Model")
graph = Graph()

graph.add_input(name='peptide', ndim=2)

graph.add_input(name='mhc', ndim=2)

graph.inputs['peptide'].input = T.imatrix()
graph.inputs['mhc'].input = T.imatrix()

##PEPTIDE

graph.add_node(
            Embedding(20,size_dict['peptide_embedding_size']),
            name='peptide_embedding',
            input='peptide')

graph.add_node(
            Flatten(),
            name = 'peptide_flatten',
            input = 'peptide_embedding')

graph.add_node(
                Dense(size_dict['peptide_embedding_size'] * peptide_length, mult_size['mhc_m'],
                activation = peptide_activation ),
            name='peptide_dense',
            input='peptide_flatten'   )


dropout_output_peptide = 'peptide_dense'
if(dropout_dict['dropout_peptide']):
    dropout_output_peptide = 'peptide_dropout'
    graph.add_node(Dropout(dropout_dict['dropout_peptide']), name = 'peptide_dropout', input='peptide_dense')



##MHC

graph.add_node(
            Embedding(20,size_dict['mhc_embedding_size']),
            name='mhc_embedding',
            input='mhc')

graph.add_node(
        Flatten(),
        name = 'mhc_flatten',
        input = 'mhc_embedding')

graph.add_node(
        Dense(size_dict['mhc_embedding_size'] * maxlen_mhc, size_dict['mhc_hidden_size'],
            activation = mhc_activation),
        name='mhc_dense',
        input='mhc_flatten')

dropout_output_mhc = 'mhc_dense'
if(dropout_dict['dropout_mhc']):
    dropout_output_mhc = 'mhc_dropout'
    graph.add_node(Dropout(dropout_dict['dropout_mhc']), name = dropout_output_mhc, input='mhc_dense')

graph.add_node(Dense(size_dict['mhc_hidden_size'], mult_size['mhc_m'] * mult_size['mhc_n'], activation = mhc_activation),
                name = 'mhc_dense_2', input = dropout_output_mhc)
graph.add_node(Reshape(mult_size['mhc_m'],mult_size['mhc_n']), name = 'mhc_final', input = 'mhc_dense_2')
##MERGE

graph.add_node(
        Dense(mult_size['mhc_n'], size_dict['combined_hidden_size'],
            activation = "relu"),
        name='dense_merged_1',
        inputs=[dropout_output_peptide,'mhc_final'],
        merge_mode='matmul')


graph.add_node(
        Dropout(dropout_dict['dropout_merged']),
        name = 'dense_dropout_1',
        input='dense_merged_1')


graph.add_node(
        Dense(size_dict['combined_hidden_size'],size_dict['combined_hidden_final'],activation = "relu"),
        name = 'dense_merged_2',
        input = 'dense_dropout_1')


graph.add_node(
        Dropout(dropout_dict['dropout_merged']),
        name = 'dense_dropout_2',
        input='dense_merged_2')


graph.add_node(
        Dense(size_dict['combined_hidden_final'],1,activation = "sigmoid"),
        name = 'dense_output',
        input = 'dense_dropout_2')

graph.add_output(
        name='output',
        input='dense_output')

graph.compile(optimizer,{'output':'mse'})


0 peptide_embedding_size
1 mhc_embedding_size
2 peptide_hidden_size
3 mhc_hidden_size
4 combined_hidden_size
5 combined_hidden_final
{'combined_hidden_final': 64, 'peptide_embedding_size': 128, 'peptide_hidden_size': 128, 'mhc_embedding_size': 64, 'mhc_hidden_size': 128, 'combined_hidden_size': 128} {'dropout_peptide': 0.2, 'dropout_mhc': 0.2, 'dropout_merged': 0.2}
Building Graph Model


In [10]:
print(peptide_train.shape)
print(type(peptide_train), type(Y_train))
print(mhc_train.shape)
print(Y_train.shape)
graph.fit(
            {'peptide':peptide_train,'mhc':mhc_train, 'output': Y_train},
            batch_size=32,
            nb_epoch=1,
            verbose = 1
    )

(87680, 9)
(<class 'numpy.matrixlib.defmatrix.matrix'>, <type 'numpy.ndarray'>)
(87680, 181)
(87680,)
Epoch 0
 1824/87680 [..............................] - ETA: 153s - output: 0.1312

KeyboardInterrupt: 

In [40]:
model = graph
input_shapes = {'peptide':(10,9),'mhc':(10,181)}
input_dummy = [np.zeros(input_shapes[name], dtype=np.float32)
                       for name in model.input_order]
inputs = [model.inputs[name].input
                  for name in model.input_order]
for l in [graph.nodes[c['name']] for c in graph.node_config]:
    shape_i = theano.function(inputs, l.get_input(train=False).shape,
                                  on_unused_input='ignore', allow_input_downcast=True) 
    shape_o = theano.function(inputs, l.get_output(train=False).shape,
                                  on_unused_input='ignore', allow_input_downcast=True)
    try:
        in_shape = tuple(shape_i(*input_dummy))
        out_shape = tuple(shape_o(*input_dummy))
        config = l.get_config()
        print("%-20s %-20s %-20s" % (config['name'],  in_shape, out_shape))
    except Exception as e:
        print("Error: %s" % e)
        

Embedding            (10, 181)            (10, 181, 32)       
Convolution1D        (10, 181, 32)        (10, 178, 64)       
Permute              (10, 178, 64)        (10, 64, 178)       
Error: 'MaxPooling1D' object has no attribute 'subsample_length'
Permute              (10, 64, 89)         (10, 89, 64)        
Convolution1D        (10, 89, 64)         (10, 86, 64)        
Permute              (10, 86, 64)         (10, 64, 86)        
Error: 'MaxPooling1D' object has no attribute 'subsample_length'
Permute              (10, 64, 43)         (10, 43, 64)        
Flatten              (10, 43, 64)         (10, 2752)          
Dense                (10, 2752)           (10, 64)            
Reshape              (10, 64)             (10, 8, 8)          
Embedding            (10, 9)              (10, 9, 32)         
Convolution1D        (10, 9, 32)          (10, 6, 64)         
Flatten              (10, 6, 64)          (10, 384)           
Dense                (10, 384)            (10, 8)  

In [None]:
for l in [graph.nodes[c['name']] for c in graph.node_config]:
    

In [7]:
maxlen_mhc = 181 
nb_epoch = 64
optimizer='rmsprop'
mhc_activation='relu' 
peptide_activation= 'relu'

max_features = 20
maxlen_peptide = 9
batch_size = 32
embedding_dims = 32
nb_filters =  50
filter_length = 4
hidden_dims = 64

graph = Graph()
graph.add_input(name='peptide', ndim=2)
graph.inputs['peptide'].input = T.imatrix()


graph.add_node( Embedding(max_features, embedding_dims),
                name = 'peptide_embedding',
                input = 'peptide'
                )
graph.add_node( Convolution1D(
                    input_dim=embedding_dims,
                    nb_filter=nb_filters,
                    filter_length=filter_length,
                    border_mode="valid",
                    activation="relu",
                    subsample_length=1),

                name = 'peptide_conv',
                input = 'peptide_embedding'
                )

graph.add_node(Flatten(), name = 'peptide_flatten',input='peptide_embedding')
graph.add_node(Dense(288,6), name= 'peptide_dense', input= 'peptide_flatten')
##MERGE
last_peptide = 'peptide_dense'
last_mhc = 'peptide_conv'
graph.add_node( Dense(50, 1),
                name='merged_output',
                inputs=[last_peptide,last_mhc,],
                merge_mode='mul',
                
                )
graph.add_output(   name='output',  input='merged_output')


graph.compile(  optimizer,  {'output':'mse'})

