In [32]:
import os
import sys
sys.path.append(os.path.split(os.getcwd())[0])
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score, precision_score, accuracy_score
from collections import namedtuple
from pan_allele.helpers.pan_allele_data_helpers import *
from pan_allele.helpers.sequence_encoding import *
from pan_allele.helpers.amino_acid import *
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.optimizers import SGD

In [2]:
def load_binding_data_2(filename, max_ic50 = 50000):
    """
    Read a tab-separated binding-affinity table and group it by allele.

    Rows whose `sequence` contains '-' or '+' are discarded. For every value
    of the `mhc` column, the measurements in `meas` are rescaled to
    1 - log(ic50) / log(max_ic50) and clipped to the interval [0, 1].

    Parameters
    ----------
    filename : anything `pandas.read_csv` accepts (path or file-like object)
        Table with at least the columns `mhc`, `sequence` and `meas`.
    max_ic50 : number, default 50000
        Measurement that maps to a target of 0; larger measurements are
        clipped to 0 as well.

    Returns
    -------
    (dict, pandas.DataFrame)
        The dict maps a cleaned allele name (with "HLA-", "-", "*" and ":"
        removed) to an `AlleleData(Y, peptides, ic50)` namedtuple; the frame
        is the filtered table.
    """
    AlleleData = namedtuple("AlleleData", "Y peptides ic50")
    # Substrings stripped, in this order, from every allele name.
    name_noise = ("HLA-", "-", "*", ":")

    table = pd.read_csv(filename, sep="\t")
    sequences = table['sequence']
    # str.find returns -1 when the character is absent
    keep = (sequences.str.find('-') < 0) & (sequences.str.find('+') < 0)
    table = table[keep]

    by_allele = {}
    for raw_name, rows in table.groupby("mhc"):
        clean_name = raw_name
        for noise in name_noise:
            clean_name = clean_name.replace(noise, "")

        measured = np.array(rows["meas"])
        scaled = 1.0 - np.log(measured) / np.log(max_ic50)
        by_allele[clean_name] = AlleleData(
            Y=np.clip(scaled, 0.0, 1.0),
            peptides=list(rows["sequence"]),
            ic50=measured)

    return by_allele, table

In [3]:
# Load bdata.classii.2010.csv (path is relative to the working directory):
# allele_groups maps cleaned allele names to AlleleData tuples, df is the
# filtered table returned alongside it.
allele_groups, df = load_binding_data_2('bdata.classii.2010.csv')

In [55]:
# AlleleData unpacks in field order (Y, peptides, ic50): scaled targets,
# peptide strings and raw measurements for the DRB10101 entry.
Y_true, peptides, ic50 = allele_groups['DRB10101']

In [5]:
# Sanity check: count the sequences that still contain '-' after filtering.
# str.find returns the index of the match (0 when the sequence starts with
# '-') and -1 when there is none, so presence is `>= 0`; the earlier `> 0`
# test could not see a dash in the first position.
s = df.sequence
np.sum(s.str.find('-')>=0)

0

In [76]:
# Feed-forward regressor: 180 inputs -> 64 tanh -> 128 tanh -> 1 sigmoid, with
# dropout of 0.2 after each hidden layer, compiled with MSE loss and SGD.
# The sigmoid keeps the output on the same [0, 1] scale as the clipped targets.
# NOTE(review): 180 matches a flattened one-hot 9-mer only if the alphabet has
# 20 letters (9 * 20) — confirm against onehot / amino_acid_letter_indices.
affinity_model = Sequential()
affinity_model.add(Dense(64, input_dim=180, activation='tanh'))
affinity_model.add(Dropout(0.2))
affinity_model.add(Dense(128, input_dim=64, activation='tanh'))
affinity_model.add(Dropout(0.2))
affinity_model.add(Dense(1, input_dim=128, activation='sigmoid'))
affinity_model.compile(loss='mse', optimizer='sgd')



In [83]:
nb_epoch = 50

# The 9-mer windows and their encodings do not depend on the model, so they
# are built once here instead of being recomputed in every epoch.
core_batches = []   # one (n_windows, n_features) matrix per usable peptide
Y_final = []
for idx, peptide in enumerate(peptides):
    # peptides shorter than 9 residues contain no 9-mer and are skipped
    if len(peptide) >= 9:
        # get all overlapping 9-mers from the peptide
        split_peptides = [peptide[pos:pos+9] for pos in range(0, len(peptide)-9 + 1)]
        X = onehot(split_peptides, index_dict=amino_acid_letter_indices)
        # flatten each encoded 9-mer into a single feature vector
        X = X.reshape(X.shape[0], X.shape[1]*X.shape[2])
        core_batches.append(X)
        # every 9-mer inherits the target of the peptide it was cut from
        Y_final.extend([Y_true[idx]]*len(split_peptides))
X_final = np.vstack(core_batches)
Y_final = np.array(Y_final)

# Each epoch: score every 9-mer with the current model, turn the scores into
# per-peptide weights, then run one weighted training pass over all 9-mers.
for i in range(nb_epoch):
    probs = []
    for X in core_batches:
        # predict binding strength values for the 9-mers of this peptide
        predictions = affinity_model.predict(X)
        # normalize the predictions within the peptide so they sum to 1; the
        # result is the weight given to each 9-mer being the binding core
        probs.append(predictions/np.sum(predictions))
    # predict() returns one column per peptide; sample_weight needs a single
    # scalar per training row, so concatenate and flatten to shape (n_rows,)
    probs = np.concatenate(probs).ravel()
    affinity_model.fit(X_final,Y_final, nb_epoch=1, sample_weight=probs, verbose=0)
       
            
     

In [84]:
# Score every peptide of at least 9 residues as the mean model prediction over
# all of its overlapping 9-mers, and collect the matching true target.
Y_pred = []
Y_true_score = []
for idx, peptide in enumerate(peptides):
    if len(peptide) < 9:
        # too short to contain a 9-mer
        continue
    n_windows = len(peptide) - 9 + 1
    split_peptides = [peptide[start:start+9] for start in range(n_windows)]
    X = onehot(split_peptides, index_dict=amino_acid_letter_indices)
    # one flat feature vector per 9-mer
    flat_width = X.shape[1] * X.shape[2]
    X = X.reshape(X.shape[0], flat_width)
    predictions = affinity_model.predict(X)
    Y_pred.append(np.mean(predictions))
    Y_true_score.append(Y_true[idx])

Y_true_score = np.array(Y_true_score)
Y_pred = np.array(Y_pred)

In [85]:
Y_binary = 50000**(1-Y_true_score)
print len(Y_binary)
Y_binary = Y_binary<500
Y_pred_binary = 50000**(1-Y_pred)
Y_pred_binary = Y_pred_binary<500
print "Training AUC:", roc_auc_score(Y_binary,Y_pred)
print "Training Accuracy", accuracy_score(Y_binary, Y_pred_binary)

7233
Training AUC: 0.761575594003
Training Accuracy 0.712982165077


In [52]:
all_pep = []
Y  = affinity_model.predict(X_final)
print Y.shape, Y_final.shape
Y_binary = 50000**(1-Y_final)
Y_binary = Y_binary<500
Y_pred_binary = Y<500
print roc_auc_score(Y_binary,Y), accuracy_score(Y_binary, Y_pred_binary)

 (52220, 1) (52220,)
0.703011654801 0.621179624665


In [9]:
# Load bdata.2009.mhci.public.1.txt with load_binding_data, which is not
# defined in this notebook — presumably it comes from one of the wildcard
# pan_allele imports; confirm which one.
# NOTE(review): the absolute path only exists on one machine; derive it from a
# configurable data directory before sharing this notebook.
allele_groups_class1, random_df = load_binding_data('/Users/NanditaD/Intern/pan_allele/pan_allele/files/bdata.2009.mhci.public.1.txt')

In [20]:
# Load allele sequences from the trimmed IEDB FASTA file; the second return
# value is later passed to get_model_data as mhc_length.
# NOTE(review): the absolute path only exists on one machine; derive it from a
# configurable data directory before sharing this notebook.
allele_sequence_data, mhclength = load_allele_sequence_data('/Users/NanditaD/Intern/pan_allele/pan_allele/files/trimmed-human-class1-IEDB.fasta')

In [21]:
# Alleles handed to get_model_data; only A0201 is selected here.
allele_list = ['A0201']

In [24]:
# Build model inputs for the selected alleles.
# NOTE(review): this rebinds `peptides`, which earlier cells use as the list of
# peptide strings for DRB10101; re-running those cells after this one will
# operate on the array returned here instead. Consider a distinct name.
peptides, mhc, Y_class1 = get_model_data(allele_list, allele_sequence_data, allele_groups_class1, mhc_length=mhclength)

In [25]:
# Display the array returned by get_model_data.
# NOTE(review): this echoes the whole array; a slice such as peptides[:5] plus
# peptides.shape would keep the saved output small.
peptides

array([[  6.,  10.,  15., ...,   4.,   2.,  10.],
       [  8.,   2.,   8., ...,   5.,  15.,   3.],
       [ 14.,   4.,  12., ...,  15.,  16.,  14.],
       ..., 
       [ 14.,   0.,   6., ...,  18.,   0.,  16.],
       [ 16.,  18.,   6., ...,  17.,  10.,  10.],
       [  8.,  14.,   1., ...,   1.,  18.,  19.]])