In [1]:
from __future__ import print_function

%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.sparse import * 
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, Normalizer
from sklearn.feature_extraction.text import TfidfVectorizer,HashingVectorizer, TfidfTransformer
from sklearn.cluster import KMeans, DBSCAN, AffinityPropagation
from math import log
from gensim.models import Word2Vec

from hyperopt import hp, tpe
from hyperopt.fmin import fmin

from sklearn.datasets import fetch_20newsgroups
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, make_scorer
import lightgbm as lgb
import gc
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.decomposition import PCA

import logging
from optparse import OptionParser
import sys
from time import time
import json
from sklearn.metrics import roc_auc_score

import warnings
warnings.filterwarnings("ignore")

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
def labelenc(df, enc):
    """Encode every column of ``df`` in place with an already-fitted encoder.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are overwritten one by one.
    enc : object
        Anything exposing ``transform(values)``; it is called once per column.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in (mutated, not copied).
    """
    column_names = df.columns
    for name in column_names:
        encoded = enc.transform(df[name])
        df[name] = encoded

    return df

def tear(serie, n, col):
    """Split each sequence in ``serie`` into ``n`` positional columns.

    Column ``<col>_P<i>`` (1-based ``i``) holds the element at position ``i``
    of each value, or the string ``'not_exist'`` when the value is shorter
    than ``i``. ``serie`` is copied first, so the caller's data is untouched.

    Parameters
    ----------
    serie : pandas.Series
        Series of indexable values; it is looked up under the column label
        ``col`` after being wrapped in a DataFrame.
    n : int
        Number of positional columns to create.
    col : str
        Label of the source column and prefix for the generated columns.

    Returns
    -------
    pandas.DataFrame
        Only the generated positional columns (the source column is excluded).
    """
    frame = pd.DataFrame(serie.copy())

    for position in range(n):
        def pick(value, position=position):
            # Zero-based position; pad short sequences with a sentinel.
            if position < len(value):
                return value[position]
            return 'not_exist'

        frame[col + '_P' + str(position + 1)] = frame[col].apply(pick)

    kept = [name for name in frame.columns if name != col]

    return frame[kept]

def tfidf(df, col):
    """Append character-level TF-IDF features of ``df[col]`` to ``df`` in place.

    A ``TfidfVectorizer(analyzer='char')`` is fitted on ``df[col]``; one column
    named ``<col>_<feature>_tfidf`` is added per vocabulary feature, stored as
    ``float16``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend (mutated).
    col : str
        Name of the text column to vectorize.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the extra TF-IDF columns.
    """
    vectorizer = TfidfVectorizer(analyzer='char')
    X = np.array(vectorizer.fit_transform(df[col]).toarray(), dtype=np.float16)

    # Fetch the vocabulary once instead of twice per loop iteration.
    # get_feature_names() was removed in scikit-learn 1.2 in favour of
    # get_feature_names_out(); fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = vectorizer.get_feature_names_out()
    else:
        feature_names = vectorizer.get_feature_names()

    for i, feature in enumerate(feature_names):
        df[col + '_' + feature + '_tfidf'] = X[:, i]

    return df

def hla_preprocess(df):
    """Replace the labels in ``df['hla']`` with integer codes, in place.

    A fresh ``LabelEncoder`` is fitted on the column and its codes overwrite
    the column. The encoder itself is not returned or saved.

    Disabled experiments that previously lived here as commented-out code:
    deriving an ``allele_type`` prefix column from ``hla``, label-encoding it,
    and saving both encoders' ``classes_`` under ``Data/`` with ``np.save``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing an ``hla`` column (mutated).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object.
    """
    encoder = LabelEncoder()
    # fit_transform is the single-call equivalent of fit() followed by transform().
    df['hla'] = encoder.fit_transform(df['hla'])

    return df
#df = hla_preprocess(df)

In [4]:
def load_data():
  """Load, merge and filter the peptide/affinity/allele table.

  Steps, in order:
    1. Read ``./Data/train1.csv`` and ``./Data/test1.csv`` (no header row) and
       stack them; drop exact duplicate rows.
    2. Name the three columns ``peptide``, ``aff``, ``hla``.
    3. Average ``aff`` over repeated (peptide, hla) pairs.
    4. Keep only alleles with at least 20 rows.
    5. Keep only alleles with at least 4 rows whose ``aff`` is at or above
       ``1 - log(500)/log(50000)``.
       NOTE(review): presumably the 500 nM IC50 binder cutoff on a
       log-transformed scale -- confirm against the data source.

  Returns
  -------
  pandas.DataFrame
      Columns ``peptide``, ``hla``, ``aff`` with a fresh 0..n-1 index.
  """
  train = pd.read_csv('./Data/train1.csv', header=None)
  test = pd.read_csv('./Data/test1.csv', header=None)

  # DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent.
  df = pd.concat([train, test])
  df = df.drop_duplicates().reset_index(drop=True)
  df.columns = ['peptide', 'aff', 'hla']
  df = df.groupby(['peptide', 'hla']).mean().reset_index()
  df = df.groupby('hla').filter(lambda x: len(x) >= 20).reset_index(drop=True)

  threshold = 1 - log(500) / log(50000)
  df['tmp'] = df['aff'].apply(lambda x: 1 if x >= threshold else 0)
  # alpha['tmp'] is True for alleles with fewer than 4 rows above the threshold.
  alpha = df.groupby('hla').agg({'tmp': 'sum'}) < 4
  alpha = alpha.reset_index()
  # Both frames carry a 'tmp' column, so the merge yields tmp_x (row flag)
  # and tmp_y (per-allele "too few binders" flag).
  df = pd.merge(df, alpha, on='hla', how='left')
  df = df[df['tmp_y'] == False]
  # Drop the helper columns without mutating a filtered slice in place, and
  # reset the index: the boolean filter above leaves gaps, which breaks the
  # positional fold indices and axis=1 concatenations used downstream.
  df = df.drop(columns=['tmp_x', 'tmp_y']).reset_index(drop=True)

  print('Load_data...done')
  return df
# Build the filtered peptide/hla/aff table once; the cell's output is its (rows, columns) shape.
df = load_data()
df.shape

Load_data...done


(133268, 3)

In [5]:
# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,peptide,hla,aff
0,AAAAAAAAAAA,H-2-IAd,0.324088
1,AAAAAGTTVYGAFAA,HLA-DPA10103-DPB10401,0.129502
2,AAAAAGTTVYGAFAA,HLA-DPA10103-DPB10601,0.000000
3,AAAAAGTTVYGAFAA,HLA-DQA10102-DQB10602,0.856229
4,AAAAAGTTVYGAFAA,HLA-DQA10401-DQB10402,0.541205
5,AAAAAGTTVYGAFAA,HLA-DQA10501-DQB10301,0.645158
6,AAAAAVAAEAY,DRB1_0101,0.239722
7,AAAAAVAAEAY,DRB1_0301,0.000000
8,AAAAAVAAEAY,DRB1_0401,0.000000
9,AAAAAVAAEAY,DRB1_0404,0.000000


In [6]:
def vector_generator(string, hla, dim):
  """Sum the per-residue embedding vectors of a peptide and append its allele.

  The previous implementation re-created its accumulator inside the loop and
  used ``list += ndarray`` (which extends the list rather than adding
  element-wise), so only the last residue's vector survived, prefixed by
  ``dim`` zeros. It also raised ``UnboundLocalError`` for an empty peptide and
  turned every number into a string when ``hla`` was a string (``np.append``).

  Parameters
  ----------
  string : str
      Peptide sequence; each character is looked up in the module-level
      ``model.wv``.
  hla : object
      Allele label appended as the last element of the result.
  dim : int
      Length of the accumulated vector. Must equal the length of the vectors
      returned by ``model.wv[...]``, otherwise NumPy raises ``ValueError``.

  Returns
  -------
  pandas.Series
      ``dim`` summed embedding components at positions ``0..dim-1`` followed
      by ``hla`` at position ``dim``. An empty peptide yields zeros.
  """
  total = np.zeros(dim, dtype=float)
  for s in string:
    total += model.wv[s]

  # Build from a Python list so a string allele does not coerce the numeric
  # components to strings.
  return pd.Series(list(total) + [hla])

def pca_process(vectors, N_COM):
  """Project ``vectors`` onto ``N_COM`` principal components.

  A new ``PCA`` is fitted on ``vectors`` on every call; the fitted object is
  discarded.

  Returns
  -------
  pandas.DataFrame
      The transformed data with default integer column labels and index.
  """
  reducer = PCA(n_components=N_COM)
  projected = reducer.fit_transform(vectors)
  return pd.DataFrame(projected)

# Load the saved Word2Vec model; vector_generator reads it through the global name `model`.
model = Word2Vec.load('./Data/HLA-Vec_Object2.model')

# One row of generated values per (peptide, hla) pair, using dim=2.
df_ = df[['peptide', 'hla']].apply(
  lambda row: vector_generator(row['peptide'], row['hla'], 2),
  axis=1,
)

# Remove the column labelled 2.
# NOTE(review): with dim=2 this looks intended to discard the appended hla value -- confirm
# against vector_generator's actual output layout.
df_ = df_.drop(columns=2)
#df_ = pca_process(df_, 20)

# Prefix the remaining integer column labels, then attach the columns to df (aligned on index).
df_col = ['peptide_vector_{}'.format(k) for k in df_.columns]
df_.columns = df_col
df = pd.concat([df, df_], axis=1)

In [5]:
# One-hot encode the allele label: one indicator column per distinct df['hla'] value.
# The `sparse=False` keyword was renamed to `sparse_output` in scikit-learn 1.2 and
# later removed, so rely on the default sparse output and densify explicitly, which
# works on every release.
alpha = np.array(df['hla']).reshape(-1, 1)  # the encoder expects a 2-D (n_samples, 1) array
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(alpha).toarray()
print(onehot_encoded)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 1. 0.]
 [1. 0. 0. ... 0. 0. 0.]]


In [6]:
# concat(axis=1) aligns on index labels. Give the one-hot block df_'s index so rows pair up
# positionally even when df_ does not carry a clean 0..n-1 index (otherwise misaligned labels
# produce NaN-padded rows).
_df = pd.concat([df_, pd.DataFrame(onehot_encoded, index=df_.index)], axis = 1)

In [59]:
from keras.models import Sequential
from keras.layers.recurrent import LSTM, GRU
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.utils import np_utils
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.preprocessing import sequence, text
from keras.callbacks import EarlyStopping

# Number of cross-validation folds. Previously this name was only defined in a later
# cell, so this cell failed on a fresh kernel.
N_FOLDS = 5


def build_lstm_regressor(n_timesteps):
  """Return a freshly initialised, compiled 4-layer LSTM regressor.

  Parameters
  ----------
  n_timesteps : int
      Length of the input sequence; each step carries a single feature, so the
      model expects inputs shaped (samples, n_timesteps, 1).
  """
  regressor = Sequential()
  # First LSTM layer with Dropout regularisation
  regressor.add(LSTM(units=20, return_sequences=True, input_shape=(n_timesteps, 1)))
  regressor.add(Dropout(0.2))
  # Second LSTM layer
  regressor.add(LSTM(units=50, return_sequences=True))
  regressor.add(Dropout(0.2))
  # Third LSTM layer
  regressor.add(LSTM(units=50, return_sequences=True))
  regressor.add(Dropout(0.2))
  # Fourth LSTM layer (returns only the last step)
  regressor.add(LSTM(units=20))
  regressor.add(Dropout(0.2))
  # The output layer
  regressor.add(Dense(units=1))

  # Compiling the RNN
  regressor.compile(optimizer='rmsprop', loss='mean_squared_error')
  return regressor


X = _df.copy()
y = df['aff']

oof_preds = np.zeros(np.array(y).shape)
for train_index, test_index in KFold(n_splits=N_FOLDS).split(X):

  # KFold yields positional indices, so select with iloc (loc would look up labels).
  X_train, X_test = X.iloc[train_index], X.iloc[test_index]
  y_train, y_test = y.iloc[train_index], y.iloc[test_index]

  # Treat every feature column as one time step with a single channel.
  X_train = np.reshape(np.array(X_train), (X_train.shape[0], X_train.shape[1], 1))
  X_test = np.reshape(np.array(X_test), (X_test.shape[0], X_test.shape[1], 1))

  y_train, y_test = np.array(y_train), np.array(y_test)

  # Build a new model per fold. Reusing one model across folds lets weights learned
  # on earlier folds' training rows (which include this fold's held-out rows) leak
  # into the out-of-fold predictions. The input length is taken from the data
  # instead of an X_train left over from a previous run.
  regressor = build_lstm_regressor(X_train.shape[1])

  # The model is compiled with a loss and no extra metrics, so the logged quantities
  # are 'loss' and 'val_loss'. Monitoring 'mean_squared_error' never matched a logged
  # key, so early stopping never triggered.
  earlystop = EarlyStopping(monitor='val_loss', min_delta=0, patience=3, verbose=0, mode='auto')

  regressor.fit(X_train, y_train, batch_size=512, epochs=20,
          verbose=1, validation_data=(X_test, y_test), callbacks=[earlystop])
  oof_preds[test_index] = regressor.predict(X_test).ravel()

Train on 106677 samples, validate on 26670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 106677 samples, validate on 26670 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20
Train on 106678 samples, validate on 26669 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20

KeyboardInterrupt: 

In [7]:
# Disabled alternative: split each peptide into per-position columns with tear() and
# label-encode them with labelenc() before attaching them to df.
#a = tear(df['peptide'], df['peptide'].map(len).max(), 'peptide')
#df = pd.concat([df, labelenc(a, LabelEncoder().fit(a.values.flatten()))], axis=1)
# Add the character-level TF-IDF columns for 'peptide'; tfidf() mutates df and returns it.
df = tfidf(df, 'peptide')

In [8]:
# Load the substitution matrix as a nested mapping: d[residue_a][residue_b] -> score.
with open('Data/blosum62.json') as json_data:
    d = json.load(json_data)


def blosum_self_score(peptide, matrix):
    """Sum ``matrix[a][b]`` over every ordered pair of positions in ``peptide``.

    This equals the original double loop over the string (each position is also
    paired with itself), but groups identical residues first. The work is then
    bounded by the number of distinct residues squared rather than the peptide
    length squared. Results are identical for integer scores; with
    floating-point scores the summation order differs, so the last bits may too.
    Raises ``KeyError`` if a residue is missing from ``matrix``; returns 0 for
    an empty peptide.
    """
    counts = {}
    for residue in peptide:
        counts[residue] = counts.get(residue, 0) + 1
    return sum(
        n_a * n_b * matrix[a][b]
        for a, n_a in counts.items()
        for b, n_b in counts.items()
    )


# Cache scores by peptide string so repeated peptides are scored once.
score_cache = {}
output = []
for string in tqdm(df['peptide']):
    if string not in score_cache:
        score_cache[string] = blosum_self_score(string, d)
    output.append(score_cache[string])

df['blosum_peptide'] = output

100%|██████████| 133348/133348 [35:25<00:00, 62.74it/s]   


In [69]:
#df.drop(columns='aff').to_csv('random_try.csv', index = False)

In [8]:
# Parameters
N_FOLDS = 5
MAX_BOOST_ROUNDS = 100000
LEARNING_RATE = 0.01
# Rows with affinity at or above this value are labelled 1.
# NOTE(review): presumably the 500 nM IC50 cutoff on a log-transformed scale -- confirm.
BINDER_THRESHOLD = 1 - log(500) / log(50000)

# Embedding columns are excluded from this stage.
vectors = [c for c in df.columns if str(c).startswith('peptide_vector_')]

# The affinity column is 'aff_x' after some merges and plain 'aff' otherwise; accept both.
target_col = 'aff_x' if 'aff_x' in df.columns else 'aff'

# 'aff' is listed so the raw target can never remain among the features. Columns
# that are not present are skipped (errors='ignore') instead of raising KeyError.
cols_to_drop = ['aff', 'aff_x', 'aff_y', 'peptide', 'core', 'LPFR', 'RPFR', 'oof_preds'] + list(vectors)

# The former `df.sample(frac=1, replace=True, random_state=0).reset_index(drop=True)`
# statement discarded its result, so it never changed df; it is removed. Folds therefore
# stay contiguous in df's row order, exactly as before.
#df.rename(columns={'aff_x':'aff'},  inplace = True)
X = df.drop(columns=cols_to_drop, errors='ignore')
y = df[target_col].apply(lambda x: 1 if x >= BINDER_THRESHOLD else 0)

# Booster parameters are identical for every fold, so build them once.
params = {
    'objective': 'binary',
    'metric': 'auc',
    "boosting": 'gbdt',
    'learning_rate': LEARNING_RATE,
    #'is_unbalance': True,
}

oof_preds = np.zeros(X.shape[0])
#sub_preds = np.zeros(test_df.shape[0])
feats = [f for f in X.columns]
fold_importances = []

for fold, (train_index, test_index) in enumerate(KFold(n_splits=N_FOLDS).split(X), start=1):

    # KFold yields positional indices, so select with iloc (loc would look up labels).
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    d_train = lgb.Dataset(X_train, label=y_train)
    d_valid = lgb.Dataset(X_test, label=y_test)

    # NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments of
    # older LightGBM releases; newer releases expect callbacks instead. Kept as-is to
    # match the version this notebook was run with.
    clf = lgb.train(
            params=params,
            train_set=d_train,
            num_boost_round=MAX_BOOST_ROUNDS,
            valid_sets=[d_train, d_valid],
            early_stopping_rounds=200,
            verbose_eval=1000
        )

    oof_preds[test_index] = clf.predict(X_test)

    # Record the real fold number (this was the constant N_FOLDS + 1 for every fold).
    fold_importances.append(pd.DataFrame({
        "feature": feats,
        "importance": clf.feature_importance(importance_type='gain'),
        "fold": fold,
    }))

# Concatenate once instead of growing a frame inside the loop.
feature_importance_df = pd.concat(fold_importances, axis=0)
feature_importance_df.to_csv('stage2_FI_1.csv', index = False)
#!cp ./feature_importance_df2.csv -d /content/../gdrive/My\ Drive/MHC/Project/Data/ 

Training until validation scores don't improve for 200 rounds.
[1000]	training's auc: 0.846528	valid_1's auc: 0.791576
[2000]	training's auc: 0.875724	valid_1's auc: 0.802428
[3000]	training's auc: 0.894957	valid_1's auc: 0.809082
[4000]	training's auc: 0.909049	valid_1's auc: 0.814544
[5000]	training's auc: 0.920638	valid_1's auc: 0.818717
[6000]	training's auc: 0.929941	valid_1's auc: 0.821985


KeyboardInterrupt: 

In [11]:
# Write the model held in `clf` to a text file via its save_model method.
# NOTE(review): this needs `clf` to be the object returned by lgb.train; the recorded
# AttributeError shows the cell was last run while `clf` held a LinearSVC.
clf.save_model('mode_stage2.txt')
# Reload / predict example, kept for reference:
#bst = lgb.Booster(model_file='mode.txt')
#bst.predict(X_test)

AttributeError: 'LinearSVC' object has no attribute 'save_model'

In [None]:
# ROC AUC of the out-of-fold predictions against binary labels derived from df['aff']
# (1 where aff is at or above 1 - log(500)/log(50000), else 0).
roc_auc_score(
    (df['aff'] >= (1 - log(500) / log(50000))).astype(int),
    oof_preds,
)
#df.drop(columns='oof_preds',axis = 1, inplace = True)

In [19]:
from sklearn.svm import SVC

# Meta-features for stacking: the prediction column already stored in df plus the
# out-of-fold predictions currently held in `oof_preds`.
# NOTE(review): df['oof_preds'] is not created anywhere in the visible notebook --
# confirm where it comes from before relying on this cell.
f = pd.DataFrame(df['oof_preds'])
f['oof_pred_2'] = oof_preds
oof_preds_2 = np.zeros(f.shape[0])

for train_index, test_index in KFold(n_splits=N_FOLDS).split(f):
  # Train on the meta-features `f`. The loop previously split `f` but then fitted on
  # the raw feature matrix `X`, so the stacked inputs were never used.
  f_train, f_test = f.iloc[train_index], f.iloc[test_index]
  y_train = y.iloc[train_index]

  # Use a fresh estimator per fold under its own name, so the trained LightGBM model
  # in `clf` is not overwritten.
  meta_clf = SVC()
  meta_clf.fit(f_train, y_train)

  # ROC AUC needs a continuous ranking score; hard 0/1 labels from predict() collapse
  # the curve to a single operating point.
  oof_preds_2[test_index] = meta_clf.decision_function(f_test)

# Score against the labels the stacker was trained on. The previous 0.5 cutoff on
# df['aff'] did not match the threshold used to build `y`.
roc_auc_score(y, oof_preds_2)

cnt


KeyboardInterrupt: 

In [None]:
# ROC AUC of the equal-weight average of the two prediction columns in `f`, against the
# binary labels derived from df['aff'] (1 where aff >= 1 - log(500)/log(50000), else 0).
roc_auc_score(
    (df['aff'] >= (1 - log(500) / log(50000))).astype(int),
    f['oof_preds'].add(f['oof_pred_2']).div(2),
)