In [20]:
import numpy as np
import pandas as pd
import re
from itertools import zip_longest
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from math import log
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

def read_train_test(typ, N_FOLD, data_dir='./Data/MHCPanII'):
  """Load one fold of the binding data as a DataFrame.

  The file read is ``<data_dir>/<typ><N_FOLD>.txt``; it is parsed as a
  tab-separated table without a header row.

  Parameters
  ----------
  typ : str
      File-name prefix, e.g. ``'train'`` or ``'test'``.
  N_FOLD : int or str
      Fold identifier appended to ``typ`` in the file name.
  data_dir : str, optional
      Directory holding the fold files. Defaults to the location that was
      previously hard-coded, so existing calls behave the same.

  Returns
  -------
  pandas.DataFrame
      Frame with the columns ``peptide``, ``aff`` and ``hla``.
  """
  path = '{}/{}{}.txt'.format(data_dir, typ, N_FOLD)
  fold = pd.read_csv(path, sep="\t", header=None)
  # Assigning the names afterwards keeps the length check: a file with a
  # different number of columns raises instead of being silently reshaped.
  fold.columns = ['peptide', 'aff', 'hla']

  return fold

def slicing(string, size=3):
  """Cut ``string`` into consecutive chunks of ``size`` characters.

  The last chunk is shorter when the length is not a multiple of ``size``
  (``'ABCDEFG'`` -> ``['ABC', 'DEF', 'G']``).

  Parameters
  ----------
  string : str
      Text to split.
  size : int, optional
      Chunk length; defaults to 3.

  Returns
  -------
  list of str
      The chunks in order. An empty input yields ``['']``, which is what
      the earlier join/split implementation returned.
  """
  t = iter(string)
  # The same iterator repeated `size` times makes zip_longest consume the
  # input `size` characters at a time; the final group is padded with "".
  chunks = [''.join(group) for group in zip_longest(*[t] * size, fillvalue="")]
  # Chunks are built directly rather than joined on ',' and split again,
  # so a comma inside the input no longer breaks a chunk apart.
  return chunks if chunks else ['']

def transform_hla_name(string):
  """Normalise an allele name to ``<letters>_<digits>``.

  Every ``*`` and ``:`` is removed, then the leading run of letters and the
  run of digits that directly follows it are joined with an underscore,
  e.g. ``'DPA1*01:03'`` -> ``'DPA_10103'``.

  Raises ``AttributeError`` when the cleaned name does not start with
  letters immediately followed by digits (the regex match is ``None``).
  """
  cleaned = string.translate(str.maketrans('', '', '*:'))
  found = re.match("([a-zA-Z]+)([0-9]+)", cleaned)

  return '_'.join(found.group(1, 2))

def hla_name_cut(string, position = 2):
  """Extract one chain of an allele name as ``<letters>_<digits>``.

  * Names starting with ``HLA`` are split on ``-`` and the piece at index
    ``position`` is reformatted, e.g. ``'HLA-DPA10103-DPB10401'`` gives
    ``'DPA_10103'`` for ``position=1`` and ``'DPB_10401'`` for
    ``position=2``.
  * Names starting with ``DRB`` are reformatted as a whole (underscores
    removed first), but only when ``position == 1``.
  * Anything else returns ``np.nan``.

  Parameters
  ----------
  string : str
      Raw allele name.
  position : int, optional
      Index of the ``-``-separated piece to use; defaults to 2.

  Returns
  -------
  str or float
      The reformatted name, or ``np.nan`` when no rule applies.

  Raises
  ------
  IndexError
      If an ``HLA`` name has no piece at ``position``.
  AttributeError
      If the selected text does not start with letters followed by digits.
  """
  pattern = re.compile("([a-zA-Z]+)([0-9]+)")

  if string.startswith('HLA'):
    pieces = string.split('-')
    found = pattern.match(pieces[position])
    return found.group(1) + '_' + found.group(2)

  # `and` is required here: the previous `startswith(...) & position == 1`
  # was evaluated as `(startswith(...) & position) == 1`, which is also true
  # for every other odd position (3, 5, ...).
  if string.startswith('DRB') and position == 1:
    found = pattern.match(string.replace('_', ''))
    return found.group(1) + '_' + found.group(2)

  return np.nan
  
def tovec(serie, vec_size, window):
  """Train a Doc2Vec model on the items of ``serie``.

  Each item becomes one ``TaggedDocument`` whose single tag is the item's
  position in the iteration order. The items are used as the word lists
  as-is; no tokenisation happens here.

  Parameters
  ----------
  serie : iterable
      Documents to train on.
  vec_size : int
      Passed to ``Doc2Vec`` as ``vector_size``.
  window : int
      Passed to ``Doc2Vec`` as ``window``.

  Returns
  -------
  Doc2Vec
      The trained model (``min_count=1``, ``workers=12``).
  """
  tagged = []
  for tag, words in enumerate(serie):
    tagged.append(TaggedDocument(words, [tag]))

  return Doc2Vec(
    tagged,
    vector_size=vec_size,
    window=window,
    min_count=1,
    workers=12,
  )

def rename_col(data, string):
  """Return the column labels of ``data`` with ``string`` prepended.

  Labels are converted with ``str`` first, so integer labels work too.
  ``data`` itself is not modified; the caller assigns the returned list.
  """
  return [string + str(label) for label in data.columns]

def model2vec(train, exon_name):
  """Embed the two merged copies of one exon column with its Doc2Vec model.

  The model is loaded from ``./Data/<exon_name>_Doc2Vec``. The columns
  ``<exon_name>_x`` and ``<exon_name>_y`` of ``train`` are embedded in that
  order; missing values are dropped before inference, so those rows come
  back as NaN after the index-aligned concatenation.

  Returns
  -------
  pandas.DataFrame
      The ``_x`` vector components followed by the ``_y`` components, with
      column labels prefixed by ``<exon_name>_``.
  """
  exon = Doc2Vec.load('./Data/'+ exon_name +'_Doc2Vec')

  halves = []
  for suffix in ('_x', '_y'):
    tokens = train[exon_name + suffix].dropna()
    vectors = tokens.apply(lambda x : exon.infer_vector(x))
    # Expand each inferred vector into one column per component.
    halves.append(vectors.apply(pd.Series))

  k = pd.concat(halves, axis=1, sort=False)
  # NOTE(review): both halves carry the same integer labels at this point,
  # so the prefixed names repeat (one per half) - confirm downstream code
  # is fine with duplicate column names.
  k.columns = rename_col(k, exon_name+'_')

  return k

In [2]:
# Exon table: normalise the allele name and tokenise the six exon columns.
df = pd.read_csv('./Data/exon_Sheet1.csv', header=1)
df['4digit'] = df['4digit'].apply(transform_hla_name)
df = df.rename(columns={'4digit': 'hla'})
for i in range(1, 7):
  name = 'exon{}'.format(i)
  # str() first so that missing cells are tokenised too instead of failing.
  df[name] = df[name].apply(lambda x: slicing(str(x)))

# Duplicate the key under both names used as merge keys later on.
df['hla_1'] = df['hla']
df['hla_2'] = df['hla']

# Fold 1 of the binding data.
train = read_train_test('train', 1)
test = read_train_test('test', 1)

# Pre-trained peptide embedding model.
peptide = Doc2Vec.load('./Data/Doc2Vec')

In [14]:
def preprocess_train_test(train):
  """Build the feature matrix and binary labels for one data fold.

  Steps:
    1. derive the merge keys ``hla_1`` / ``hla_2`` from ``hla``;
    2. drop rows without an ``hla_1`` key and left-join the module-level
       exon table ``df`` once per key (the second join produces the
       ``_x`` / ``_y`` suffixed exon columns that ``model2vec`` reads);
    3. embed the peptide with the module-level ``peptide`` model and each of
       ``exon1`` .. ``exon6`` with ``model2vec``;
    4. label a row 1 when ``aff >= 1 - log(500) / log(50000)``, else 0.

  Parameters
  ----------
  train : pandas.DataFrame
      Frame with at least the columns ``peptide``, ``aff`` and ``hla``.
      It is not modified.

  Returns
  -------
  (pandas.DataFrame, pandas.Series)
      The concatenated feature columns and the 0/1 labels.
  """
  # Cut-off on the transformed affinity scale; computed once, not per row.
  threshold = 1 - log(500) / log(50000)

  # split DPA/DPB DQA/DQB
  # assign() returns a new frame, so the caller's DataFrame no longer gains
  # the two helper columns as a side effect.
  train = train.assign(
    hla_1=train['hla'].apply(lambda x: hla_name_cut(x, position = 1)),
    hla_2=train['hla'].apply(hla_name_cut),
  )

  # merge train with exon
  train = train[train['hla_1'].notna()]
  train = pd.merge(train, df.drop(columns = ['8digit', 'hla_2', 'hla']), on = 'hla_1', how = 'left')
  train = pd.merge(train, df.drop(columns = ['8digit', 'hla_1', 'hla']), on = 'hla_2', how = 'left')

  # NOTE(review): the peptide is passed to infer_vector as a plain string;
  # recent gensim releases may reject a bare str - confirm the expected
  # tokenisation against how the peptide model was trained.
  alpha = train['peptide'].apply(lambda x : peptide.infer_vector(x))
  data = alpha.apply(pd.Series)
  data.columns = rename_col(data, 'peptide_')

  # Same call order as before (exon1 .. exon6), without six copied lines.
  exon_frames = [model2vec(train, 'exon' + str(i)) for i in range(1, 7)]

  y = (train['aff'] >= threshold).astype('int64')
  add = pd.concat([data] + exon_frames, axis = 1, sort = False)

  return add, y

In [3]:
# Split DPA/DPB and DQA/DQB: one merge key per chain.
train['hla_1'] = train['hla'].apply(hla_name_cut, position=1)
train['hla_2'] = train['hla'].apply(hla_name_cut)

# Keep only rows with an hla_1 key, then left-join the exon table once per
# key; the second join yields the _x / _y suffixed exon columns.
train = (
  train[train['hla_1'].notna()]
  .merge(df.drop(columns=['8digit', 'hla_2', 'hla']), on='hla_1', how='left')
  .merge(df.drop(columns=['8digit', 'hla_1', 'hla']), on='hla_2', how='left')
)



In [10]:
# Peptide embedding: one inferred vector per row, expanded to columns.
alpha = train['peptide'].apply(peptide.infer_vector)
data = alpha.apply(pd.Series)
data.columns = rename_col(data, 'peptide_')

# Exon embeddings, evaluated in order exon1 .. exon6.
exon1, exon2, exon3, exon4, exon5, exon6 = (
  model2vec(train, 'exon{}'.format(n)) for n in range(1, 7)
)

# Label is 1 when the affinity reaches 1 - log(500)/log(50000), else 0.
y = train['aff'].apply(lambda aff: int(aff >= 1 - log(500) / log(50000)))

add = pd.concat(
  [data, exon1, exon2, exon3, exon4, exon5, exon6],
  axis=1,
  sort=False,
)

In [18]:
# Build the hold-out features with the same pipeline as the training set.
add_valid, y_valid = preprocess_train_test(test)

# Replace missing feature values with 0 in both matrices.
for features in (add, add_valid):
  features.fillna(value=0, inplace=True)

In [19]:
# Baseline classifier on the embedding features.
LR = LogisticRegression(C=1.0, solver = 'liblinear')
LR.fit(add, y)
# Column 1 of predict_proba is the probability of the positive class.
LR_predictions = LR.predict_proba(add_valid)[:,1]

# Score the hold-out predictions; without this the cell computes
# probabilities but never reports a metric.
LR_auc = roc_auc_score(y_valid, LR_predictions)
print('AUC:{}'.format(LR_auc))


vector:6, AUC:0.6263846753148181


In [29]:
import lightgbm as lgb

# Upper bound on boosting rounds; early stopping normally ends training first.
MAX_BOOST_ROUNDS = 100000
LEARNING_RATE = 0.05

# Booster settings: binary objective evaluated with AUC.
params = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting': 'gbdt',
    'learning_rate': LEARNING_RATE,
    'seed': 0,
}

# Wrap the feature matrices and labels for LightGBM.
d_train = lgb.Dataset(add, label=y)
d_valid = lgb.Dataset(add_valid, label=y_valid)

# Train, evaluating on both sets; stop after 200 rounds without improvement
# and log the metrics every 500 rounds.
clf = lgb.train(
    params=params,
    train_set=d_train,
    num_boost_round=MAX_BOOST_ROUNDS,
    valid_sets=[d_train, d_valid],
    early_stopping_rounds=200,
    verbose_eval=500,
)

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


Training until validation scores don't improve for 200 rounds.
[500]	training's auc: 0.825122	valid_1's auc: 0.653022
[1000]	training's auc: 0.897223	valid_1's auc: 0.653468
Early stopping, best iteration is:
[816]	training's auc: 0.874565	valid_1's auc: 0.654003
