In [35]:
# %load MHCII_Preprocessing.py
#!/usr/bin/env python

# In[12]:


import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import pickle, json
from time import time
from math import sqrt, log
import json
import HLA_TFIDF_preprocess

import warnings
warnings.filterwarnings("ignore")

def read_train_test(typ, N_FOLD):
  df = pd.read_csv('./Data/MHCPanII/{}{}.txt'.format(typ,N_FOLD), sep="\t", header=None)
  df.columns = ['peptide', 'aff', 'hla']
  
  return df

def load_data(typ, N_FOLD):
  df = read_train_test(typ, N_FOLD)
  hla_to_drop = ['HLA-DQA10102-DQB10604','HLA-DQA10201-DQB10201','HLA-DQA10302-DQB10401',
                'DRB1_0302', 'DRB1_0411', 'DRB1_0804','DRB5_0102', 'H-2-IAq', 'HLA-DQA10103-DQB10302',
                'HLA-DQA10302-DQB10303', 'HLA-DQA10505-DQB10301', 'DRB1_1402', 'HLA-DQA10301-DQB10201',
                'DRB1_1503']
  df = df[~df['hla'].isin(hla_to_drop)]
  if typ == 'train':
    df = df.groupby('hla').filter(lambda x : len(x)>=20).reset_index(drop = True)

  print('Load_data...done')
  print("{},{}".format(df.index.max(), df.shape))
  
  return df


def tear(serie, n, col):
  clus_df = pd.DataFrame(serie.copy())
  for i in range(1,n+1):
    col_name = col + '_P' + str(i)
    clus_df[col_name] = clus_df[col].apply(lambda x : x[i-1])

  cols = [c for c in clus_df.columns if c not in [col]]

  return clus_df[cols]

def hla_preprocess(df):
  df['allele_type'] = df['hla'].apply(
  lambda x: (x.startswith('DRB') and x[0:3]) or (x.startswith('HLA-DQ') and x[0:6]) or (x.startswith('HLA-DP') and x[0:6]) or x[0:3])

  hla_encoder = LabelEncoder()
  #hla_encoder.fit(df['hla'])
  #np.save('Data/hla_encoder_classes.npy', hla_encoder.classes_)
  hla_encoder.classes_ = np.load('./Data/hla_encoder_classes.npy')
  df['hla_str'] = df['hla']
  df['hla'] = hla_encoder.transform(df['hla'])

  allele_type_encoder = LabelEncoder()
  #allele_type_encoder.fit(df['allele_type'])
  #np.save('Data/allele_type_encoder_classes.npy', allele_type_encoder.classes_)
  allele_type_encoder.classes_ = np.load('./Data/allele_type_encoder_classes.npy')
  df['allele_type'] = allele_type_encoder.transform(df['allele_type'])

  print('HLA_encode...done')
  return df

def multiply(df):
  n_iteration = np.int32(df['length'] - 9 +1)
  df_result = df.loc[np.repeat(df.index.values,n_iteration)]
  df_result.reset_index(inplace = True)
  df_result.rename(columns = {'index':'true_index'}, inplace = True)

  return df_result

def L_impute_space(s):
  k = 3 - len(s)
  return ' '* k + s

def R_impute_space(s):
  k = 3 - len(s)
  return s + ' '* k
    
def window_select(k):
  #tqdm.pandas()
  k['true_index'] += 1
  k['n_label'] = k[['true_index', 'peptide','hla']].groupby(['peptide','hla']).agg('cumsum')
  k['n_label'] /= k['true_index']
  k['n_label'] = k['n_label'].map(int) - 1

  print('String Distinguishing...')
  k['core'] = k[['peptide', 'n_label']].apply(lambda x : x['peptide'][x['n_label']:x['n_label'] + 9], axis=1)
  k['LPFR'] = k[['peptide', 'n_label']].apply(lambda x : x['peptide'][max(x['n_label'] - 3, 0): x['n_label']], axis=1)
  k['RPFR'] = k[['peptide', 'n_label','length']].apply(lambda x : x['peptide'][x['n_label'] + 9 : min(x['n_label'] + 9 + 3, x['length'])], axis=1)
  print('Done String Distinguishing...')

  print('String Transforming...')
  k['len_LPFR'] = k['LPFR'].apply(lambda x: len(x)/3)
  k['len_RPFR'] = k['RPFR'].apply(lambda x: len(x)/3)
  k['len_peptide_transform'] = k['length'].apply(lambda x : 1/(1+np.exp((x - 15)/2)))

  k['LPFR'] = k['LPFR'].apply(lambda x:L_impute_space(x))
  k['RPFR'] = k['RPFR'].apply(lambda x:L_impute_space(x))
  
  print('window_select...done')
  k = k.drop(k[k['core'].map(len)<9].index)
  
  return k

def labelenc(df, enc):
  enc = LabelEncoder()
  enc.classes_ = np.load('./Data/tfidf_label.npy')
  for c in df.columns:
    df[c] = enc.transform(df[c])

  return df

def tfidf(df, col, json):
  with open("./Data/tfidf.pickle", "rb") as input_file:
    vectorizer = pickle.load(input_file)
  X = np.array(vectorizer.transform(df[col]).toarray(), dtype=np.float16)
  #pickle.dump(vectorizer.fit(df[col]), open("./Data/tfidf.pickle", "wb"))
  for i in range(len(vectorizer.get_feature_names())):
    name = str(col) + '_' + str(vectorizer.get_feature_names()[i]) + 'tfidf'
    df[name] = X[:, i]
    json['core_tfidf'].append(name)

  print('TFIDF...done')
  return df, json

def countvec(df, col, json):
  with open("./Data/count_vectorizer.pickle", "rb") as input_file:
    vectorizer = pickle.load(input_file)
  X = np.array(vectorizer.fit_transform(list(df['core'])).toarray(), dtype=np.float16)

  for i in range(len(vectorizer.get_feature_names())):
    name = str(col) + '_' + str(vectorizer.get_feature_names()[i]) + '_count'
    df[name] = X[:, i]
    json['core_count'].append(name)
  print('countvec...done')
  return df, json

def cluster(serie):
  k = serie.apply(lambda x: " ".join(x))
  dataset = k


  #labels = dataset.target
  true_k = int(sqrt(k.nunique()))

  print("Extracting features from the training dataset "
        "using a sparse vectorizer")
  t0 = time()

  with open("./Data/cluster_tfidf.pickle", "rb") as input_file:
    vectorizer = pickle.load(input_file)

  X = vectorizer.transform(dataset)

  print("done in %fs" % (time() - t0))
  print("n_samples: %d, n_features: %d" % X.shape)
  print()

  # #############################################################################
  # Do the actual clustering
  with open("./Data/Kmeans_cluster.pickle", "rb") as input_file:
    km = pickle.load(input_file)

  print("Clustering sparse data with %s" % km)
  t0 = time()
  #km.fit(X)
  print("done in %0.3fs" % (time() - t0))

  output = km.predict(X)
  print('Clustering...done')
  return output

def mean_encoding(df):
  groupby_list = ['hla', 'allele_type', 'core_P1', 'core_P2', 'core_P3', 'core_P4',
                  'core_P5', 'core_P6', 'core_P7', 'core_P8', 'core_P9', 'LPFR_P1',
                  'LPFR_P2', 'LPFR_P3', 'RPFR_P1', 'RPFR_P2', 'RPFR_P3', 'cluster',]

  for cat in groupby_list:
    tmp = df.groupby(cat).agg({'aff':['mean', 'std']})

    tmp.columns = [cat + '_' + s for s in ['mean', 'std']]
    tmp[cat] = tmp.index
    df = pd.merge(df, tmp, on = cat, how = 'left')
  print('Mean_encoding...done')
  return df

def blosum(serie):
  with open('Data/blosum62.json') as json_data:
    d = json.load(json_data)
  output = {
      'p1':[],
      'p2':[],
      'p3':[],
      'p4':[],
      'p5':[],
      'p6':[],
      'p7':[],
      'p8':[],
      'p9':[],
  }

  for string in serie:
    cnt = 1
    for s in string:
      tmp = 0
      for k in string:
        tmp += d[s][k]
      loc = 'p'+str(cnt)
      cnt += 1
      output[loc].append(tmp)
          
  return output

def start(typ, N_FOLD):
  groupby_list = ['hla', 'allele_type', 'core_P1', 'core_P2', 'core_P3', 'core_P4',
                  'core_P5', 'core_P6', 'core_P7', 'core_P8', 'core_P9', 'LPFR_P1',
                  'LPFR_P2', 'LPFR_P3', 'RPFR_P1', 'RPFR_P2', 'RPFR_P3', 'cluster',]
  feature_engineered = {
    'hla':['hla', 'allele_type'],
    'length':['length'],
    'len_PFR':['len_LPFR', 'len_RPFR', 'len_peptide_transform'],
    'core':['core_P1', 'core_P2', 'core_P3', 'core_P4', 'core_P5', 'core_P6', 'core_P7', 'core_P8', 'core_P9'],
    'PFR':['LPFR_P1', 'LPFR_P2', 'LPFR_P3', 'RPFR_P1', 'RPFR_P2', 'RPFR_P3'],
    'core_tfidf':[],
    'core_count':[],
    'cluster':['cluster'],
    'unique':['peptide_nunique','unique_rate'],
    'blosum_sum':['blosum_sum']
    
  }
  feature_engineered['cat_mean'] = [c + '_mean' for c in groupby_list]
  feature_engineered['cat_std'] = [c + '_std' for c in groupby_list]
  
  df = load_data(typ, N_FOLD)
  hla_tfidf = HLA_TFIDF_preprocess.start(df)
  df = pd.concat([df, hla_tfidf], axis = 1)
  
  df = hla_preprocess(df)
  df['length'] = df['peptide'].apply(lambda x: len(x))
  df = multiply(df)
  df = window_select(df)
  print(df.columns)
  
  for col in ['core','LPFR', 'RPFR']:
    df = pd.concat([df, labelenc(tear(df[col], df[col].map(len).max(), col), LabelEncoder().fit(tear(df[col], df[col].map(len).max(), col).values.flatten()))], axis=1)
  df, feature_engineered = tfidf(df, 'core', feature_engineered)
  df, feature_engineered = countvec(df, 'core', feature_engineered)
  df['cluster'] = cluster(df['core'])
  df = mean_encoding(df)
  df['peptide_nunique'] = df['peptide'].apply(lambda x: len(''.join(set(x))))
  df['unique_rate'] = df['peptide_nunique']/ df['length']
  
  df.reset_index(inplace = True)
  alpha = blosum(df['core'])
  feature_engineered['blosum'] = list(alpha.keys())
  df = pd.concat([df,pd.DataFrame(alpha)], axis = 1)
  df['blosum_sum'] = df['p1'] + df['p2'] + df['p3'] + df['p4'] + df['p5'] + df['p6'] + df['p7'] + df['p8'] + df['p9']

  return df, feature_engineered

In [31]:
df = load_data('test', 1)


Load_data...done
27331,(27287, 3)


In [34]:
df.iloc[18126]

peptide    GSHEVNGTWMIHTLE
aff               0.493461
hla              DRB1_0901
Name: 18166, dtype: object

In [29]:
df[df['hla'].isna()]

Unnamed: 0,peptide,aff,hla,hla_1,hla_2,HLA_aa_tfidf,HLA_aaa_tfidf,HLA_aac_tfidf,HLA_aag_tfidf,HLA_aat_tfidf,...,HLA_tct_tfidf,HLA_tg_tfidf,HLA_tga_tfidf,HLA_tgc_tfidf,HLA_tgg_tfidf,HLA_tgt_tfidf,HLA_tta_tfidf,HLA_ttc_tfidf,HLA_ttg_tfidf,HLA_ttt_tfidf
5221,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5222,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5223,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5224,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5225,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5226,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5227,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5228,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5229,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929
5230,,,,,,0.0,0.086121,0.166016,0.127686,0.051086,...,0.140503,0.0,0.063843,0.063843,0.140503,0.07666,0.012772,0.160034,0.127686,0.114929


In [25]:
c = df.dropna()

In [26]:
c.shape

(10084, 82)