In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import seaborn as sns
from hyperopt import hp, fmin, tpe, Trials
from collections import OrderedDict
import csv
from moa.model import DenseNet, Model, DenseBlock
import joblib
import os

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Directory the train/test CSV files are read from (relative to the notebook).
data_dir = './DATA/lish-moa/'
# Seed applied to both NumPy's and PyTorch's global RNGs below.
SEED = 123
np.random.seed(SEED)
torch.manual_seed(SEED)

## Load the dataset 

In [None]:
# Training features, scored training targets and submission (test) features,
# all indexed by `sig_id`.
X = pd.read_csv(data_dir+'train_features.csv', index_col='sig_id')
y = pd.read_csv(data_dir+'train_targets_scored.csv', index_col='sig_id')
X_sub = pd.read_csv(data_dir+'test_features.csv', index_col='sig_id')

# Binary (0/1) encoding for cp_type and cp_dose.
# The encoded column is assigned back explicitly instead of calling
# `.replace(..., inplace=True)` on `frame[col]`: that form mutates a temporary
# column selection and is not guaranteed to update the parent frame (it does
# not under pandas Copy-on-Write).
CP_TYPE_CODES = {'trt_cp': 1., 'ctl_vehicle': 0.}
CP_DOSE_CODES = {'D1': 1., 'D2': 0.}
for frame in (X, X_sub):
    frame['cp_type'] = frame['cp_type'].map(CP_TYPE_CODES)
    frame['cp_dose'] = frame['cp_dose'].map(CP_DOSE_CODES)
    # map() turns any category missing from the dicts into NaN; fail loudly
    # here rather than feeding NaNs into the preprocessing below.
    assert frame[['cp_type', 'cp_dose']].notna().all().all(), 'unexpected cp_type/cp_dose category'

## Preprocessing

In [None]:
# Work on copies so the raw frames loaded above stay untouched.
X1 = X.copy()
X1_sub = X_sub.copy()

# Rank Gauss: map every gene ('g-') and cell ('c-') feature onto a normal
# distribution. Each column gets its own transformer, fitted on the training
# column only and then applied to both the training and the submission column.
GENES = [col for col in X.columns if col.startswith('g-')]
CELLS = [col for col in X.columns if col.startswith('c-')]
for col in (GENES + CELLS):
    transformer = QuantileTransformer(n_quantiles=100, random_state=0, output_distribution='normal')
    # scikit-learn expects 2-D input, hence the single-column reshape.
    raw_vec = X[col].values.reshape(-1, 1)
    raw_vec_sub = X_sub[col].values.reshape(-1, 1)
    transformer.fit(raw_vec)
    X1[col] = transformer.transform(raw_vec).ravel()
    X1_sub[col] = transformer.transform(raw_vec_sub).ravel()

# PCA : GENES
# NOTE(review): the PCA is fitted on train + submission rows together, so the
# projection has seen the submission features ("data leakage?" in the original).
n_comp = 600
pca = PCA(n_components=n_comp, random_state = 42)
data = pd.concat([X1[GENES], X1_sub[GENES]])
data2 = pca.fit_transform(data)
# The first X1.shape[0] rows of the projection belong to the training set,
# the remainder to the submission set.
X_pca_GENES = pd.DataFrame(data2[:X1.shape[0]], columns=[f"pca_g-{i}" for i in range(n_comp)], index=X1.index)
X_pca_sub_GENES = pd.DataFrame(data2[X1.shape[0]:], columns=[f"pca_g-{i}" for i in range(n_comp)], index=X1_sub.index)
X1 = pd.concat((X1, X_pca_GENES), axis=1)
X1_sub = pd.concat((X1_sub, X_pca_sub_GENES), axis=1)

# Disabled alternative: keep only the first three columns plus the PCA components.
# X_tmp = pd.concat((X1.iloc[:,:3], X_pca_GENES), axis=1)
# X_sub_tmp = pd.concat((X1_sub.iloc[:,:3], X_pca_sub_GENES), axis=1)

# PCA : CELLS (same procedure as for the gene features, also fitted on train + submission)
n_comp = 80
pca = PCA(n_components=n_comp, random_state = 42)
data = pd.concat([X1[CELLS], X1_sub[CELLS]])
data2 = pca.fit_transform(data)
X_pca_CELLS = pd.DataFrame(data2[:X1.shape[0]], columns=[f"pca_c-{i}" for i in range(n_comp)], index=X1.index)
X_pca_sub_CELLS = pd.DataFrame(data2[X1.shape[0]:], columns=[f"pca_c-{i}" for i in range(n_comp)], index=X1_sub.index)
X1 = pd.concat((X1, X_pca_CELLS), axis=1)
X1_sub = pd.concat((X1_sub, X_pca_sub_CELLS), axis=1)

# Disabled alternative (continuation of the one above).
# X1 = pd.concat((X_tmp, X_pca_CELLS), axis=1)
# X1_sub = pd.concat((X1_sub_tmp, X_pca_sub_CELLS), axis=1)

# Feature selection with a variance threshold, fitted on train + submission rows.
var_thresh = VarianceThreshold(0.8)      # 0.8, 0.38
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
data = pd.concat([X1, X1_sub])
# The first three columns are excluded from the selection and re-attached
# below (presumably cp_type, cp_time, cp_dose -- verify against the CSV).
data_transformed = var_thresh.fit_transform(data.iloc[:,3:])
# The selected features come back as a bare array, so these frames carry
# integer column labels rather than the original feature names.
X_transformed = pd.DataFrame(data_transformed[:X.shape[0]], index=X.index)
X_sub_transformed = pd.DataFrame(data_transformed[X.shape[0]:], index=X_sub.index)

X1 = pd.concat((X1.iloc[:,:3], X_transformed), axis=1)
X1_sub = pd.concat((X1_sub.iloc[:,:3], X_sub_transformed), axis=1)

# Only keep rows with cp_type == 1, then drop the now-constant column.
X1 = X1[X1.cp_type == 1].drop('cp_type', axis=1)
X1_sub = X1_sub[X1_sub.cp_type == 1].drop('cp_type', axis=1)
# Align the targets with the retained training rows.
y1 = y.loc[X1.index]

In [None]:
# Show the (rows, columns) shape of the preprocessed training features.
X1.shape

## Training

In [None]:
n_SEED = 2   # number of differently-seeded K-fold repetitions
n_fold = 5   # folds per repetition
torch.manual_seed(42)
def objective(params):
    """Cross-validated log loss of a DenseNet for one hyperparameter setting.

    For each of ``n_SEED`` shuffles, ``X1``/``y1`` are split into ``n_fold``
    folds; a fresh ``DenseNet`` wrapped in ``Model`` is fitted on the training
    part and scored with ``log_loss`` on both parts. The averaged losses are
    printed and appended as one row to the module-level csv ``writer``.

    Besides ``params`` this reads the notebook-level names ``X1``, ``y1``,
    ``hidden_size``, ``dropout`` and ``writer``, which must be defined before
    the function is called.

    Parameters
    ----------
    params : dict
        Must contain the keys 'epoch', 'lr', 'batch_size', 'L1' and 'L2'.

    Returns
    -------
    float
        Mean validation log loss over all seeds and folds.
    """
    epoch, lr, L1, L2 = params['epoch'], params['lr'], params['L1'], params['L2']
    # A search space built with hp.uniform delivers batch_size as a float;
    # truncate it so a whole number is passed on and logged (assumes Model.fit
    # expects an integer batch size -- TODO confirm against moa.model).
    batch_size = int(params['batch_size'])
    train_loss = []
    test_loss = []
    # `seed` only controls the K-fold shuffling; a separate name avoids
    # shadowing the notebook-level SEED constant.
    for seed in range(n_SEED):
        kfold = KFold(n_splits=n_fold, shuffle=True, random_state=seed)
        for n, (train_idx, test_idx) in enumerate(kfold.split(X1)):
            X_train = X1.iloc[train_idx].values
            X_test = X1.iloc[test_idx].values
            y_train = y1.iloc[train_idx].values
            y_test = y1.iloc[test_idx].values

            params_net = {'input_size': X1.shape[1],
                     'hidden_size': hidden_size,
                     'output_size': y1.shape[1],
                     'dropout': dropout} # one element longer than hidden_size
            params_fit = {'X':X_train,
                         'y': y_train,
                         'epoch': epoch,
                         'lr': lr,
                         'batch_size':batch_size, # 64, 128
                         'L1': L1,
                         'L2': L2,
                         'pos_weight':1,
                         'patience':5,
                         'verbose':False}
            net = DenseNet(**params_net)
            model = Model(net)
            model.fit(**params_fit)
            y_train_pred = model.predict_proba(X_train)
            y_test_pred = model.predict_proba(X_test)
            # Flatten targets and predictions so log_loss treats every
            # (sample, target) cell as one binary prediction.
            train_loss.append(log_loss(y_train.ravel(), y_train_pred.ravel()))
            test_loss.append(log_loss(y_test.ravel(), y_test_pred.ravel()))
            print(f"SEED {seed+1}/{n_SEED}, KFOLD {n+1}/{n_fold} completed.")
    train_loss_avg = np.average(np.array(train_loss))
    test_loss_avg = np.average(np.array(test_loss))
    print('Training loss : ', train_loss_avg)
    print('Validation loss : ', test_loss_avg)
    writer.writerow([epoch, lr, batch_size, L1, L2, train_loss_avg, test_loss_avg])
    return test_loss_avg

In [None]:
# Disabled manual run: evaluates one hand-picked hyperparameter set with
# objective() and appends the result to 'Manual.csv'. Uncomment to use.
# hidden_size = [2048, 2048]
# dropout = [0.0, 0.4, 0.2]

# params = {'epoch':100,
#           'lr': 5e-4,
#           'batch_size':64,
#           'L1':1e-5,
#           'L2':5e-5
# }
# filename = 'Manual.csv'
# if not os.path.exists(filename):
#     with open(filename, 'w') as file:
#         writer = csv.writer(file)
#         writer.writerow(['NN hyperparamerter tuning'])
#         writer.writerow(['hidden_size', 'dropout'])
#         writer.writerow(['epoch', 'lr', 'batch_size', 'L1', 'L2', 'Train_loss', 'Val_loss'])
        
# with open(filename, 'a') as file:
#     writer = csv.writer(file)
#     writer.writerow([hidden_size, dropout])
#     objective(params);

## Automatic hyperparameter tuning

In [None]:
# Network layout shared by every trial; objective() reads both names from the
# notebook namespace.
hidden_size = [2048, 2048]
dropout = [0.0, 0.35, 0.2]

# Search space handed to hyperopt. 'epoch' is fixed at a single value.
# hp.uniform draws a continuous float, so the sampled batch size is generally
# not a whole number.
search_space=OrderedDict([
    ('epoch', hp.choice('epoch', [125])),
    ('lr', hp.loguniform('lr', np.log(1e-4), np.log(1e-3))),
    ('batch_size', hp.uniform('batch_size', 32, 128)),
    ('L1', hp.uniform('L1', 1e-5, 8e-5)),
    ('L2', hp.uniform('L2', 1e-5, 8e-5))
])

# Results log: created with a title row on first use, appended to afterwards.
# The files are opened with newline='' as the csv module requires for writer
# objects; without it extra blank rows can appear on some platforms.
filename = 'Trails.csv'
if not os.path.exists(filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['NN hyperparamerter tuning'])

with open(filename, 'a', newline='') as file:
    # objective() writes one row per trial through this module-level `writer`,
    # so the search has to run while the file is still open.
    writer = csv.writer(file)
    writer.writerow([])
    writer.writerow(['hidden_size', 'dropout'])
    writer.writerow([hidden_size, dropout])
    writer.writerow(['epoch', 'lr', 'batch_size', 'L1', 'L2','Train_loss', 'Val_loss'])
    trials = Trials()
    # To resume a previous search, load the saved trials instead:
#     trials = joblib.load('hyperopt_trials.pkl')
    _ = fmin(objective, search_space, trials=trials, algo=tpe.suggest, max_evals=80)
    # Persist the trial history once the search has finished.
    joblib.dump(trials, 'hyperopt_trials.pkl')