In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import torch 
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold
from hyperopt import hp, fmin, tpe, Trials
from collections import OrderedDict
import csv
import joblib
import os

from moa.model import DenseNet, Model, DenseBlock
from moa.preprocess import cp_mapping, cp_filter, scaling, rankgauss, fe_stats, c_squared, fe_pca, variance_thresh

from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Directory prefix for the competition CSV files; it is concatenated directly
# with file names, so the trailing slash is required.
data_dir = './DATA/lish-moa/'
# Seed applied to NumPy's global random number generator.
SEED = 123
np.random.seed(SEED)

## Load the dataset 

In [None]:
# Read the training features, the scored training targets and the test
# features (in that order), each indexed by its 'sig_id' column.
X, y, X_sub = (
    pd.read_csv(data_dir + csv_name, index_col='sig_id')
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'test_features.csv',
    )
)

## Preprocessing

In [None]:
# # 1. preprocessing
# X1, X1_sub = cp_mapping(X, X_sub); y1=y.copy()
# X1, X1_sub = rankgauss(X1, X1_sub)
# X1, X1_sub = fe_pca(X1, X1_sub, n_components_g = 600, n_components_c = 80, SEED = 42)
# X1, X1_sub = variance_thresh(X1, X1_sub, 0.8, 3)
# X1, y1, X1_sub = cp_filter(X1, y1, X1_sub)
# X1.shape

In [None]:
# 2. preprocessing
# Each moa.preprocess helper below takes the train and submission frames and
# returns the transformed pair; cp_filter additionally takes and returns the
# targets.
y1 = y.copy()
X1, X1_sub = cp_mapping(X, X_sub)
X1, y1, X1_sub = cp_filter(X1, y1, X1_sub)

# Steps that need no extra arguments, applied in this exact order.
for transform in (fe_stats, c_squared, scaling):
    X1, X1_sub = transform(X1, X1_sub)

X1, X1_sub = fe_pca(X1, X1_sub, n_components_g=70, n_components_c=10, SEED=233)
X1, X1_sub = variance_thresh(X1, X1_sub, 0.8, 2)
X1.shape

## Training

In [None]:
n_SEED = 2  # number of differently seeded K-fold repetitions per evaluation
n_fold = 5  # number of folds in each repetition
torch.manual_seed(42)


def objective(params):
    """Score one hyperparameter set with repeated K-fold cross-validation.

    Reads the notebook globals ``X1``, ``y1``, ``hidden_size``, ``dropout``
    and ``writer``; ``writer`` must be a csv writer bound to an open file
    when this function is called.

    Parameters
    ----------
    params : mapping
        Must provide 'epoch', 'lr', 'batch_size', 'L1' and 'L2'.
        'epoch' and 'batch_size' are truncated to int before use.

    Returns
    -------
    The validation log-loss averaged over all ``n_SEED * n_fold`` folds.

    Side effects
    ------------
    Prints progress and the averaged losses, and writes one row
    ``[epoch, lr, batch_size, L1, L2, train_loss, val_loss]`` through
    ``writer``.
    """
    epoch = int(params['epoch'])
    lr = params['lr']
    batch_size = int(params['batch_size'])
    L1 = params['L1']
    L2 = params['L2']

    fold_train_losses = []
    fold_val_losses = []

    for rep in range(n_SEED):
        # A distinct shuffle seed per repetition: 65, 188, ...
        splitter = KFold(n_splits=n_fold, shuffle=True, random_state=65 + 123 * rep)

        for fold, (train_idx, val_idx) in enumerate(splitter.split(X1), start=1):
            X_train, y_train = X1.iloc[train_idx].values, y1.iloc[train_idx].values
            X_val, y_val = X1.iloc[val_idx].values, y1.iloc[val_idx].values

            # A fresh network is built for every fold.
            net = DenseNet(
                input_size=X1.shape[1],
                hidden_size=hidden_size,
                output_size=y1.shape[1],
                dropout=dropout,  # one entry longer than hidden_size
            )
            model = Model(net)
            model.fit(
                X=X_train,
                y=y_train,
                epoch=epoch,
                lr=lr,
                batch_size=batch_size,  # 64, 128
                L1=L1,
                L2=L2,
                pos_weight=1,
                patience=5,
                verbose=False,
            )

            # Flatten targets and predictions so the loss is computed over
            # every (sample, target) cell at once.
            train_pred = model.predict_proba(X_train)
            val_pred = model.predict_proba(X_val)
            fold_train_losses.append(log_loss(y_train.ravel(), train_pred.ravel()))
            fold_val_losses.append(log_loss(y_val.ravel(), val_pred.ravel()))
            print(f"SEED {rep+1}/{n_SEED}, KFOLD {fold}/{n_fold} completed.")

    train_loss_avg = np.mean(fold_train_losses)
    test_loss_avg = np.mean(fold_val_losses)
    print('Training loss : ', train_loss_avg)
    print('Validation loss : ', test_loss_avg)
    writer.writerow([epoch, lr, batch_size, L1, L2, train_loss_avg, test_loss_avg])
    return test_loss_avg

In [None]:
# hidden_size = [2048, 2048]
# dropout = [0, 0.35, 0.2]

# params = {'epoch': 125,
#           'lr': 0.0005302154778849957, # 1e-4 ~ 1e-3
#           'batch_size':85, # 64, 128, 256, 512
#           'L1': 1e-5,
#           'L2': 5e-5
# }
# filename = 'Manual.csv'
# if not os.path.exists(filename):
#     with open(filename, 'w') as file:
#         writer = csv.writer(file)
#         writer.writerow(['NN hyperparamerter tuning'])
#         writer.writerow(['hidden_size', 'dropout'])
#         writer.writerow(['epoch', 'lr', 'batch_size', 'L1', 'L2', 'Train_loss', 'Val_loss'])

# with open(filename, 'a') as file:
#     writer = csv.writer(file)
#     writer.writerow([hidden_size, dropout])
#     objective(params);

## Automatic hyperparameter tuning

In [None]:
# Network architecture shared by every trial; `objective` reads both lists as
# globals. `dropout` carries one more entry than `hidden_size`.
hidden_size = [2048, 2048]
dropout = [0.0, 0.3, 0.2]

# Hyperopt search space. 'epoch' is fixed to a single choice; 'batch_size' is
# sampled as a float and truncated to int inside `objective`.
search_space = OrderedDict([
    ('epoch', hp.choice('epoch', [125])),
    ('lr', hp.loguniform('lr', np.log(1e-4), np.log(1e-3))),
    ('batch_size', hp.uniform('batch_size', 32, 128)),
    ('L1', hp.uniform('L1', 1e-5, 8e-5)),
    ('L2', hp.uniform('L2', 1e-5, 8e-5))
])

filename = 'Trails.csv'               # CSV log of every evaluated trial
trials_path = 'hyperopt_trials.pkl'   # persisted hyperopt history (for resuming)
n_new_trials = 40                     # evaluations to add in this run

# Files handed to csv.writer are opened with newline='' as the csv module
# requires; without it, extra blank rows appear on platforms that translate
# line endings.
if not os.path.exists(filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['NN hyperparamerter tuning'])

# Section header describing the architecture used for the rows that follow.
with open(filename, 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([])
    writer.writerow(['hidden_size', 'dropout'])
    writer.writerow([hidden_size, dropout])
    writer.writerow(['epoch', 'lr', 'batch_size', 'L1', 'L2','Train_loss', 'Val_loss'])

# Resume from a previous search when its history exists; otherwise start a
# fresh one so the notebook also runs on a clean checkout.
# NOTE(security): joblib.load unpickles the file and can execute arbitrary
# code -- only load a trials file that this notebook produced.
if os.path.exists(trials_path):
    trials = joblib.load(trials_path)
else:
    trials = Trials()

for i in range(n_new_trials):
    # The log is reopened for every evaluation so each row is flushed to disk
    # when the block exits. `objective` writes through the global `writer`.
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        # Run exactly one additional evaluation on top of the stored history.
        _ = fmin(objective, search_space, trials=trials, algo=tpe.suggest,
                 max_evals=len(trials.trials) + 1)
    # Checkpoint after every evaluation so an interrupted run can be resumed.
    joblib.dump(trials, trials_path)