In [31]:
import csv
import os
import warnings
from collections import OrderedDict

import joblib
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import pandas as pd
import torch 
from hyperopt import Trials, fmin, hp, tpe
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold
from rank_gauss.gauss_rank_scaler import GaussRankScaler
from sklearn.metrics import log_loss
from tqdm import tqdm

from moa.model import DenseNet, Model, DenseBlock
from moa.preprocess import cp_mapping, cp_filter, fe_pca, fe_stats_all,variance_thresh_all

warnings.filterwarnings('ignore')

%load_ext autoreload
%autoreload 2

# Global seed for NumPy's legacy RNG; set once, before any data handling.
SEED = 123
np.random.seed(SEED)

# Directory holding the competition CSV files (trailing slash required: paths
# are built by string concatenation below).
data_dir = './DATA/lish-moa/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Load the dataset

In [4]:
# Read the three competition tables in one pass; every table is keyed by
# `sig_id`, which becomes the DataFrame index.
#   X     - training features
#   y     - scored training targets
#   X_sub - test features used for the submission
X, y, X_sub = (
    pd.read_csv(data_dir + csv_name, index_col='sig_id')
    for csv_name in ('train_features.csv',
                     'train_targets_scored.csv',
                     'test_features.csv')
)

## Preprocessing

In [9]:
# Work on a copy of the targets so the raw `y` frame stays untouched.
targets = y.copy()

# Project helpers from moa.preprocess (semantics live there, not here).
# NOTE(review): presumably they encode and filter on the cp_* columns - confirm.
train, test = cp_mapping(X, X_sub)
train, targets, test = cp_filter(train, targets, test)

# Feature selection is done on train and test stacked together, with 0.7
# passed as the threshold argument.
data_all = variance_thresh_all(pd.concat((train, test)), 0.7)
print(data_all.shape)

# Rank-Gauss scale every column from the third onwards; the first two columns
# are left as they are.
scaler = GaussRankScaler()
data_all.iloc[:, 2:] = scaler.fit_transform(data_all.iloc[:, 2:])
print(data_all.shape)

# Split back by position (train rows come first in the stacked frame), add
# PCA features, then re-stack to compute the statistical features.
train = data_all.iloc[:len(targets)]
test = data_all.iloc[len(targets):]
train, test = fe_pca(train, test, n_components_g = 80, n_components_c = 10)
data_all = fe_stats_all(pd.concat((train, test)))
print(data_all.shape)

# Final positional split into the frames used for training and submission.
train = data_all.iloc[:len(targets)]
test = data_all.iloc[len(targets):]
print(train.shape)
print(test.shape)

(25572, 839)
(25572, 839)


100%|██████████| 5/5 [00:08<00:00,  1.71s/it]


(25572, 944)
(21948, 944)
(3624, 944)


## Training

In [19]:
# Cross-validation budget read by objective(): number of differently seeded
# K-fold repetitions and number of folds per repetition.
n_SEED, n_fold = 5, 7

# Fix torch's global RNG before any model is built.
torch.manual_seed(42)
def objective(params):
    """Train a DenseNet with repeated K-fold CV and return the mean validation loss.

    Parameters
    ----------
    params : dict
        Must contain 'dropout', 'epoch', 'lr', 'batch_size' and
        'weight_decay'. 'epoch' and 'batch_size' are cast to int, so float
        samples from a search space are accepted.

    Returns
    -------
    float
        Validation log loss averaged over all n_SEED * n_fold runs.

    Notes
    -----
    Reads the notebook-level names `train`, `targets`, `hidden_size`,
    `n_SEED` and `n_fold`, and appends one result row through the
    notebook-level csv `writer`; the caller must keep that writer's file
    open while this function runs.
    """
    dropout = params['dropout']
    epoch = int(params['epoch'])
    lr = params['lr']
    batch_size = int(params['batch_size'])
    wd = params['weight_decay']

    # Collect the losses of every (seed, fold) run. These lists must live
    # outside both loops: re-creating them per fold would make the averages
    # below reflect only the very last fold.
    train_loss = []
    val_loss = []
    for i_SEED in range(n_SEED):
        # A different split seed per repetition gives different fold assignments.
        split_seed = 65 + 123*i_SEED
        kfold = MultilabelStratifiedKFold(n_splits=n_fold, shuffle=True, random_state=split_seed)
        for n, (train_idx, val_idx) in enumerate(kfold.split(train, targets)):
            X_train, X_val = train.iloc[train_idx].values, train.iloc[val_idx].values
            y_train, y_val = targets.iloc[train_idx].values, targets.iloc[val_idx].values
            params_net = {'input_size': train.shape[1],
                     'hidden_size': hidden_size, 
                     'output_size': targets.shape[1],
                     'dropout': dropout} 
            params_fit = {'X_train':X_train,'y_train': y_train,
                          'X_val':X_val,'y_val': y_val,
                          'epoch': epoch,
                          'lr': lr, 
                          'batch_size':batch_size, #
                          'weight_decay':wd,
                          'patience':10,
                          'smoothing':0.001,
                          'p_min': 0.001,
                          'scheduler': 'OneCycleLR', # ['OneCycleLR', 'ReduceLROnPlateau', 'both']
                          'verbose':True}
            net = DenseNet(**params_net)
            model = Model(net)
            model.fit(**params_fit)
            y_train_pred = model.predict_proba(X_train)
            y_val_pred = model.predict_proba(X_val)
            # Flatten so the multilabel problem is scored as one binary log loss.
            train_loss.append(log_loss(y_train.ravel(), y_train_pred.ravel()))
            val_loss.append(log_loss(y_val.ravel(), y_val_pred.ravel()))
            
#             y_sub_pred_list.append(model.predict_proba(test))
            
            print(f"SEED {i_SEED+1}/{n_SEED}, KFOLD {n+1}/{n_fold} completed.")
    train_loss_avg = np.mean(train_loss)
    val_loss_avg = np.mean(val_loss)
    print('Training loss : ', train_loss_avg)
    print('Validation loss : ', val_loss_avg)
    writer.writerow([dropout, epoch, lr, batch_size, wd, train_loss_avg, val_loss_avg])
    return val_loss_avg

In [None]:
# Single manual training run with hand-picked hyper-parameters.
# `hidden_size` is read by objective() as a notebook-level name.
hidden_size = [1024, 1024]

params = {'dropout':0.2,
          'epoch': 20,
          'lr': 0.001, 
          'batch_size':128, 
          'weight_decay':1e-5
}

# Create the log with its header rows on first use only. newline='' is what
# the csv module expects; without it blank rows appear on some platforms.
filename = 'Manual.csv'
if not os.path.exists(filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['hidden_size', 'dropout'])
        writer.writerow(['dropout', 'epoch', 'lr', 'batch_size', 'weight_decay', 'Train_loss', 'Val_loss'])

# Only consumed by the commented-out submission code.
y_sub_pred_list = []
with open(filename, 'a', newline='') as file:
    # objective() writes its result row through this notebook-level `writer`,
    # so the file must stay open for the whole call.
    writer = csv.writer(file)
    # Take dropout from `params`: no standalone `dropout` variable is defined,
    # so referencing one fails on a fresh kernel.
    writer.writerow([hidden_size, params['dropout']])
    objective(params);
    
# # submission 
# y_sub_pred = np.zeros((test.shape[0], targets.shape[1]))
# for tmp in y_sub_pred_list:
#     y_sub_pred += tmp
# y_sub_pred1 = pd.DataFrame(y_sub_pred/(n_SEED*n_fold), index=test.index, columns=y.columns)
# ctr_idx = X_sub[X_sub.cp_type=='ctl_vehicle'].index
# y_sub_pred2 = pd.DataFrame(np.zeros((len(ctr_idx), y.shape[1])).astype('float'), index=ctr_idx, columns=y.columns)

# y_sub_predfull = pd.concat((y_sub_pred1, y_sub_pred2))
# y_sub_predfull.sort_index(inplace=True)
# y_sub_predfull.to_csv('submission.csv')

In [None]:
# Hyper-parameter search with hyperopt (TPE), resumable across sessions.
# `hidden_size` is read by objective() as a notebook-level name.
hidden_size = [1024, 1024]

# 'batch_size' is sampled as a float; objective() casts it to int.
search_space=OrderedDict([
    ('dropout', hp.uniform('dropout', 0.05, 0.4)),
    ('epoch', hp.choice('epoch', [25])),
    ('lr', hp.loguniform('lr', np.log(5e-4), np.log(2e-3))),
    ('batch_size', hp.uniform('batch_size', 64, 150)),
    ('weight_decay', hp.uniform('weight_decay', 5e-6, 5e-5))
])

filename = 'Trails.csv'
trials_path = 'hyperopt_trials.pkl'

# newline='' is what the csv module expects for files handed to csv.writer.
if not os.path.exists(filename):
    with open(filename, 'w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(['NN hyperparamerter tuning'])
        
with open(filename, 'a', newline='') as file:
    writer = csv.writer(file)
    writer.writerow([])
    writer.writerow(['hidden_size'])
    writer.writerow([hidden_size])
    # Must match the 7 values objective() writes per evaluation.
    writer.writerow(['dropout', 'epoch', 'lr', 'batch_size', 'weight_decay', 'Train_loss', 'Val_loss'])

# Resume a previous search when a checkpoint exists, otherwise start fresh.
# NOTE(review): joblib.load unpickles arbitrary objects - only load a
# checkpoint that this notebook produced itself.
if os.path.exists(trials_path):
    trials = joblib.load(trials_path)
else:
    trials = Trials()

for i in range(40):
    # objective() logs through the notebook-level `writer`; reopening per
    # evaluation flushes each result row to disk as soon as it is finished.
    with open(filename, 'a', newline='') as file:
        writer = csv.writer(file)
        # max_evals is cumulative, so "+1" runs exactly one new evaluation.
        fmin(objective, search_space, trials=trials, algo=tpe.suggest, max_evals=len(trials.trials)+1)
    # Checkpoint after every evaluation so an interrupted search can resume.
    joblib.dump(trials, trials_path)