In [1]:
import gc
import sys
import glob
import logging

import optuna
import numpy as np
import pandas as pd
from pathlib import Path
import torch

sys.path.append('../src')
import const
import factory
from utils import DataHandler, seed_everything, reduce_mem_usage
from trainer import NNTrainer

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Project I/O helper (from utils); used below to load configs/feature frames and to save the config snapshot.
dh = DataHandler()

In [3]:
# Name under which this tuning run's logs are stored.
run_name = 'optuna_tabnet'

# Experiment configuration for the TabNet model.
cfg = dh.load('../configs/exp/tabnet_001.yml')

# Prefer the GPU whenever torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [4]:
# The experiment config names a feature set; its definition lives in a separate YAML file.
feature_set_name = cfg.data.features.name
features_params = dh.load(f'../configs/feature/{feature_set_name}.yml')

# List of feature names used to select/load model inputs below.
features = features_params.features

In [5]:
# Per-run output directory: receives the training log and a snapshot of the config in use.
logger_path = Path(f'../logs/{run_name}')

# Project helper, called with the configured seed before any training happens.
seed_everything(cfg.common.seed)

# parents=True: on a fresh checkout '../logs' may not exist yet, and a plain
# mkdir(exist_ok=True) would raise FileNotFoundError in that case.
logger_path.mkdir(parents=True, exist_ok=True)
logging.basicConfig(filename=logger_path / 'train.log', level=logging.DEBUG)

# Keep a copy of the configuration next to the log for later reference.
dh.save(logger_path / 'config.yml', cfg)

In [6]:
# Load the pre-built training and validation feature frames.
train_x = dh.load('../data/team/X_tra_wo_lec_20M.feather')
val_x = dh.load('../data/team/X_val_wo_lec.feather')

# Tag each row with its origin so validation rows can be located after stacking.
train_x['is_val'], val_x['is_val'] = 0, 1

# Stack both parts into a single frame with a fresh 0..n-1 index.
train_x = pd.concat([train_x, val_x], axis=0, sort=False, ignore_index=True)
train_y = train_x[const.TARGET_COLS[0]]

# row_id values are kept to look up additional per-row features later on.
use_row_id = train_x['row_id'].values

# Index labels of the validation rows inside the stacked frame.
val_idx = train_x.index[(train_x['is_val'] == 1).to_numpy()]

# Keep only the configured features and the target columns.
drop_cols = set(train_x.columns).difference(features, const.TARGET_COLS)
train_x = train_x.drop(columns=drop_cols)

In [7]:
# Load every configured feature that is not already a column of train_x from its
# own feather file and append it to the frame.
#
# Iterate in the order given by `features` (de-duplicated) instead of over a set:
# the iteration order of a set of strings can change between interpreter sessions,
# which would make the column order of the model input differ from run to run.
present_cols = set(train_x.columns)
additional_cols = [col for col in dict.fromkeys(features) if col not in present_cols]

add_df = pd.DataFrame(index=train_x.index)
for col in additional_cols:
    feat_df = pd.read_feather(f'../features/{col}_train.feather')
    # NOTE(review): label-based lookup — assumes the feature file's index labels
    # correspond to row_id values; confirm against how the feature files are written.
    add_df[col] = feat_df.loc[use_row_id, col].values

add_df = reduce_mem_usage(add_df)
train_x = pd.concat([train_x, add_df], axis=1)

# Release the temporary frame right away; the data is large.
del add_df; gc.collect()

11

In [8]:
# Clean and standardize every feature column of train_x in place; the target
# column is left untouched.
#
# NOTE(review): mean/std are computed over all rows of train_x, which at this point
# holds the training and validation rows together — confirm this is intended.
target_col = const.TARGET_COLS[0]

for col in train_x.columns:
    if col == target_col:
        continue

    # Treat both +inf and -inf as missing. Masking only +inf would leave -inf in
    # the column and turn the mean (and the whole standardized column) into NaN/inf.
    values = train_x[col].replace([np.inf, -np.inf], np.nan)

    # Impute missing entries with the column mean (no-op when nothing is missing).
    values = values.fillna(values.mean())

    std_ = values.std()
    if std_ == 0 or np.isnan(std_):
        # Constant (or entirely missing) column: dividing by std would yield NaN,
        # so emit a neutral all-zero feature instead.
        train_x[col] = 0.0
    else:
        train_x[col] = (values - values.mean()) / std_

In [9]:
# Single hold-out split: every row starts in fold value 0 and the validation rows are set to 1.
fold_df = pd.DataFrame(index=range(len(train_x))).assign(fold_0=0)
fold_df.loc[val_idx, 'fold_0'] = 1

In [10]:
# Override the configured epoch count: every training run below uses a single epoch
# (presumably to keep each tuning trial short — confirm).
cfg.model.epochs = 1
# Optional batch-size override, currently disabled:
# cfg.data.train.loader.batch_size = 2048

In [11]:
def objective(trial):
    """Optuna objective: sample TabNet widths, train once, return the CV score.

    Samples integer values for ``n_d`` and ``n_a`` (each in [128, 256]) from
    ``trial``, writes them into the shared ``cfg.model.params``, then trains an
    ``NNTrainer`` on ``train_x`` / ``train_y`` with the folds in ``fold_df``.

    Learning rate and scheduler factor were earlier search candidates; they are
    currently not tuned.

    Args:
        trial: the Optuna trial used to suggest hyperparameter values.

    Returns:
        Whatever ``NNTrainer.train`` returns (used as the value to optimize).
    """
    # Same range for both widths; sampled in the order n_d, n_a.
    for hp_name in ('n_d', 'n_a'):
        setattr(cfg.model.params, hp_name, trial.suggest_int(hp_name, 128, 256))

    return NNTrainer(run_name, fold_df, cfg).train(train_df=train_x, target_df=train_y)

In [None]:
# Search for the hyperparameters that maximize the score returned by `objective`.
# The sampler is seeded explicitly with the experiment seed; without a seed the
# suggested values (and therefore the whole study) differ from run to run.
sampler = optuna.samplers.TPESampler(seed=cfg.common.seed)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=10)   # n_trials, timeout

[32m[I 2020-12-21 20:35:43,331][0m A new study created in memory with name: no-name-1d3221fa-e48a-43ca-8e77-d50595404c4f[0m




█

[32m[I 2020-12-21 20:45:21,018][0m Trial 0 finished with value: 0.7610108240629369 and parameters: {'n_d': 166, 'n_a': 178}. Best is trial 0 with value: 0.7610108240629369.[0m


Epoch 1 - avg_train_loss: 0.542419  avg_val_loss: 0.549337 val_score: 0.761011 time: 558s

Epoch 1 - val_score: 0.761011



CV: 0.761011







[I 2020-12-21 20:15:24,048] Trial 0 finished with value: 0.762468262919172 and parameters: {'n_d': 17, 'n_a': 109}. Best is trial 0 with value: 0.762468262919172.