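# Training PoC

Proof of concept: fit a `TabNetRegressor` (pytorch-tabnet) on a single cross-validation fold of the training data, using the project's config, data-loading and fold-splitting helpers.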

In [1]:
import sys

# Register the parent directory and the shared inputs directory as import roots
# (presumably so the `src.*` modules imported further down can be resolved).
sys.path.extend(["..", "../../inputs"])

In [2]:
import logging

# Route log records to stdout so they show up inline in the notebook output.
# Note: `basicConfig` does nothing if the root logger already has handlers,
# so restart the kernel after changing these settings.
logging.basicConfig(
    stream=sys.stdout,
    # Use the level constant directly instead of the name -> number lookup
    # through `logging.getLevelName`, which only works for backward compatibility.
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

In [3]:
# Logger for messages emitted by the notebook itself; records propagate to the root logger's handlers.
log = logging.getLogger(__name__)

In [4]:
from omegaconf import OmegaConf

In [5]:
# Load the project's main configuration; `c` is handed to the project helpers called below.
c = OmegaConf.load("../config/main.yaml")

In [6]:
# Notebook-specific overrides applied on top of the loaded config.
c.settings.debug = True  # turn on debug mode
c.wandb.enabled = False  # disable wandb tracking for this PoC
# Relative locations — presumably resolved against the notebook's working directory; TODO confirm.
c.settings.dirs.working = ".."
c.settings.dirs.input = "../../inputs/"

In [7]:
# Dump the full config as YAML into the log so the settings of this run are recorded with the output.
log.info(OmegaConf.to_yaml(c))

2022-02-08 22:33:43,696 [INFO] [3244290467] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: false
  entity: imokuri
  project: ump
  dir: ${hydra:runtime.cwd}/../cache
  group: default
settings:
  print_freq: 100
  gpus: 6,7
  dirs:
    working: ..
    input: ../../inputs/
    feature: ${settings.dirs.input}features/
    preprocess: ${settings.dirs.input}preprocess/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: true
  n_debug_data: 100000
  amp: true
  multi_gpu: true
  training_method: nn
params:
  seed: 440
  n_class: 1
  preprocess: false
  n_fold: 5
  skip_training: false
  epoch: 20
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: simple_cpcv
  group_name: investment_id
  time_name: time_id
  label_name: target
  use_feature: true
  feature_

## Main

In [8]:
import os
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch.cuda.amp as amp
from pytorch_tabnet.tab_model import TabNetRegressor
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR, CosineAnnealingWarmRestarts

In [9]:
import src.utils as utils
from src.get_score import TNPearson
from src.load_data import InputData
from src.make_dataset import make_dataset_general
from src.make_fold import train_test_split
from src.make_loss import RMSELoss, make_criterion, make_optimizer, make_scheduler
from src.make_model import make_model
from src.run_loop import EarlyStopping
from src.time_series_api import TimeSeriesAPI
from src.utils import AverageMeter

In [10]:
# Apply the project's debug-mode adjustments (the helper logs "Enable debug mode." when active).
# What exactly it changes lives in `src.utils` and is not visible from this notebook.
utils.debug_settings(c)

2022-02-08 22:33:45,364 [INFO] [utils] Enable debug mode.


In [11]:
# Set up experiment tracking. `c.wandb.enabled` is False above, so presumably no run is
# actually started — TODO confirm what `setup_wandb` returns in that case.
run = utils.setup_wandb(c)

In [12]:
# Fix the random seed through the project helper (it logs the seed it applies).
utils.fix_seed(c.params.seed)

2022-02-08 22:33:45,374 [INFO] [utils] Fix seed: 440


In [13]:
# Configure GPU usage from the config and obtain the device handle; per its log output the
# helper sets CUDA_VISIBLE_DEVICES and reports the torch device and device count.
device = utils.gpu_settings(c)

2022-02-08 22:33:45,380 [INFO] [utils] CUDA_VISIBLE_DEVICES: 6,7
2022-02-08 22:33:45,416 [INFO] [utils] torch device: cuda, device count: 2


In [14]:
# Load the input tables via the project loader (the log shows it reading feather files).
# NOTE(review): the name `input` shadows Python's built-in `input()` for the rest of the
# notebook. Later cells read `input.train`, so a rename has to be applied to all of them at once.
input = InputData(c)

2022-02-08 22:33:45,420 [INFO] [load_data] Load feather file. path: ../../inputs/train.f
2022-02-08 22:34:10,625 [INFO] [utils] Mem. usage decreased to 366Mb: 49% reduction
2022-02-08 22:34:10,659 [INFO] [load_data] Load feather file. path: ../../inputs/example_test.f
2022-02-08 22:34:10,870 [INFO] [utils] Mem. usage decreased to 0.0Mb: 49% reduction
2022-02-08 22:34:10,871 [INFO] [load_data] Load feather file. path: ../../inputs/example_sample_submission.f
2022-02-08 22:34:10,875 [INFO] [utils] Mem. usage decreased to 0.0Mb: 34% reduction


In [15]:
# Summarize the training frame: number of rows and columns, dtypes and memory usage.
input.train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3141410 entries, 0 to 3141409
Columns: 306 entries, row_id to time_fold
dtypes: float32(303), int16(2), object(1)
memory usage: 3.6+ GB


In [16]:
# Containers intended to collect results across folds.
callbacks: list = []  # currently unused
evaluation_results: dict = {}  # metric history container; not populated in this notebook
losses = utils.AverageMeter()  # running loss meter
oof_df = pd.DataFrame()  # container for out-of-fold predictions

In [17]:
# Proof-of-concept fold loop: fits a TabNet regressor on the first fold only.
for fold in range(c.params.n_fold):
    log.info(f"========== fold {fold} training ==========")
    fold_seed = c.params.seed + fold
    utils.fix_seed(fold_seed)

    ####################################################
    # Inlined stand-in for: _oof_df, score, loss = train_fold(c, input.train, fold, device)
    df = input.train

    train_folds, valid_folds = train_test_split(c, df, fold)
    train_ds, train_labels, valid_ds, valid_labels = make_dataset_general(c, train_folds, valid_folds)

    clf = TabNetRegressor(
        n_d=16,
        n_a=16,
        n_steps=2,
        gamma=1.4,
        n_independent=6,
        n_shared=3,
        lambda_sparse=0,
        optimizer_fn=Adam,
        optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
        mask_type="entmax",
        scheduler_fn=CosineAnnealingWarmRestarts,
        scheduler_params=dict(T_0=200, T_mult=1, eta_min=1e-4, last_epoch=-1),
        # Use the per-fold seed so TabNet's own seeding agrees with `utils.fix_seed` above
        # (identical to the base seed for fold 0).
        seed=fold_seed,
        verbose=10,
    )
    clf.fit(
        train_ds,
        # Targets are passed as 2-D column vectors of shape (n_samples, 1).
        train_labels.reshape(-1, 1),
        eval_set=[(valid_ds, valid_labels.reshape(-1, 1))],
        eval_name=["valid"],
        eval_metric=["rmse"],
        max_epochs=200,
        patience=15,
        # loss_fn=RMSELoss(),
        batch_size=1024 * 10,
        virtual_batch_size=128 * 10,
        num_workers=4,
        drop_last=True,
        # callbacks=[],
    )
    ####################################################

    # PoC: stop after the first fold.
    # TODO: derive the fold's out-of-fold frame and loss from `clf`, aggregate them into
    # `oof_df` / `losses`, and record the per-fold result before iterating over the
    # remaining folds. Nothing in this notebook produces those values yet.
    break

2022-02-08 22:34:10,895 [INFO] [utils] Fix seed: 440
2022-02-08 22:34:11,831 [INFO] [make_fold] Num of training data: 1676930, num of validation data: 1047138
Device used : cuda
epoch 0  | loss: 0.91369 | valid_rmse: 0.9150800108909607|  0:00:20s
epoch 10 | loss: 0.82273 | valid_rmse: 0.9086999893188477|  0:03:34s
epoch 20 | loss: 0.81737 | valid_rmse: 0.9083099961280823|  0:06:54s


KeyboardInterrupt: 

In [None]:
# Presumably a deliberate stop: a bare `raise` with no active exception raises RuntimeError,
# which halts "Run All" here, before the unexecuted scratch cells that follow.
raise

In [None]:
# Best value of the early-stopping metric (validation RMSE) reached while fitting `clf`.
# The name `booster` used here before is never defined in this notebook; the fitted model is `clf`.
clf.best_cost

In [None]:
# Training curves and feature importances of the TabNet model `clf` fitted above.
fig, axs = plt.subplots(1, 2, figsize=[15, 4])

# Left panel: per-epoch history recorded by TabNet during `clf.fit`.
# "loss" is the training objective; "valid_rmse" is the RMSE on the eval set named "valid".
axs[0].plot(clf.history["loss"], label="train loss")
axs[0].plot(clf.history["valid_rmse"], label="valid RMSE")
axs[0].set_ylabel("Loss / RMSE")
axs[0].set_xlabel("Epoch")
axs[0].set_title("Training performance")
axs[0].legend()

# Right panel: the ten largest feature importances.
# Features are labelled by their column position in `train_ds`, because the column names
# used by `make_dataset_general` are not visible from this notebook.
importance_values = np.asarray(clf.feature_importances_)
importances = pd.DataFrame(
    {
        "features": [f"col_{idx}" for idx in range(len(importance_values))],
        "importance": importance_values,
    }
).sort_values("importance", ascending=False)[:10]
axs[1].bar(x=np.arange(len(importances)), height=importances["importance"])
axs[1].set_xticks(np.arange(len(importances)))
axs[1].set_xticklabels(importances["features"], rotation=45)
axs[1].set_ylabel("Feature importance")
axs[1].set_title("Feature importance")

plt.show()

In [None]:
# Show the importance table as a frame, ordered from most to least important.
importances.sort_values("importance", ascending=False)

In [None]:
# Epoch at which the early-stopping metric was best while fitting `clf`.
# The name `booster` used here before is never defined in this notebook; the fitted model is `clf`.
clf.best_epoch

In [None]:
# Out-of-fold predictions for the validation split, from the fitted TabNet model.
# `valid_ds` is the same feature matrix that was passed to `clf.fit` as the eval set.
# No iteration argument is needed: pytorch-tabnet's early stopping is expected to restore
# the best-epoch weights at the end of `fit` — TODO confirm for the installed version.
# The (n_samples, 1) output is flattened to a 1-D vector, one prediction per validation row.
oof = clf.predict(valid_ds).reshape(-1)

In [None]:
# Inspect the out-of-fold predictions.
oof

In [None]:
# Number of out-of-fold predictions.
len(oof)