# Cross Validation

In [1]:
import sys

# Put the parent directory and the shared inputs directory on the import
# path, in that order.
sys.path.extend(["..", "../../inputs"])

In [2]:
import logging

# Send log records of level INFO and above to stdout so they appear in the
# cell output, tagged with timestamp, level and originating module.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
    level=logging.INFO,
    stream=sys.stdout,
)

# Logger used by this notebook's own cells.
log = logging.getLogger(__name__)

In [3]:
from omegaconf import OmegaConf

# Load the project configuration, then apply the notebook-specific overrides.
c = OmegaConf.load("../config/main.yaml")

# Paths, expressed relative to this notebook's directory.
c.settings.dirs.working = ".."
c.settings.dirs.input = "../../inputs/"

# Run on the full data set.
c.settings.debug = False

# Weights & Biases settings.
# NOTE(review): the saved output of the config dump further down still shows
# `wandb.enabled: true`; the saved outputs may predate this override, so
# re-run the notebook to confirm.
c.wandb.dir = "../../cache/"
c.wandb.group = "LB"
c.wandb.enabled = False

# Root directory holding the per-run training output directories.
pretraind_dir = "../../datasets/trainings"

In [4]:
# YAML list of the pretrained runs to evaluate: one entry per fold directory
# (`dir`) together with the model name (`model`). Everything inside the
# triple-quoted string is runtime YAML text, not Python.
pretrained = f"""
- dir: {pretraind_dir}/2022-03-04_07-58-24/fold1/ 
  model: ump_lstm
- dir: {pretraind_dir}/2022-03-04_07-58-30/fold2/ 
  model: ump_lstm
- dir: {pretraind_dir}/2022-03-04_07-59-03/fold3/ 
  model: ump_lstm
- dir: {pretraind_dir}/2022-03-04_07-59-08/fold4/ 
  model: ump_lstm
- dir: {pretraind_dir}/2022-03-04_07-59-14/fold5/ 
  model: ump_lstm
- dir: {pretraind_dir}/2022-03-04_13-14-39/fold0/ 
  model: ump_lstm

"""

# NOTE(review): `_pretrained` is not referenced in any visible cell. It is
# presumably a parking spot for entries temporarily removed from the list
# above; confirm before deleting.
_pretrained = f"""
"""

# Parse the YAML text and store the resulting list in the config as
# params.pretrained.
c.params.pretrained = OmegaConf.create(pretrained)

In [5]:
# Write the effective configuration (including the overrides made above) to
# the log as YAML so the settings are recorded next to the results.
log.info(OmegaConf.to_yaml(c))

2022-03-05 20:08:57,283 [INFO] [3244290467] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: true
  entity: imokuri
  project: ump
  dir: ../../cache/
  group: LB
settings:
  print_freq: 100
  gpus: '6'
  dirs:
    working: ..
    input: ../../inputs/
    input_minimal: ${hydra:runtime.cwd}/../datasets/inputs/
    feature: ${settings.dirs.input}features/
    preprocess: ${settings.dirs.input}preprocess/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: false
  n_debug_data: 100000
  amp: true
  multi_gpu: true
  training_method: nn
params:
  seed: 440
  n_class: 10
  preprocess: []
  pca_n_components: 50
  n_fold: 5
  skip_training: false
  epoch: 10
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: combinational_purged
  group_name: investment_id
  ti

In [6]:
import os

import pandas as pd
import src.utils as utils
from src.get_score import record_result
from tqdm.notebook import tqdm

In [7]:
# Hand the configuration to the project's Weights & Biases setup helper and
# keep whatever it returns in `run`.
# NOTE(review): presumably a wandb run object, or nothing when
# c.wandb.enabled is false. Confirm in src/utils.
run = utils.setup_wandb(c)

2022-03-05 20:08:58,931 [ERROR] [jupyter] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.


[34m[1mwandb[0m: Currently logged in as: [33mimokuri[0m (use `wandb login --relogin` to force relogin)
[34m[1mwandb[0m: wandb version 0.12.11 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade


2022-03-05 20:09:02,690 [INFO] [utils] WandB initialized. name: lyric-shape-360, id: 1ft3vx1j


In [8]:
# Load the training table, keep only row_id, time_id and target, and index
# it by row_id so OOF predictions can be aligned on that key.
train = (
    pd.read_feather("../../inputs/train.f")
    .loc[:, ["row_id", "time_id", "target"]]
    .set_index("row_id")
)

In [9]:
preds_col = []

# Load the out-of-fold (OOF) predictions of every pretrained run and attach
# them to `train` as one column per run (preds0, preds1, ...). The assignment
# aligns on the row_id index, so rows a run did not predict become NaN.
for n, training in tqdm(enumerate(c.params.pretrained), total=len(c.params.pretrained)):
    col = f"preds{n}"
    preds_col.append(col)

    # oof_df.f is read from the parent of the configured fold directory.
    # Strip any trailing slash first so ".../foldN" and ".../foldN/" both
    # resolve to the same parent directory.
    run_dir = os.path.dirname(training.dir.rstrip("/"))
    oof_df = pd.read_feather(os.path.join(run_dir, "oof_df.f")).set_index("row_id")

    if training.model == "lightgbm":
        # NOTE(review): lightgbm OOF rows are summed per row_id, presumably
        # because the file holds several partial rows per row_id. Confirm.
        train[col] = oof_df["preds"].groupby("row_id").sum()
    else:
        train[col] = oof_df["preds"]

# Number of OOF predictions available for each row. Only the prediction
# columns are inspected, so a missing value in any other column (e.g.
# target) cannot distort the count.
train["count_oof"] = train[preds_col].notna().sum(axis=1)

# Sum the OOF predictions of all runs; a missing prediction contributes 0.
# Only the prediction columns are zero-filled, other columns are left as-is.
train["preds"] = 0.0
for col in preds_col:
    train[col] = train[col].fillna(0)
    train["preds"] += train[col]

# Drop rows that received no OOF prediction at all. Filtering on the count
# rather than on `preds != 0` keeps rows whose prediction is genuinely 0.0.
# `.copy()` makes the result an independent frame, so the assignment below
# does not write into a slice of the original.
train = train[train["count_oof"] > 0].copy()

# Where several OOF predictions were summed, average them.
train["preds"] = train["preds"] / train["count_oof"]

  0%|          | 0/6 [00:00<?, ?it/s]

In [10]:
# Inspect the merged frame: the per-run prediction columns, the per-row OOF
# count and the averaged `preds` column.
train

Unnamed: 0_level_0,time_id,target,preds0,preds1,preds2,preds3,preds4,preds5,count_oof,preds
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0_1,0,-0.300875,0.0,0.0,0.0,0.0,0.000000,0.000833,1,0.000833
0_2,0,-0.231040,0.0,0.0,0.0,0.0,0.000000,-0.034454,1,-0.034454
0_6,0,0.568807,0.0,0.0,0.0,0.0,0.000000,0.031128,1,0.031128
0_7,0,-1.064780,0.0,0.0,0.0,0.0,0.000000,-0.079834,1,-0.079834
0_8,0,-0.531940,0.0,0.0,0.0,0.0,0.000000,0.025497,1,0.025497
...,...,...,...,...,...,...,...,...,...,...
1219_3768,1219,0.033600,0.0,0.0,0.0,0.0,-0.143311,0.000000,1,-0.143311
1219_3769,1219,-0.223264,0.0,0.0,0.0,0.0,-0.153076,0.000000,1,-0.153076
1219_3770,1219,-0.559415,0.0,0.0,0.0,0.0,0.076843,0.000000,1,0.076843
1219_3772,1219,0.009599,0.0,0.0,0.0,0.0,0.099121,0.000000,1,0.099121


In [11]:
# Distribution of how many OOF predictions each remaining row received.
train["count_oof"].value_counts()

1    3141291
Name: count_oof, dtype: int64

In [12]:
# Score the averaged OOF predictions with the project's scoring helper. The
# call logs the score, and its return value is shown as the cell output.
# NOTE(review): c.params.n_fold (5 in the config dump) is passed here although
# c.params.pretrained lists six fold directories (fold0-fold5). Confirm this
# is intended.
record_result(c, train, c.params.n_fold)

2022-03-05 20:09:12,249 [INFO] [get_score] Score: 0.11832


0.11831549679775508