# Cross Validation

In [1]:
import sys

# Make the parent directory and the shared inputs directory importable,
# appended in that order to the end of the module search path.
sys.path.extend(["..", "../../inputs"])

In [2]:
import logging

# Root logger: INFO and above, each record stamped with time, level and the
# module that emitted it. No log file is configured here.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
    level=logging.INFO,
)

# Logger used by the cells of this notebook.
log = logging.getLogger(__name__)

In [3]:
from omegaconf import OmegaConf

# Load the project configuration, then override a handful of values for this notebook run.
c = OmegaConf.load("../config/main.yaml")

c.settings.debug = False
c.wandb.enabled = True
c.wandb.group = "LB"
c.wandb.dir = "../../cache/"
c.settings.dirs.working = ".."
c.settings.dirs.input = "../../inputs/"

# Root directory under which the earlier training runs are stored.
pretraind_dir = "../../datasets/trainings"

# YAML list of the runs to evaluate: one fold directory per run, each paired with its model name.
pretrained = f"""
- dir: {pretraind_dir}/2022-01-30_22-12-14/fold0/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-16/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-19/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-21/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-23/fold4/
  model: ump_1
"""

# Alternative list with seven entries (runs dated 2022-01-26).
# It is not referenced by any of the cells visible in this notebook.
_pretrained = f"""
- dir: {pretraind_dir}/2022-01-26_07-47-01/fold0/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-03/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-05/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-07/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-10/fold4/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-12/fold5/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-14/fold6/
  model: ump_1
"""

# Parse the YAML text into a config node and store it as params.pretrained.
c.params.pretrained = OmegaConf.create(pretrained)

# Log the full configuration as YAML so the run settings are captured in the notebook output.
log.info(OmegaConf.to_yaml(c))

2022-02-01 12:02:58,190 [INFO] [3904336845] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: true
  entity: imokuri
  project: ump
  dir: ../../cache/
  group: LB
settings:
  print_freq: 100
  gpus: 6,7
  dirs:
    working: ..
    input: ../../inputs/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: false
  n_debug_data: 100000
  amp: true
  multi_gpu: true
params:
  seed: 440
  n_class: 1
  n_fold: 5
  skip_training: false
  epoch: 20
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: simple_cpcv
  group_name: investment_id
  time_name: time_id
  label_name: target
  feature_set:
  - f000
  dataset: ump_1
  model: ump_1
  pretrained:
  - dir: ../../datasets/trainings/2022-01-30_22-12-14/fold0/
    model: ump_1
  - dir: ../../datasets/trainings/2022-01

In [4]:
import os

import pandas as pd
import src.utils as utils
from src.get_score import record_result

In [5]:
# Initialise Weights & Biases through the project helper, driven by the config `c`.
# `run` keeps whatever the helper returns -- presumably the W&B run handle; confirm in src/utils.
run = utils.setup_wandb(c)

2022-02-01 12:02:59,808 [ERROR] [jupyter] Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mimokuri[0m (use `wandb login --relogin` to force relogin)


2022-02-01 12:03:03,304 [INFO] [utils] WandB initialized. name: bumbling-plant-52, id: 18tu4fi2


In [6]:
# Load the training table and keep only the target, keyed by row_id.
train = (
    pd.read_feather("../../inputs/train.f")
    .loc[:, ["row_id", "target"]]
    .set_index("row_id")
)

In [7]:
# Names of the per-run OOF prediction columns, in the order of c.params.pretrained.
preds_col = []

# Read each run's OOF predictions and align them to `train` by row_id.
for n, training in enumerate(c.params.pretrained):
    preds_col.append(f"preds{n}")
    # oof_df.f is read from the parent of the configured fold directory.
    # Strip a trailing slash first so the parent is found whether or not `dir` ends with "/".
    oof_path = os.path.join(os.path.dirname(training.dir.rstrip("/")), "oof_df.f")
    oof_df = pd.read_feather(oof_path).set_index("row_id")
    train[f"preds{n}"] = oof_df["preds"]

# Rows that have no OOF prediction in a given run get 0 for that run.
train = train.fillna(0)

# Sum the OOF predictions of all runs.
train["preds"] = 0
for col in preds_col:
    train["preds"] += train[col]

# Drop rows without any prediction (their sum is still exactly 0).
# Copy so the assignment below writes to an independent frame rather than a slice.
train = train[train["preds"] != 0.0].copy()

# Turn the sum into an average: divide by the number of OOF frames actually merged
# instead of a hard-coded 5, so the result stays correct when the run list changes.
# NOTE(review): a row present in only some of the OOF frames is still divided by the
# full count (missing entries count as 0) -- confirm that this is the intended weighting.
train["preds"] = train["preds"] / len(preds_col)

In [8]:
# Display the merged frame: target, one column per OOF run, and the averaged `preds`.
train

Unnamed: 0_level_0,target,preds0,preds1,preds2,preds3,preds4,preds
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1059_947,-0.019546,-0.021912,-0.030762,0.000328,-0.027710,-0.027786,-0.021568
1059_950,0.369678,-0.147705,-0.208374,-0.160767,-0.183594,-0.177368,-0.175562
1059_951,-0.398294,-0.012146,0.001702,0.021698,0.004936,-0.003584,0.002521
1059_952,0.018593,0.060272,0.065857,0.052643,0.034332,0.038177,0.050256
1059_953,-0.148126,0.036346,0.040161,0.034882,0.021179,0.022446,0.031003
...,...,...,...,...,...,...,...
1219_3768,0.033600,-0.023300,-0.041321,-0.033722,-0.051727,-0.034943,-0.037003
1219_3769,-0.223264,-0.023209,-0.055420,-0.043152,-0.052948,-0.045654,-0.044077
1219_3770,-0.559415,0.024734,0.079834,0.057465,0.085510,0.076843,0.064877
1219_3772,0.009599,-0.006653,-0.006962,-0.010132,-0.002348,0.005989,-0.004021


In [9]:
# Score the OOF predictions and record the result through the project helper.
# NOTE(review): `oof_df` is the variable left over from the loading loop, i.e. the OOF frame of
# the LAST entry in c.params.pretrained -- not the merged/averaged `train` frame built above.
# Confirm whether the averaged predictions were meant to be scored here instead.
record_result(c, oof_df, c.params.n_fold)

2022-02-01 12:03:11,242 [INFO] [get_score] Score: 0.14896


0.14895712630902377