# Cross Validation

In [1]:
import sys

# Make the project root and the shared inputs directory importable from this notebook.
sys.path.extend(["..", "../../inputs"])

In [2]:
import logging

# Root logger: INFO and above, each record prefixed with timestamp, level and module.
logging.basicConfig(
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
    level=logging.INFO,
)

# Logger used by the following cells.
log = logging.getLogger(__name__)

In [3]:
from omegaconf import OmegaConf

# Load the project's base configuration; the assignments below override it for this notebook.
c = OmegaConf.load("../config/main.yaml")

# Notebook-specific overrides: debug flag off, W&B disabled, and relative working/input paths.
c.settings.debug = False
c.wandb.enabled = False
c.wandb.group = "LB"
c.wandb.dir = "../../cache/"
c.settings.dirs.working = ".."
c.settings.dirs.input = "../../inputs/"

# Root directory that holds the training run outputs referenced below.
pretraind_dir = "../../datasets/trainings"

# YAML list of the fold directories to evaluate (seven folds, one run directory per fold).
# Each entry's parent run directory is where the OOF file is later read from.
pretrained = f"""
- dir: {pretraind_dir}/2022-01-26_07-47-01/fold0/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-03/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-05/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-07/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-10/fold4/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-12/fold5/
  model: ump_1
- dir: {pretraind_dir}/2022-01-26_07-47-14/fold6/
  model: ump_1
  

"""

# Alternative five-fold set; it is not passed to OmegaConf.create below, so it has no effect
# unless it is swapped in for `pretrained`.
_pretrained = f"""
- dir: {pretraind_dir}/2022-01-30_22-12-14/fold0/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-16/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-19/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-21/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-01-30_22-12-23/fold4/
  model: ump_1
"""

# Parse the YAML list and attach it to the config as params.pretrained.
c.params.pretrained = OmegaConf.create(pretrained)

# Log the full effective configuration so the run settings are recorded with the output.
log.info(OmegaConf.to_yaml(c))

2022-02-01 12:46:55,202 [INFO] [2790189602] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: false
  entity: imokuri
  project: ump
  dir: ../../cache/
  group: LB
settings:
  print_freq: 100
  gpus: 6,7
  dirs:
    working: ..
    input: ../../inputs/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: false
  n_debug_data: 100000
  amp: true
  multi_gpu: true
params:
  seed: 440
  n_class: 1
  n_fold: 5
  skip_training: false
  epoch: 20
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: simple_cpcv
  group_name: investment_id
  time_name: time_id
  label_name: target
  feature_set:
  - f000
  dataset: ump_1
  model: ump_1
  pretrained:
  - dir: ../../datasets/trainings/2022-01-26_07-47-01/fold0/
    model: ump_1
  - dir: ../../datasets/trainings/2022-0

In [4]:
import os

import pandas as pd
import src.utils as utils
from src.get_score import record_result

In [5]:
# Set up Weights & Biases from the config via the project helper.
# NOTE(review): c.wandb.enabled was set to False earlier, so this is presumably a no-op —
# confirm against src.utils.setup_wandb.
run = utils.setup_wandb(c)

In [6]:
# Load only the row identifier and the target from the training data, indexed by row_id.
# Passing `columns=` avoids reading the remaining columns into memory just to drop them,
# and chaining set_index (instead of inplace=True) keeps this cell safe to re-run.
train = pd.read_feather("../../inputs/train.f", columns=["row_id", "target"]).set_index("row_id")

In [7]:
# Merge the out-of-fold (OOF) predictions of every pretrained run into `train`.
#
# Result columns: target, preds0..predsN (one per run, 0 where a run has no prediction
# for the row), count_oof (number of runs that predicted the row) and preds (their mean).
# Rows without any OOF prediction are dropped.

# One prediction column per configured run.
preds_col = [f"preds{n}" for n in range(len(c.params.pretrained))]

# Start from the target only, so re-running this cell does not build on its own output.
train = train.loc[:, ["target"]].copy()

# Read each run's OOF predictions; assignment aligns on the row_id index.
for col, training in zip(preds_col, c.params.pretrained):
    # `dir` points at a fold directory; the OOF file lives in its parent run directory.
    # Stripping the trailing slash first makes this work with or without it.
    run_dir = os.path.dirname(training.dir.rstrip("/"))
    oof_df = pd.read_feather(os.path.join(run_dir, "oof_df.f")).set_index("row_id")
    train[col] = oof_df["preds"]

# Count how many runs produced a prediction for each row. Only the prediction columns
# are inspected, so a missing target cannot distort the count.
train["count_oof"] = train[preds_col].notna().sum(axis=1)

# Fill missing predictions with 0 (the target column is intentionally left untouched).
train[preds_col] = train[preds_col].fillna(0)

# Drop rows without any OOF prediction. Filtering on the count, rather than on
# `preds != 0`, keeps rows whose genuine predictions are exactly zero or cancel out.
train = train[train["count_oof"] > 0].copy()

# Average the predictions over the runs that covered the row; accumulate in float64 so
# low-precision prediction dtypes do not lose accuracy when summed.
train["preds"] = train[preds_col].astype("float64").sum(axis=1) / train["count_oof"]

In [8]:
# Display the merged frame (pandas truncates the rendering of large frames).
train

Unnamed: 0_level_0,target,preds0,preds1,preds2,preds3,preds4,preds5,preds6,count_oof,preds
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0_1,-0.300875,0.0,0.194702,0.000000,0.000000,0.000000,0.00000,0.000000,1,0.194702
0_2,-0.231040,0.0,0.000000,0.000000,0.000000,0.000000,-0.04184,0.000000,1,-0.041840
0_6,0.568807,0.0,0.000000,0.000000,0.000000,0.110657,0.00000,0.000000,1,0.110657
0_7,-1.064780,0.0,0.000000,0.000000,0.000000,-0.005516,0.00000,0.000000,1,-0.005516
0_8,-0.531940,0.0,0.019150,0.000000,0.000000,0.000000,0.00000,0.000000,1,0.019150
...,...,...,...,...,...,...,...,...,...,...
1219_3768,0.033600,0.0,0.000000,0.000000,-0.041534,0.000000,0.00000,0.000000,1,-0.041534
1219_3769,-0.223264,0.0,0.000000,-0.019928,0.000000,0.000000,0.00000,0.000000,1,-0.019928
1219_3770,-0.559415,0.0,0.000000,0.000000,0.000000,0.065308,0.00000,0.000000,1,0.065308
1219_3772,0.009599,0.0,0.000000,0.000000,0.000000,0.000000,0.00000,0.013855,1,0.013855


In [9]:
# Score the merged OOF predictions against the target using the project's record_result helper.
# NOTE(review): c.params.n_fold is 5 in the logged config while seven fold directories are
# loaded above — confirm how record_result uses its third argument.
record_result(c, train, c.params.n_fold)

2022-02-01 12:47:05,213 [INFO] [get_score] Score: 0.13359


0.1335864693046604