# Cross Validation

In [1]:
import sys

sys.path.append("..")
sys.path.append("../../inputs")

In [2]:
import logging

logging.basicConfig(
    # filename=__file__.replace('.py', '.log'),
    stream=sys.stdout,
    level=logging.getLevelName("INFO"),
    format="%(asctime)s [%(levelname)s] [%(module)s] %(message)s",
)

log = logging.getLogger(__name__)

In [3]:
from omegaconf import OmegaConf

c = OmegaConf.load("../config/main.yaml")

c.settings.debug = False
c.wandb.enabled = False
c.wandb.group = "LB"
c.wandb.dir = "../../cache/"
c.settings.dirs.working = ".."
c.settings.dirs.input = "../../inputs/"

pretraind_dir = "../../datasets/trainings"

In [4]:
pretrained = f"""
- dir: {pretraind_dir}/2022-02-08_16-16-05/fold0/
  model: lightgbm

  
"""

_pretrained = f"""
- dir: {pretraind_dir}/2022-02-04_23-43-25/fold0/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-27/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-29/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-31/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-33/fold4/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-00/fold5/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-02/fold6/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-04/fold7/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-06/fold8/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-08/fold9/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-55/fold10/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-57/fold11/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-59/fold12/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-10-01/fold13/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-10-04/fold14/
  model: ump_1
  
"""

c.params.pretrained = OmegaConf.create(pretrained)

In [5]:
log.info(OmegaConf.to_yaml(c))

2022-02-09 12:54:34,769 [INFO] [3244290467] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: false
  entity: imokuri
  project: ump
  dir: ../../cache/
  group: LB
settings:
  print_freq: 100
  gpus: 6,7
  dirs:
    working: ..
    input: ../../inputs/
    feature: ${settings.dirs.input}features/
    preprocess: ${settings.dirs.input}preprocess/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: false
  n_debug_data: 100000
  amp: true
  multi_gpu: true
  training_method: nn
params:
  seed: 440
  n_class: 1
  preprocess: false
  n_fold: 5
  skip_training: false
  epoch: 20
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: simple_cpcv
  group_name: investment_id
  time_name: time_id
  label_name: target
  use_feature: true
  feature_set:
  - f000
  datas

In [6]:
import os

import pandas as pd
import src.utils as utils
from src.get_score import record_result

In [7]:
run = utils.setup_wandb(c)

In [8]:
train = pd.read_feather("../../inputs/train.f")
train = train.loc[:, ["row_id", "time_id", "target"]]
train.set_index("row_id", inplace=True)

In [9]:
preds_col = []

# 各 OOF の結果を読み取り
for n, training in enumerate(c.params.pretrained):
    preds_col.append(f"preds{n}")
    oof_df = pd.read_feather(os.path.join(training.dir.rsplit("/", 2)[0], "oof_df.f")).set_index("row_id")

    train[f"preds{n}"] = oof_df["preds"]

# 各行の OOF の結果の数をカウントする
train["count_oof"] = len(c.params.pretrained) - train.isnull().sum(axis=1)

# OOF の結果がない行を 0 埋め
train.fillna(0, inplace=True)

# OOF の結果をマージ
train["preds"] = 0
for col in preds_col:
    train["preds"] += train[col]

# 推論結果がないものは除外
train = train[train["preds"] != 0.0]

# 複数 OOF を加算している場合は、OOFの数で割る
train["preds"] = train["preds"] / train["count_oof"]

ValueError: cannot reindex from a duplicate axis

In [None]:
train

In [None]:
train["count_oof"].value_counts()

In [None]:
record_result(c, train, c.params.n_fold)