# Inference notebook

In [1]:
import sys


def in_kaggle():
    """Tell whether the ``kaggle_web_client`` module is currently loaded.

    The notebook uses this as its switch between Kaggle-specific paths and
    local paths.

    Returns:
        bool: True if ``"kaggle_web_client"`` is a key of ``sys.modules``.
    """
    loaded_modules = sys.modules
    return "kaggle_web_client" in loaded_modules

In [2]:
import logging


def init_logger():
    """Return this module's logger, configured to write INFO+ records to stdout.

    The logger is looked up by ``__name__``, so every call returns the same
    object. The stdout handler is attached only when the logger has no handler
    yet: without that guard, re-running the cell would stack one more handler
    per call and every record would be printed several times.

    Returns:
        logging.Logger: the configured logger.
    """
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)

    if not logger.handlers:
        handler1 = logging.StreamHandler(stream=sys.stdout)
        handler1.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] [%(module)s] %(message)s"))
        logger.addHandler(handler1)

        # Optional file output, currently disabled:
        # handler2 = logging.FileHandler(filename="train.log")
        # handler2.setFormatter(logging.Formatter("%(asctime)s [%(levelname)s] [%(module)s] %(message)s"))
        # logger.addHandler(handler2)

    return logger


# Notebook-wide logger; later cells report through log.info / log.warning.
log = init_logger()

In [3]:
##### INSERT SOURCE CODE HERE FOR SUBMISSION #####

# Extend the import search path. On Kaggle the nptyping and typish directories
# under ../input are added; otherwise the parent directory and ../../inputs are.
sys.path.extend(
    ["../input/nptyping", "../input/typish"] if in_kaggle() else ["..", "../../inputs"]
)

In [4]:
if in_kaggle():
    # Offline Hydra install on Kaggle, following:
    # https://www.kaggle.com/speeddemon/install-hydra-offline-from-dataset

    # Copy the hydracore105 input directory into /kaggle/working so that the
    # archive below can be renamed (presumably /kaggle/input is read-only — confirm).
    !cp -r /kaggle/input/hydracore105 /kaggle/working
    # The antlr4 runtime archive carries a trailing ".tmp"; strip it so the file ends in ".tar.gz".
    !mv /kaggle/working/hydracore105/antlr4-python3-runtime-4.8.tar.gz.tmp /kaggle/working/hydracore105/antlr4-python3-runtime-4.8.tar.gz
    # Show the directory contents in the cell output.
    !ls /kaggle/working/hydracore105

    # Install every file of the copied directory with pip (-qq keeps the output quiet).
    !pip install -qq /kaggle/working/hydracore105/* --ignore-installed PyYAML
    
    # Make the omegaconf sources under ../input importable for the next cell.
    sys.path.append("../input/omegaconf/omegaconf-master")
    # Location of the main config file when running on Kaggle.
    omega_conf_path = "config/main.yaml"
    
else:
    # Outside Kaggle nothing is installed here; only the config location differs.
    omega_conf_path = "../config/main.yaml"

In [5]:
from omegaconf import OmegaConf

# Read the project config, then apply the overrides needed for inference.
c = OmegaConf.load(omega_conf_path)

# Inference run: wandb logging and debug mode are both switched off.
c.wandb.enabled = False
c.settings.debug = False

if not in_kaggle():
    # Local layout. `pretraind_dir` (spelling kept) is the root that the
    # pretrained-model YAML in the next cell is built from.
    pretraind_dir = "../../datasets/trainings"

    c.settings.dirs.input = "../../inputs/"
    c.settings.dirs.working = ".."

else:
    # Kaggle layout: everything is read from ../input and a single GPU id is used.
    pretraind_dir = "../input/ump-models"

    c.settings.gpus = "0"

    c.settings.dirs.feature = "../input/ubiquant-parquet/"
    c.settings.dirs.input = "../input/ubiquant-market-prediction/"
    c.settings.dirs.working = "."

In [6]:
# YAML list of the `ump_1` checkpoints to use; currently only the fold0 run.
# Each entry gives a training output directory under `pretraind_dir` and a model name.
pretrained = f"""
- dir: {pretraind_dir}/2022-02-04_23-43-25/fold0/
  model: ump_1
"""

# YAML list of the LightGBM checkpoints to use (fold0 run).
pretrained_lgb = f"""
- dir: {pretraind_dir}/2022-02-08_16-16-05/fold0/
  model: lightgbm
"""

# Entries for fold1 through fold14 of `ump_1`. This string is NOT attached to the
# config below; it is only kept here so entries can be moved into `pretrained`.
_pretrained = f"""
- dir: {pretraind_dir}/2022-02-04_23-43-27/fold1/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-29/fold2/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-31/fold3/
  model: ump_1
- dir: {pretraind_dir}/2022-02-04_23-43-33/fold4/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-00/fold5/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-02/fold6/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-04/fold7/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-06/fold8/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_08-05-08/fold9/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-55/fold10/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-57/fold11/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-09-59/fold12/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-10-01/fold13/
  model: ump_1
- dir: {pretraind_dir}/2022-02-05_20-10-04/fold14/
  model: ump_1
"""

# Parse the two active YAML lists and store them on the config under params.
c.params.pretrained = OmegaConf.create(pretrained)
c.params.pretrained_lgb = OmegaConf.create(pretrained_lgb)

In [7]:
# Write the whole config, including the overrides made above, to the log as YAML.
log.info(OmegaConf.to_yaml(c))

2022-02-08 20:40:20,149 [INFO] [3244290467] defaults:
- _self_
hydra:
  run:
    dir: ../outputs/${now:%Y-%m-%d_%H-%M-%S}
  job_logging:
    formatters:
      simple:
        format: '%(asctime)s [%(levelname)s][%(module)s] %(message)s'
wandb:
  enabled: false
  entity: imokuri
  project: ump
  dir: ${hydra:runtime.cwd}/../cache
  group: default
settings:
  print_freq: 100
  gpus: 6,7
  dirs:
    working: ..
    input: ../../inputs/
    feature: ${settings.dirs.input}features/
    preprocess: ${settings.dirs.input}preprocess/
  inputs:
  - train.csv
  - example_test.csv
  - example_sample_submission.csv
  debug: false
  n_debug_data: 100000
  amp: true
  multi_gpu: true
  training_method: nn
params:
  seed: 440
  n_class: 1
  preprocess: false
  n_fold: 5
  skip_training: false
  epoch: 20
  es_patience: 0
  batch_size: 640
  gradient_acc_step: 1
  max_grad_norm: 1000
  fold: simple_cpcv
  group_name: investment_id
  time_name: time_id
  label_name: target
  use_feature: true
  feature

In [8]:
import gc
import traceback

import numpy as np
import src.utils as utils
import ubiquant
from src.feature_store import Store
from src.features.base import get_feature
from src.make_feature import make_feature
from src.make_model import load_model
from src.run_loop import inference, inference_lightgbm

In [9]:
# Runtime setup driven by the loaded config; the call order is kept as written.
# NOTE(review): presumably seeds the random number generators with c.params.seed — confirm in src.utils.
utils.fix_seed(c.params.seed)
# NOTE(review): presumably adjusts the config when c.settings.debug is on — confirm in src.utils.
utils.debug_settings(c)
# `device` is passed to load_model() and inference() in the cells below.
device = utils.gpu_settings(c)

In [10]:
# Feature identifiers to build at inference time.
feature_set = ["f000"]

# Normalise the selection: remove duplicates and sort the identifiers.
feature_set = sorted(set(feature_set))
log.info(f"feature set: {feature_set}")

# Resolve each identifier through get_feature, keeping the sorted order.
feature_func = list(map(get_feature, feature_set))

2022-02-08 20:40:21,825 [INFO] [2937476863] feature set: ['f000']


In [11]:
# Start from an empty feature store; the prediction loop appends the incoming
# test rows to it when c.params.use_feature is set.
store = Store.empty()
# Disabled alternative — building the store with Store.train(c) failed with:
#   "Your notebook tried to allocate more memory than is available."
# store = Store.train(c)

In [12]:
# Load the models used by the prediction loop.
# The first call passes no explicit checkpoint list
# (presumably load_model then uses c.params.pretrained — confirm in src.make_model).
models = load_model(c, device)
# The second call loads the LightGBM entries stored in c.params.pretrained_lgb.
models_lgb = load_model(c, device, c.params.pretrained_lgb)

In [13]:
# NOTE(review): Kaggle time-series APIs usually allow make_env() only once per
# kernel session — confirm before re-running this cell without a restart.
env = ubiquant.make_env()  # initialize the competition environment
iter_test = env.iter_test()  # iterator yielding (test_df, sample_prediction_df) pairs

In [14]:
# Expected feature column names, in order: "f_0" through "f_299".
feature_cols = [f"f_{idx}" for idx in range(0, 300)]

In [15]:
# Prediction loop: for every (test_df, sample_prediction_df) pair yielded by
# iter_test, fill sample_prediction_df["target"] and register it via env.predict().
#
# The submitted target is the NaN-ignoring mean of the neural-network and
# LightGBM predictions made on the raw test_df. When c.params.use_feature is
# set, the same batch is also pushed through the feature store / make_feature
# path and predicted again, as a consistency check of that path.
for test_df, sample_prediction_df in iter_test:
    gc.collect()

    # log.info(test_df["investment_id"].dtype)  # int16
    # log.info(test_df[feature_cols].values.dtype)  # float64

    try:
        assert test_df["investment_id"].is_unique, "investment_id is not unique."

        if c.params.use_feature:
            # Feed every raw row of this batch into the store before building features from it.
            for row in test_df.values:
                # investment_id_ = int(row[1])
                # features_ = row[2:302].astype(np.float32)

                # log.info(f"investment_id: {investment_id_}({type(investment_id_)}), features: {len(features_)}({features_.dtype})")
                store.append(row)

                # log.info(f"store: {store.investments[investment_id_].features.last_n(1).squeeze().shape}, input: {features_.shape}")
                # assert np.array_equal(
                #     store.investments[investment_id_].features.last_n(1).squeeze(), features_
                # ), "Features are different before and after storing in the store"

            pred_df = make_feature(
                test_df,
                store,
                feature_set,
                load_from_store=False,
                save_to_store=False,
                debug=c.settings.debug,
            )

            assert len(test_df) == len(pred_df), "test_df and pred_df do not same size."
            # The message describes the failure: it is shown when the columns differ.
            assert list(pred_df.columns) == feature_cols, "pred_df columns do not match feature_cols."

            # assert (
            #     test_df[feature_cols].astype("float32").equals(pred_df)
            # ), "Default features do not match between test_df and pred_df."

            # NOTE(review): these store-based predictions are computed but are not
            # part of the submitted target; they only serve the disabled equality
            # checks below. Drop both calls if the extra inference time matters.
            preds = inference(c, pred_df, device, models)
            preds_lgb = inference_lightgbm(pred_df, models_lgb)

        # Predict on the raw test_df unconditionally. These two assignments used to
        # sit inside the `if` above (their `else:` was commented out), which left
        # preds_ / preds_lgb_ undefined whenever c.params.use_feature was false.
        preds_ = inference(c, test_df, device, models)
        preds_lgb_ = inference_lightgbm(test_df, models_lgb)

        # assert np.array_equal(preds_, preds), "Predictions do not match between test_df and pred_df."
        # assert np.array_equal(preds_lgb_, preds_lgb), "LightGBM predictions do not match between test_df and pred_df."

        # Put the two prediction sets side by side and average per row, ignoring NaNs
        # (assumes both are 2-D arrays with one row per test row — TODO confirm).
        predictions = np.hstack([preds_, preds_lgb_])
        sample_prediction_df["target"] = np.nanmean(predictions, axis=1)

        # DEBUG
        # sample_prediction_df.fillna({"target": 0}, inplace=True)

    except Exception:
        # Record the traceback through the notebook logger, then let the error propagate.
        log.warning(traceback.format_exc())
        raise

    env.predict(sample_prediction_df)  # register your predictions

This version of the API is not optimized and should not be used to estimate the runtime of your code on the hidden test set.
