# Imports

In [1]:
from catboost import CatBoostRegressor, Pool
import joblib
import json
import mlflow
import numpy as np
import optuna
import pickle
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import TimeSeriesSplit
import sys

  "class": algorithms.Blowfish,
  from .autonotebook import tqdm as notebook_tqdm


In [2]:
sys.path.append('../src/')
import h5_utils
from train import drop_target_nan

# Configs

In [3]:
# Load the project settings (JSON). `config` supplies the directory names
# (RAW_DATA_DIR, MODEL_DIR) from which the paths below are built.
settings_path = '../configs/settings.json'
with open(settings_path, 'r') as f:
    config = json.load(f)

In [4]:
# Folder handed to h5_utils.stacked_daily_data as the location of the daily data
folder_daily_h5 = f"../{config['RAW_DATA_DIR']}/daily/"

# Metadata file read by h5_utils.load_metadata to obtain `date_ids`
metadata_filename = 'metadata.h5'
metadata_filepath = f"../{config['RAW_DATA_DIR']}/{metadata_filename}"

In [5]:
# Names of the pre-selected features; they are passed to
# h5_utils.stacked_daily_data to choose which columns are loaded.
# NOTE(review): pickle.load executes arbitrary code from the file - only load
# a pickle produced by this project, never one from an untrusted source.
with open(f"../{config['MODEL_DIR']}/top_300_feats_nms.pkl", 'rb') as f:
    top_feature_names = pickle.load(f)

In [6]:
# Destination of the joblib dump of the Optuna study (see "Store study" below).
# NOTE(review): the variable is spelled `..._filenpath` (sic); the same spelling
# is used further down, so rename both places together if it is ever cleaned up.
optuna_selected_feat_filename = 'optuna_selected_300.pkl'  #'optuna_selected_300_4_folds.pkl'
optuna_selected_feat_filenpath = f"../{config['RAW_DATA_DIR']}/{optuna_selected_feat_filename}"

In [7]:
# Train test split
DAY_TEST_START = 400 # train size in day terms

# MlFlow
experiment_name = 'optiver_300_features'

# features precision
REDUCE_TO_FLOAT32 = True # guaranties < 9.3Gb train pool of 300 features 

# Daily load trick

In [8]:
# Identifiers of the available days, read from the metadata file; data is later
# loaded day by day from these ids instead of from a single large file.
date_ids = h5_utils.load_metadata(metadata_filepath)
# Number of available days
len(date_ids)

481

# Final Optuna search for CatBoost on the selected 300 features

## Trial

In [9]:
# CatBoost settings that are not tuned: they are merged with the sampled
# hyperparameters for every trial and reused for the final model.
fixed_params = dict(
    loss_function="MAE",
    eval_metric="MAE",
    thread_count=-1,
    random_seed=42,
    iterations=1000,
    od_type="Iter",  # Activate CatBoost overfitting detector
    od_wait=20,
    # used_ram_limit="14gb",
)

In [10]:
def params_catboost(trial):
    """Sample the tunable CatBoost hyperparameters from an Optuna trial.

    Always sampled: ``depth`` (int, 5..10), ``l2_leaf_reg`` (int, 3..50) and
    ``bootstrap_type`` (one of "Bayesian", "Bernoulli", "MVS").
    Conditionally sampled, depending on the chosen bootstrap type:
    ``bagging_temperature`` (float, 1..20) for "Bayesian" and
    ``subsample`` (float, 0.5..1) for "Bernoulli".

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial whose ``suggest_*`` methods provide the values.

    Returns
    -------
    dict
        Mapping of parameter name to the sampled value.
    """
    bootstrap_choices = ["Bayesian", "Bernoulli", "MVS"]

    sampled = {}
    sampled["depth"] = trial.suggest_int("depth", 5, 10)
    sampled["l2_leaf_reg"] = trial.suggest_int("l2_leaf_reg", 3, 50)
    sampled["bootstrap_type"] = trial.suggest_categorical("bootstrap_type",
                                                          bootstrap_choices)

    # Only the option that belongs to the selected bootstrap type is sampled
    chosen_bootstrap = sampled["bootstrap_type"]
    if chosen_bootstrap == "Bayesian":
        sampled["bagging_temperature"] = trial.suggest_float(
            "bagging_temperature", 1, 20)
    if chosen_bootstrap == "Bernoulli":
        sampled["subsample"] = trial.suggest_float("subsample", 0.5, 1)

    return sampled

In [19]:
def mae_of_fit_model(train_pair, test_pair,
              param):
    """Fit a CatBoostRegressor and return its MAE on the evaluation data.

    Parameters
    ----------
    train_pair : tuple
        ``(features, labels)`` used for fitting.
    test_pair : tuple
        ``(features, labels)`` used both as CatBoost's ``eval_set`` during
        fitting and for the returned score.
    param : dict
        Keyword arguments forwarded to ``CatBoostRegressor``.

    Returns
    -------
    float
        Mean absolute error of the fitted model on ``test_pair``.
    """
    fit_features, fit_labels = train_pair
    fit_pool = Pool(fit_features, label=fit_labels)

    regressor = CatBoostRegressor(**param)

    eval_features, eval_labels = test_pair
    eval_pool = Pool(eval_features, label=eval_labels)

    # verbose=1 logs every boosting iteration
    regressor.fit(X=fit_pool, eval_set=eval_pool, verbose=1)

    # Score the fitted model on the evaluation features
    return mean_absolute_error(eval_labels, regressor.predict(eval_features))

In [12]:
def objective_hard(trial, n_splits=4):
    """Optuna objective: mean MAE over expanding-window time-series folds.

    ``date_ids`` is split with ``TimeSeriesSplit``; for every fold the model is
    fitted on the training days and scored on the following validation days.
    Each fold is logged to MLflow as its own run.

    The previous version called ``fit_model``, which is neither defined nor
    imported in this notebook (NameError on a fresh kernel). Training now goes
    through ``load_and_process`` and ``mae_of_fit_model``, with ``fixed_params``
    merged in exactly as in ``objective``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable hyperparameters.
    n_splits : int, default 4
        Number of ``TimeSeriesSplit`` folds.

    Returns
    -------
    float
        Mean of the per-fold validation MAE values.
    """
    tscv = TimeSeriesSplit(n_splits=n_splits)

    # Sampled once per trial: Optuna returns the same value when the same
    # parameter name is suggested again within one trial, so re-sampling in
    # every fold was redundant.
    param = params_catboost(trial)
    model_params = {**param, **fixed_params}

    fold_scores = []
    for fold, (train_index, test_index) in enumerate(tscv.split(date_ids)):
        # NOTE: index-array selection assumes `date_ids` is a numpy array
        train_ids = date_ids[train_index]
        test_ids = date_ids[test_index]

        train_pair = load_and_process(train_ids)
        valid_pair = load_and_process(test_ids)

        mae = mae_of_fit_model(train_pair, valid_pair, model_params)
        fold_scores.append(mae)

        # MLflow part: one run per fold
        with mlflow.start_run():
            mlflow.log_param("fold", fold)
            mlflow.log_params(param)
            mlflow.log_params(fixed_params)
            mlflow.log_metric("mae", mae)

    return np.mean(fold_scores)

In [12]:
def load_and_process(data_ids):
    """Load the selected features and labels for the given days and clean them.

    The data is read with ``h5_utils.stacked_daily_data`` (restricted to
    ``top_feature_names``, precision controlled by ``REDUCE_TO_FLOAT32``) and
    then passed through ``drop_target_nan`` (presumably removes the rows whose
    target is NaN - see ``train.drop_target_nan``).

    Parameters
    ----------
    data_ids
        Identifiers of the days to load.

    Returns
    -------
    tuple
        ``(features, labels)`` after ``drop_target_nan``.
    """
    raw_features, raw_labels = h5_utils.stacked_daily_data(
        data_ids,
        folder_daily_h5,
        top_feature_names,
        float32=REDUCE_TO_FLOAT32,
    )
    clean_features, clean_labels = drop_target_nan(raw_features, raw_labels)
    return clean_features, clean_labels

In [20]:
def objective(trial):
    """Optuna objective: hold-out MAE of CatBoost on the selected features.

    A single time-ordered split is used: the first ``DAY_TEST_START`` entries
    of ``date_ids`` are the training period, the remaining ones the validation
    period. The sampled and the fixed parameters are merged (fixed ones win on
    a key clash) and the resulting run is logged to MLflow.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the tunable hyperparameters.

    Returns
    -------
    float
        Validation MAE (the quantity the study minimizes).
    """
    # Time-ordered hold-out split
    train_ids = date_ids[:DAY_TEST_START]
    test_ids = date_ids[DAY_TEST_START:]

    param = params_catboost(trial)

    # Data
    ## NaN in target is not scored by the rules (handled in load_and_process)
    train_pair = load_and_process(train_ids)
    test_pair = load_and_process(test_ids)

    # Train and score.
    # Fix: this used to call `fit_model`, a name that is not defined anywhere
    # in the notebook and only worked from stale kernel state; the helper with
    # this exact contract is `mae_of_fit_model`.
    mae = mae_of_fit_model(train_pair, test_pair,
                           {**param, **fixed_params})

    # MLflow part
    with mlflow.start_run():
        mlflow.log_params(param)
        mlflow.log_params(fixed_params)
        mlflow.log_metric("mae", mae)

    return mae

## Mlflow

In [15]:
# MlflowClient is exposed at the package top level only in newer MLflow
# releases. Probe the import instead of comparing version strings: string
# comparison is lexicographic, so e.g. '1.9.0' <= '1.26.1' is False and the
# wrong branch would be taken.
try:
    from mlflow import MlflowClient
except ImportError:
    from mlflow.tracking import MlflowClient

client = MlflowClient()

# Create the experiment on first use, then resolve its id
experiment_info = client.get_experiment_by_name(experiment_name)

if experiment_info is None:
    mlflow.create_experiment(name=experiment_name)
    experiment_info = client.get_experiment_by_name(experiment_name)
experiment_id = experiment_info.experiment_id

# Make it the active experiment; otherwise the bare `mlflow.start_run()` calls
# in the objectives log their runs to MLflow's default experiment.
mlflow.set_experiment(experiment_name)
print(experiment_id)

1


## Search

In [None]:
%%time

hours = 7

study = optuna.create_study(direction='minimize',
                pruner=optuna.pruners.MedianPruner(n_warmup_steps=5))
study.optimize(objective, 
               n_trials=120,
               timeout = 60*60*hours
              )
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-07-20 04:25:32,265] A new study created in memory with name: no-name-9c2680dc-8ffe-4db8-9032-8c514866e4e0


0:	learn: 6.3709798	test: 5.8710595	best: 5.8710595 (0)	total: 3.06s	remaining: 50m 54s
1:	learn: 6.2533215	test: 5.7559373	best: 5.7559373 (1)	total: 5.69s	remaining: 47m 18s
2:	learn: 6.1412319	test: 5.6462427	best: 5.6462427 (2)	total: 7.95s	remaining: 44m
3:	learn: 6.0341890	test: 5.5408784	best: 5.5408784 (3)	total: 10.6s	remaining: 44m 10s
4:	learn: 5.9221244	test: 5.4319727	best: 5.4319727 (4)	total: 13.7s	remaining: 45m 17s
5:	learn: 5.8143054	test: 5.3256023	best: 5.3256023 (5)	total: 17.1s	remaining: 47m 10s
6:	learn: 5.7248343	test: 5.2387495	best: 5.2387495 (6)	total: 19.5s	remaining: 46m 5s
7:	learn: 5.6387233	test: 5.1542387	best: 5.1542387 (7)	total: 22.4s	remaining: 46m 11s
8:	learn: 5.5542577	test: 5.0721873	best: 5.0721873 (8)	total: 24.5s	remaining: 45m 1s
9:	learn: 5.4569948	test: 4.9763336	best: 4.9763336 (9)	total: 27.6s	remaining: 45m 33s
10:	learn: 5.3617498	test: 4.8821803	best: 4.8821803 (10)	total: 30.9s	remaining: 46m 19s
11:	learn: 5.2740472	test: 4.7968831

In [None]:
# Report the best score of the search and the hyperparameters that produced it
print(
    "Best trial: score {}, params {}".format(
        study.best_trial.value,
        study.best_trial.params,
    )
)

In [None]:
# Number of trials recorded in the study
len(study.trials)

## Store study (optional)

In [None]:
# Show where the study is going to be written
optuna_selected_feat_filenpath

In [None]:
# Persist the whole study object so the search results can be reloaded later
joblib.dump(study, optuna_selected_feat_filenpath)

## Train and store the best model on the selected 300 features (initial submission)

In [None]:
# Copy the winning hyperparameters so they survive `del study` below
best_params = study.best_trial.params.copy()
best_params

In [13]:
# Hard-coded hyperparameters, so that the final model can be trained without
# re-running the search cell above.
# NOTE(review): presumably the result of a previous search run - confirm.
best_params = {'depth': 9,
 'l2_leaf_reg': 18,
 'bootstrap_type': 'Bayesian',
 'bagging_temperature': 2.855446277852597}

In [None]:
# The study is no longer needed once best_params is extracted.
# Raises NameError if the search cell was not run in this session.
del study

In [None]:
# Force a garbage-collection pass before the full data set is loaded
import gc 
gc.collect()

In [16]:
# Tuned hyperparameters used for the final model
best_params

{'depth': 9,
 'l2_leaf_reg': 18,
 'bootstrap_type': 'Bayesian',
 'bagging_temperature': 2.855446277852597}

In [17]:
# Fixed (non-tuned) CatBoost settings used for the final model
fixed_params

{'loss_function': 'MAE',
 'eval_metric': 'MAE',
 'thread_count': -1,
 'random_seed': 42,
 'iterations': 1000,
 'od_type': 'Iter',
 'od_wait': 20}

In [None]:
%%time
# Load every available day: data_whole is the (features, labels) pair returned
# by load_and_process
data_whole = load_and_process(date_ids)

In [24]:
# Final model: trained on all days with the tuned + fixed parameters.
# data_whole[0] are the features, data_whole[1] the labels.
train_pool = Pool(data_whole[0], 
                  label=data_whole[1])

best_model = CatBoostRegressor(**best_params, 
                          **fixed_params)

# No eval_set is passed here (all data is used for training).
# NOTE(review): without an eval_set the od_type/od_wait settings in
# fixed_params presumably have no effect, so all iterations are run - confirm.
best_model.fit(X=train_pool, 
          #eval_set=test_pool,
          verbose=1)

0:	learn: 6.2867236	total: 6.79s	remaining: 1h 53m 3s
1:	learn: 6.1682897	total: 10.8s	remaining: 1h 29m 30s
2:	learn: 6.0529457	total: 14.8s	remaining: 1h 21m 45s
3:	learn: 5.9404710	total: 18.6s	remaining: 1h 17m 23s
4:	learn: 5.8304982	total: 22.6s	remaining: 1h 14m 54s
5:	learn: 5.7227778	total: 26.6s	remaining: 1h 13m 19s
6:	learn: 5.6178822	total: 30.5s	remaining: 1h 12m 1s
7:	learn: 5.5153097	total: 34.4s	remaining: 1h 11m 3s
8:	learn: 5.4154015	total: 38.2s	remaining: 1h 10m 11s
9:	learn: 5.3175526	total: 42.2s	remaining: 1h 9m 40s
10:	learn: 5.2221840	total: 46.1s	remaining: 1h 9m 5s
11:	learn: 5.1281178	total: 50s	remaining: 1h 8m 39s
12:	learn: 5.0363666	total: 54s	remaining: 1h 8m 18s
13:	learn: 4.9464258	total: 57.9s	remaining: 1h 7m 57s
14:	learn: 4.8591066	total: 1m 1s	remaining: 1h 7m 36s
15:	learn: 4.7738411	total: 1m 5s	remaining: 1h 7m 17s
16:	learn: 4.6900268	total: 1m 9s	remaining: 1h 7m 5s
17:	learn: 4.6086423	total: 1m 13s	remaining: 1h 6m 56s
18:	learn: 4.528942

<catboost.core.CatBoostRegressor at 0xac4d7e0>

In [29]:
%%time
# First and last day id covered by the training data; both go into the file name
start_day = date_ids[0]
end_day = date_ids[-1]

# NOTE(review): the file name contains "daya" (sic) and has no extension; any
# code that loads the model must use exactly the same name.
#path = f"../{config['MODEL_DIR']}/catboost_best_daya_{start_day}_{end_day}_4_folds"
path = f"../{config['MODEL_DIR']}/catboost_best_daya_{start_day}_{end_day}"

# Save in CatBoost's binary "cbm" format
best_model.save_model(path,
           format="cbm")

CPU times: user 3.69 ms, sys: 8 ms, total: 11.7 ms
Wall time: 45.2 ms


In [None]:
# 10:30