# Imports

In [19]:
from catboost import CatBoostRegressor, Pool
import joblib
import json
import mlflow
import optuna
import pickle
from sklearn.metrics import mean_absolute_error
import sys

In [2]:
# Add the project's ../src/ directory to the import path so the local
# modules below (h5_utils, train) can be imported from this notebook.
sys.path.append('../src/')
import h5_utils
from train import drop_target_nan

# Configs

In [3]:
# Parse the project settings (JSON) into `config`; later cells read the
# RAW_DATA_DIR and MODEL_DIR entries from it.
settings_path = '../configs/settings.json'
with open(settings_path, 'r') as settings_file:
    config = json.load(settings_file)

In [4]:
# Folder passed to h5_utils.stacked_daily_data when loading data for a set of days
# (presumably one HDF5 file per day — confirm against h5_utils).
folder_daily_h5 = f"../{config['RAW_DATA_DIR']}/daily/"

# Metadata file read by h5_utils.load_metadata to obtain the date ids.
metadata_filename = 'metadata.h5'
metadata_filepath = f"../{config['RAW_DATA_DIR']}/{metadata_filename}"

In [5]:
# Load the pickled collection of feature names that every data load below is restricted to.
# NOTE(review): pickle.load can execute arbitrary code; this is presumably an
# artifact produced by this project — only load it from a trusted location.
with open(f"../{config['MODEL_DIR']}/top_300_feats_nms.pkl", 'rb') as f:
    top_feature_names = pickle.load(f)

In [17]:
# Destination of the serialized Optuna study (written with joblib further down).
# NOTE(review): the file is placed under RAW_DATA_DIR — confirm this is the intended location.
optuna_selected_feat_filename = 'optuna_selected_300.pkl'
# Build the path from the filename defined on the line above, so the cell does
# not depend on a variable that only exists as leftover kernel state.
optuna_selected_feat_filenpath = f"../{config['RAW_DATA_DIR']}/{optuna_selected_feat_filename}"

In [6]:
# Train / validation split
DAY_TEST_START = 400 # number of leading entries of date_ids used for training; the rest is held out for validation

# MLflow
experiment_name = 'optiver_300_features'

# Feature precision
REDUCE_TO_FLOAT32 = True # load features as float32; per the author this guarantees a < 9.3 GB train pool for 300 features

# Daily load trick

In [7]:
# Read the date ids from the metadata file; they are sliced into train /
# validation sets and passed to h5_utils.stacked_daily_data to load data.
date_ids = h5_utils.load_metadata(metadata_filepath)
# Display how many date ids are available.
len(date_ids)

481

# Final optuna on Catboost with selected 300 features

## Trial

In [8]:
def params_catboost(trial):
    """Assemble the CatBoost parameter dict for a single Optuna trial.

    Fixed settings (MAE loss/metric, 1000 iterations, all threads, seed 42)
    are combined with the values suggested by ``trial``:

    * ``depth``: integer in [5, 10]
    * ``l2_leaf_reg``: integer in [3, 50]
    * ``bootstrap_type``: one of "Bayesian", "Bernoulli", "MVS"
    * ``bagging_temperature``: float in [1, 20], only for "Bayesian"
    * ``subsample``: float in [0.1, 0.8], only for "Bernoulli"

    Args:
        trial: object exposing ``suggest_int``, ``suggest_float`` and
            ``suggest_categorical`` (an Optuna trial).

    Returns:
        dict of keyword arguments for ``CatBoostRegressor``.
    """
    # Settings shared by every trial. learning_rate is deliberately not
    # searched here: it is left to CatBoost's own choice and tuned later.
    params = dict(
        loss_function="MAE",
        eval_metric="MAE",
        iterations=1000,
        thread_count=-1,
        random_seed=42,
    )

    # Searched hyper-parameters; the suggestion order is kept fixed.
    params["depth"] = trial.suggest_int("depth", 5, 10)
    params["l2_leaf_reg"] = trial.suggest_int("l2_leaf_reg", 3, 50)
    bootstrap = trial.suggest_categorical("bootstrap_type",
                                          ["Bayesian", "Bernoulli", "MVS"])
    params["bootstrap_type"] = bootstrap

    # Each bootstrap type has its own extra knob; "MVS" gets none.
    if bootstrap == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 0.8)
    elif bootstrap == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 1, 20)

    return params

In [21]:
def fit_model(data_ids, param):
    """Fit a CatBoost regressor on the data of the given days.

    Loads the selected features for ``data_ids`` through
    ``h5_utils.stacked_daily_data``, removes the rows whose target is NaN
    via ``drop_target_nan`` and trains silently (``verbose=0``).

    Args:
        data_ids: date ids forwarded to ``h5_utils.stacked_daily_data``.
        param: keyword arguments passed to ``CatBoostRegressor``.

    Returns:
        The fitted ``CatBoostRegressor``.
    """
    features, targets = h5_utils.stacked_daily_data(
        data_ids,
        folder_daily_h5,
        top_feature_names,
        float32=REDUCE_TO_FLOAT32,
    )
    # Rebind the same names so the unfiltered arrays are not kept alive.
    features, targets = drop_target_nan(features, targets)

    regressor = CatBoostRegressor(**param)
    regressor.fit(X=Pool(features, label=targets), verbose=0)
    return regressor

In [9]:
def objective(trial):
    """Optuna objective: validation MAE of a CatBoost model for one trial.

    The first ``DAY_TEST_START`` entries of ``date_ids`` are used for
    training and the remaining ones for validation. The trial's parameters
    and the resulting MAE are also logged as one MLflow run.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.

    Returns:
        Mean absolute error on the validation days (lower is better).
    """
    # Hold-out split by position in date_ids
    train_ids, test_ids = date_ids[:DAY_TEST_START], date_ids[DAY_TEST_START:]

    # Sample hyper-parameters and train
    param = params_catboost(trial)
    model = fit_model(train_ids, param)

    # Load the validation days; rows with a NaN target are not scored by the
    # rules, so they are dropped before predicting.
    valid_data, valid_labels = h5_utils.stacked_daily_data(
        test_ids,
        folder_daily_h5,
        top_feature_names,
        float32=REDUCE_TO_FLOAT32,
    )
    valid_data, valid_labels = drop_target_nan(valid_data, valid_labels)

    mae = mean_absolute_error(valid_labels, model.predict(valid_data))

    # Record the trial in MLflow
    with mlflow.start_run():
        mlflow.log_params(param)
        mlflow.log_metric("mae", mae)

    return mae

## MLflow

In [10]:
# MlflowClient is exported from the top-level package only in newer MLflow
# releases. Try that import first and fall back to the older location instead
# of comparing version strings (string comparison is lexicographic, so e.g.
# '1.9.0' <= '1.26.1' is False and would pick the wrong branch).
try:
    from mlflow import MlflowClient
except ImportError:
    from mlflow.tracking import MlflowClient

client = MlflowClient()

# Create the experiment on first use, then resolve its id.
experiment_info = client.get_experiment_by_name(experiment_name)
if experiment_info is None:
    mlflow.create_experiment(name=experiment_name)
    experiment_info = client.get_experiment_by_name(experiment_name)
experiment_id = experiment_info.experiment_id

# Make it the active experiment: mlflow.start_run() is called without an
# experiment id in the objective, so without this the runs would be logged
# to the default experiment rather than to `experiment_name`.
mlflow.set_experiment(experiment_name)
print(experiment_id)

1


## Search

In [None]:
%%time

# Wall-clock budget of the search; it stops after 120 trials or when this
# many hours have elapsed, whichever comes first.
hours = 10

# NOTE(review): `objective` never calls trial.report() / trial.should_prune(),
# so the median pruner looks inert for this study — confirm.
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.MedianPruner(n_warmup_steps=5),
)
study.optimize(objective, n_trials=120, timeout=60*60*hours)

print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-07-10 07:39:17,192] A new study created in memory with name: no-name-2b0bde28-f357-4dfd-858e-a5a75a6c562a
[I 2024-07-10 08:28:12,954] Trial 0 finished with value: 0.6785094677660589 and parameters: {'depth': 10, 'l2_leaf_reg': 20, 'bootstrap_type': 'Bernoulli', 'subsample': 0.27582087672093225}. Best is trial 0 with value: 0.6785094677660589.
[I 2024-07-10 08:44:35,567] Trial 1 finished with value: 0.6883500243528615 and parameters: {'depth': 7, 'l2_leaf_reg': 34, 'bootstrap_type': 'Bernoulli', 'subsample': 0.20210913811777667}. Best is trial 0 with value: 0.6785094677660589.
[I 2024-07-10 09:24:54,348] Trial 2 finished with value: 0.6726286397902979 and parameters: {'depth': 9, 'l2_leaf_reg': 38, 'bootstrap_type': 'MVS'}. Best is trial 2 with value: 0.6726286397902979.


In [14]:
# Report the score and the searched parameters of the best trial.
best_trial = study.best_trial
print("Best trial: score {}, params {}".format(best_trial.value, best_trial.params))

Best trial: score 0.6693457766093003, params {'depth': 10, 'l2_leaf_reg': 49, 'bootstrap_type': 'MVS'}


In [15]:
# Number of trials recorded in the study (the search ends at n_trials or at
# the timeout, whichever is reached first).
len(study.trials)

13

## Store study (optional)

In [20]:
# Serialize the whole study object to disk with joblib so it can be reloaded later.
joblib.dump(study, optuna_selected_feat_filenpath)

['.././data//optuna_selected_300.pkl']

## Train the best configuration on all days (selected 300 features) and store it for the initial submission

In [25]:
# Retrain on every available day with the best configuration found.
data_whole = date_ids
# study.best_trial.params contains only the *searched* values (depth,
# l2_leaf_reg, bootstrap_type, ...). Replaying them through params_catboost
# with a FixedTrial restores the fixed settings as well (loss_function="MAE",
# eval_metric, iterations, thread_count, random_seed), so the stored model is
# trained with the same configuration that was scored during the search
# instead of falling back to CatBoost's defaults.
best_params = params_catboost(optuna.trial.FixedTrial(study.best_trial.params))
best_model = fit_model(data_whole, best_params)

In [26]:
# Tag the model file with the first and last date id it was trained on and
# write it in CatBoost's binary "cbm" format.
start_day, end_day = date_ids[0], date_ids[-1]
model_path = f"../{config['MODEL_DIR']}/catboost_best_daya_{start_day}_{end_day}"
best_model.save_model(model_path, format="cbm")