# Imports

In [74]:
import json
import os
import sys

import h5py
import mlflow
import numpy as np
import optuna
import pandas as pd
import polars as pl
from catboost import CatBoostRegressor, Pool, sum_models
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit
from tqdm import tqdm

In [6]:
# Add ../src/ to the module search path so code stored there can be imported.
sys.path.append('../src/')

# Configs

In [7]:
# Load the project settings JSON; later cells read config['RAW_DATA_DIR'] from it.
settings_path = '../configs/settings.json'
with open(settings_path, 'r') as f:
    config = json.load(f)

In [32]:
# Directory with one HDF5 file per date id (load_daily_minimal reads '<date_id>.h5' from it).
folder_daily_h5 = f"../{config['RAW_DATA_DIR']}/daily/"
# Not echoed: a notebook cell only displays its last expression.
folder_daily_h5

# HDF5 file that holds the 'date_ids' dataset read by load_metadata.
metadata_filename = 'metadata.h5'
metadata_filepath = f"../{config['RAW_DATA_DIR']}/{metadata_filename}"
# NOTE(review): the recorded output shows a doubled slash, so RAW_DATA_DIR
# presumably already ends with '/' — harmless, but confirm.
metadata_filepath

'.././data//metadata.h5'

# Daily load trick

In [43]:
def load_metadata():
    """Return the full contents of the ``date_ids`` dataset.

    The dataset is read from the HDF5 file at the module-level
    ``metadata_filepath``; the file is opened read-only and closed again
    before the values are returned.
    """
    with h5py.File(metadata_filepath, 'r') as metadata_file:
        date_id_values = metadata_file['date_ids'][:]
    return date_id_values

In [45]:
# Load every date id once; the cell output is the number of ids available.
date_ids = load_metadata()
len(date_ids)

481

In [26]:
def load_daily_minimal(date_id, folder_daily_h5):
    """Read one day's feature matrix and target values from its HDF5 file.

    Parameters
    ----------
    date_id
        Identifier used to build the file name ``<date_id>.h5``.
    folder_daily_h5
        Directory that contains the daily HDF5 files.

    Returns
    -------
    tuple
        ``(features, target)``. ``features`` is the transpose of the stacked
        datasets found under ``data/features`` (one column per dataset, in the
        group's key order); ``target`` is the ``data/target`` dataset.
    """
    filepath = f'{folder_daily_h5}/{date_id}.h5'
    with h5py.File(filepath, 'r') as daily_file:
        data_group = daily_file['data']
        target = data_group['target'][:]

        features_group = data_group['features']
        columns = []
        for name in features_group.keys():
            columns.append(features_group[name][:])
        # Stacking yields one row per feature; transpose to one column per feature.
        features = np.array(columns).T

    return features, target

# Validation splits

In [60]:
# Split sizes, counted in entries of `date_ids`.
max_train_size = 60
test_size = 15
slide_step = 30
# NOTE(review): `gap` was used below but never assigned anywhere in this
# notebook, so the cell failed with NameError on a fresh kernel. 0 is
# scikit-learn's default for TimeSeriesSplit(gap=...) — confirm the intended value.
gap = 0
n_samples = len(date_ids)

effective_train_size = max_train_size + gap
n_splits = (n_samples - test_size) // (effective_train_size + test_size) + 1
print(n_splits)
# This splitter is replaced by SlidingWindowSplit a few cells further down.
tscv = TimeSeriesSplit(n_splits=n_splits,
                       max_train_size=max_train_size,
                       test_size=test_size,
                       gap=gap)


In [70]:
class SlidingWindowSplit:
    """Fixed-size sliding-window train/test splitter over an ordered sequence.

    Every split pairs ``train_days`` consecutive positions with the
    ``test_days`` positions that immediately follow them. Consecutive splits
    start ``slide_step`` positions apart.

    Parameters
    ----------
    data : sized
        Sequence being split; only ``len(data)`` is used.
    train_days : int
        Number of positions in each training window.
    test_days : int
        Number of positions in each test window.
    slide_step : int, default 1
        Offset between the starts of consecutive splits.
    """

    def __init__(self,
                 data,
                 train_days,
                 test_days,
                 slide_step=1):
        self.data = data
        self.train_days = train_days
        self.test_days = test_days
        self.slide_step = slide_step

    def _split_points(self):
        """Return the range of positions where a train window ends and a test window begins."""
        return range(self.train_days,
                     len(self.data) - self.test_days + 1,
                     self.slide_step)

    def __iter__(self):
        """Yield ``(train_indices, test_indices)`` as NumPy integer arrays."""
        for split_at in self._split_points():
            yield (np.arange(split_at - self.train_days, split_at),
                   np.arange(split_at, split_at + self.test_days))

    def __len__(self):
        """Return the number of splits that ``__iter__`` yields.

        Derived from the same range as ``__iter__`` so both always agree. The
        previous floor division under-counted whenever the span was not a
        multiple of ``slide_step`` and could go negative for short data.
        """
        return len(self._split_points())


In [71]:
# Sliding-window splitter over the date ids, configured from the split sizes defined earlier.
tscv = SlidingWindowSplit(
    data=date_ids,
    train_days=max_train_size,
    test_days=test_size,
    slide_step=slide_step,
)

In [73]:
# Preview the positional train/test indices of every fold.
for i, (train_index, test_index) in enumerate(
        tqdm(tscv, desc="Training and Validation")):
    print(f"Fold {i}:\n"
          f"  Train: index={train_index}\n"
          f"  Test:  index={test_index}")

Training and Validation: 14it [00:00, 1701.35it/s]             

Fold 0:
  Train: index=[ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47
 48 49 50 51 52 53 54 55 56 57 58 59]
  Test:  index=[60 61 62 63 64 65 66 67 68 69 70 71 72 73 74]
Fold 1:
  Train: index=[30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53
 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77
 78 79 80 81 82 83 84 85 86 87 88 89]
  Test:  index=[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104]
Fold 2:
  Train: index=[ 60  61  62  63  64  65  66  67  68  69  70  71  72  73  74  75  76  77
  78  79  80  81  82  83  84  85  86  87  88  89  90  91  92  93  94  95
  96  97  98  99 100 101 102 103 104 105 106 107 108 109 110 111 112 113
 114 115 116 117 118 119]
  Test:  index=[120 121 122 123 124 125 126 127 128 129 130 131 132 133 134]
Fold 3:
  Train: index=[ 90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 1




# Intermediate Optuna search on CatBoost for feature selection

## Trial

In [85]:
def _load_stacked_days(day_ids):
    """Load every day in ``day_ids`` and stack them into one feature matrix and one label vector."""
    daily_features = []
    daily_targets = []
    for date_id in day_ids:
        features, target = load_daily_minimal(date_id, folder_daily_h5)
        daily_features.append(features)
        daily_targets.append(target)
    return np.vstack(daily_features), np.hstack(daily_targets)


def _summed_predictions(models, data):
    """Return the element-wise sum of ``model.predict(data)`` over ``models``."""
    return np.sum([fitted.predict(data) for fitted in models], axis=0)


def fit_catboost(trial):
    """Optuna objective: sliding-window RMSE of batch-trained CatBoost models.

    For every fold of ``SlidingWindowSplit`` a 400-iteration
    ``CatBoostRegressor`` is fitted on the fold's training days. From the
    second fold on, the summed predictions of all previously fitted models are
    set as the training baseline, so each new model learns a correction on top
    of the earlier ones. Validation days are scored with the same sum
    (earlier models + the new one), and the RMSE over the validation rows of
    all folds is returned.

    Reads the module-level ``date_ids``, ``max_train_size``, ``test_size``,
    ``slide_step`` and ``folder_daily_h5``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error over all validation rows (lower is better).
    """
    param = {
        'iterations': 400,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "depth": trial.suggest_int("depth", 3, 9),
        "boosting_type": trial.suggest_categorical("boosting_type",
                                                   ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type",
                                                    ["Bayesian", "Bernoulli", "MVS"]),
        "used_ram_limit": "14gb"
    }

    # These parameters are only sampled for the bootstrap type that uses them.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    tscv = SlidingWindowSplit(date_ids,
                              max_train_size,
                              test_size,
                              slide_step)

    all_valid_labels = []
    all_valid_preds = []
    models = []

    for train_index, test_index in tqdm(tscv, desc="Training and Validation"):
        train_data, train_labels = _load_stacked_days(date_ids[train_index])
        valid_data, valid_labels = _load_stacked_days(date_ids[test_index])

        # Imputation of NaN labels with a sentinel value.
        # NOTE(review): if NaN targets actually occur, -9e10 will dominate both
        # the fit and the RMSE; dropping those rows is probably what is wanted — confirm.
        train_labels = np.nan_to_num(train_labels, nan=-9e10)
        valid_labels = np.nan_to_num(valid_labels, nan=-9e10)

        batch = Pool(train_data, label=train_labels)

        # Batch training, see
        # https://catboost.ai/en/docs/concepts/python-usages-examples#batch-training
        # The baseline must be the prediction of the whole ensemble so far, not
        # only of the most recent model (which is itself just a correction).
        if models:
            batch.set_baseline(_summed_predictions(models, batch))

        model = CatBoostRegressor(**param,
                                  thread_count=-1,
                                  random_seed=42)
        model.fit(batch, verbose=0)
        models.append(model)

        # Score with baseline + correction; the newest model alone omits the
        # baseline it was trained on top of.
        preds = _summed_predictions(models, valid_data)
        all_valid_labels.extend(valid_labels)
        all_valid_preds.extend(preds)

    # np.sqrt(MSE) instead of mean_squared_error(squared=False): the `squared`
    # argument is deprecated in recent scikit-learn releases.
    rmse = float(np.sqrt(mean_squared_error(all_valid_labels,
                                            all_valid_preds)))
    return rmse

## Search

In [None]:
%%time
# Minimise the RMSE returned by fit_catboost; the search is bounded by
# n_trials=120 and a timeout of 4 hours (in seconds).
study = optuna.create_study(direction='minimize')
study.optimize(fit_catboost, n_trials=120, timeout=4 * 60 * 60)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

[I 2024-07-05 05:37:38,518] A new study created in memory with name: no-name-d80d3240-4a02-48e6-8246-06f99a30b99e
Training and Validation:   8%|▊         | 1/13 [08:58<1:47:41, 538.44s/it]

In [None]:
# Echo the study object created by the search cell.
study

In [None]:
# Plot Optuna's hyperparameter-importance chart for the study.
optuna.visualization.plot_param_importances(study)

## Selection

In [None]:
# Table of CatBoost feature importances, sorted from most to least important.
# NOTE(review): `model`, `train_pool` and `x_val` are not defined at notebook
# level in the visible cells (`model` only exists inside fit_catboost), so this
# cell fails after Restart & Run All — a model fitted with the best trial's
# parameters and the feature names need to be created first.
importance_df = pd.DataFrame({'feature_importance': model.get_feature_importance(train_pool), 
              'feature_names': x_val.columns}).sort_values(by=['feature_importance'], 
                                                           ascending=False)

In [None]:
# Display the feature-importance table built in the previous cell.
importance_df

# Optuna intensive