# Imports

In [5]:
import json
import sys

from catboost import CatBoostRegressor
import h5py
import mlflow
import numpy as np
import optuna
import pandas as pd
import polars as pl
from sklearn.metrics import mean_squared_error

In [6]:
# Make the modules under ../src/ importable. The membership check keeps
# sys.path free of duplicate entries when this cell is re-run.
if '../src/' not in sys.path:
    sys.path.append('../src/')

# Configs

In [7]:
# Parse the JSON settings file into `config`, which later cells index by key.
settings_path = '../configs/settings.json'
with open(settings_path) as settings_file:
    config = json.loads(settings_file.read())

# Functions

In [8]:
def write_to_hdf5(df, filename):
    """Store every column of ``df`` as its own dataset in an HDF5 file.

    The file is opened in ``'w'`` mode, so an existing file at ``filename``
    is replaced. Each dataset is named after its column and holds the result
    of ``df[column].to_numpy()``.
    """
    with h5py.File(filename, 'w') as hdf_file:
        for name in df.columns:
            values = df[name].to_numpy()
            hdf_file.create_dataset(name, data=values)

def read_hdf5(filename):
    """Load every top-level dataset of an HDF5 file into memory.

    Requires ``numpy`` to be imported as ``np`` at notebook level.

    Parameters
    ----------
    filename : str
        Path of the HDF5 file to open read-only.

    Returns
    -------
    dict
        Maps each key of the file to a NumPy array built from that entry.
        Keys appear in the order the file yields them.
        NOTE(review): that order may differ from the order in which the
        columns were written -- confirm before relying on column positions.
    """
    with h5py.File(filename, 'r') as hdf_file:
        # The arrays are built while the file is still open; the dict holds
        # NumPy arrays rather than references into the file.
        return {name: np.array(hdf_file[name]) for name in hdf_file.keys()}

# Read all the features

In [9]:
%%time
# Load the full feature table from the directory named by config['RAW_DATA_DIR'].
features_path = f"../{config['RAW_DATA_DIR']}/full_features.parquet.gzip"
df_polars = pl.read_parquet(features_path)

In [10]:
# Display the (rows, columns) shape of the loaded feature table.
df_polars.shape

(5237980, 406)

## Validation split

In [13]:
# First date_id that belongs to the validation period; rows before it are
# used for training. Named once so both filters always use the same cutoff.
VALID_START_DATE_ID = 390

train_feas = df_polars.filter(pl.col('date_id') < VALID_START_DATE_ID)
valid_feas = df_polars.filter(pl.col('date_id') >= VALID_START_DATE_ID)

In [None]:
%%time
# Persist the training split, one HDF5 dataset per column.
write_to_hdf5(df=train_feas, filename='../data/train_feas.h5')

In [None]:
%%time
# Persist the validation split, one HDF5 dataset per column.
write_to_hdf5(df=valid_feas, filename='../data/valid_feas.h5')

# Intermediate Optuna search with CatBoost for feature selection

## Trial

In [None]:
def fit_catboost(trial):
    """Optuna objective: fit one CatBoost regressor and return its validation RMSE.

    Hyperparameters are sampled from ``trial``. The data is not passed in:
    the function reads the notebook-level names ``X_train``, ``y_train``,
    ``X_val`` and ``y_val``, which must be defined before the study runs.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Root mean squared error of the fitted model on ``(X_val, y_val)``.
    """
    param = {
        'iterations': 400,
        "learning_rate": trial.suggest_float("learning_rate", 0.001, 0.01),
        "l2_leaf_reg": trial.suggest_int("l2_leaf_reg", 2, 50),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.01, 0.8),
        "depth": trial.suggest_int("depth", 3, 9),
        "boosting_type": trial.suggest_categorical("boosting_type",
                                                   ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical("bootstrap_type",
                                                    ["Bayesian", "Bernoulli", "MVS"]),
        "used_ram_limit": "14gb",
    }

    # Conditional search space: bagging_temperature is sampled only for the
    # Bayesian bootstrap and subsample only for Bernoulli; nothing extra is
    # sampled for MVS.
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 20)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    model = CatBoostRegressor(
        **param,
        thread_count=-1,
        random_seed=42
    )

    model.fit(
        X_train,
        y_train,
        eval_set=(X_val, y_val),
        verbose=0,
        plot=False,
        early_stopping_rounds=50,
    )

    preds = model.predict(X_val)
    # Take the square root of the MSE here instead of passing squared=False:
    # that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, so
    # the old call raises TypeError on current versions.
    rmse = mean_squared_error(y_val, preds) ** 0.5

    return rmse

## Search

In [None]:
# Read the data from HDF5 files
%%time
train_data = read_hdf5('../data/train_feas.h5')
valid_data = read_hdf5('../data/valid_feas.h5')

# Separate features and target
X_train = np.column_stack([train_data[col] for col in train_data if col != 'target'])
y_train = train_data['target']
X_val = np.column_stack([valid_data[col] for col in valid_data if col != 'target'])
y_val = valid_data['target']

In [None]:
%%time
# Seed the sampler so a re-run proposes the same hyperparameter sequence.
# TPESampler is the sampler create_study uses by default, so only the seed
# is new. The timeout can still change how many trials finish.
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
# Stop after 120 trials or two hours, whichever comes first.
study.optimize(fit_catboost, n_trials=120, timeout=60 * 60 * 2)
print('Number of finished trials:', len(study.trials))
print('Best trial:', study.best_trial.params)

In [None]:
# Plot Optuna's hyperparameter-importance chart for the finished study.
optuna.visualization.plot_param_importances(study)

## Selection

In [None]:
# The models trained inside fit_catboost are local to each trial and are not
# kept, so refit one model with the best hyperparameters found by the study.
# 'iterations' and 'used_ram_limit' are fixed in fit_catboost rather than
# sampled, so they are not in best_trial.params and are repeated here.
# NOTE: this is a full training run and can take a while.
model = CatBoostRegressor(
    **study.best_trial.params,
    iterations=400,
    used_ram_limit="14gb",
    thread_count=-1,
    random_seed=42,
)
model.fit(
    X_train,
    y_train,
    eval_set=(X_val, y_val),
    verbose=0,
    early_stopping_rounds=50,
)

# X_train is a plain NumPy matrix without column labels. Its columns were
# stacked from train_data in iteration order, skipping 'target', so the
# names are rebuilt the same way.
feature_names = [col for col in train_data if col != 'target']

importance_df = pd.DataFrame({
    'feature_importance': model.get_feature_importance(),
    'feature_names': feature_names,
}).sort_values(by=['feature_importance'], ascending=False)

In [None]:
# Display the features ranked by importance, highest first.
importance_df

# Optuna intensive