# Overview
This notebook provides a training implementation for Gradient Boosting Models (GBMs), a popular machine learning technique for both classification and regression tasks. GBMs are ensemble methods that combine the predictions of several base estimators to improve accuracy and generalization.



# Inference
[[MITSUI-CPC] Gradient Boosting Models (Inference)](https://www.kaggle.com/code/takaito/mitsui-cpc-gradient-boosting-models-inference)

# Tips
## 1. CV Strategy
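The data is loaded in chronological order, so splitting it without shuffling, e.g. with kfold = KFold(n_splits=CFG.N_SPLIT, shuffle=False), yields folds that are contiguous blocks in time; in other words, the split follows the time series. (The code below builds similar contiguous folds by applying pd.qcut to the row index.)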

## 2. Feature Importance
For LightGBM, the feature importance of every fold is saved to a CSV file. Use it to check which features are effective; it can guide you in removing unnecessary features or creating new ones.

To be updated!! (I plan to add more hints if the number of votes increases.)

In [1]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
from glob import glob
from pathlib import Path
import joblib
import pickle
import itertools
from tqdm.auto import tqdm

import torch
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GroupKFold
from sklearn.metrics import log_loss, roc_auc_score, matthews_corrcoef, f1_score
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import xgboost as xgb
from catboost import Pool, CatBoostRegressor, CatBoostClassifier

In [2]:
!mkdir oof
!mkdir models

In [3]:
# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings, read as class attributes (the class is never instantiated here)."""

    # ---- Run identity -------------------------------------------------
    AUTHOR = 'takaito'
    VER = 1
    SEED = 42
    COMPETITION = 'mitsui-commodity-prediction-challenge'
    # Prefix embedded in every artifact file name written by this notebook.
    PREFIX = f'{AUTHOR}_seed{SEED}_ver{VER}'

    # ---- Locations ----------------------------------------------------
    DATA_PATH = Path('/kaggle/input/mitsui-commodity-prediction-challenge')
    OOF_DATA_PATH = Path('./oof')
    MODEL_DATA_PATH = Path('./models')

    # ---- Experiment layout ---------------------------------------------
    METHOD_LIST = ['lightgbm', 'xgboost', 'catboost']
    N_SPLIT = 3
    USE_GPU = torch.cuda.is_available()
    metric = 'rmse'
    metric_maximize_flag = False

    # ---- Boosting schedule shared by the training helpers ---------------
    num_boost_round = 2500
    early_stopping_round = 10
    verbose = 50

    # ---- Per-library hyper-parameters ------------------------------------
    regression_lgb_params = dict(
        objective='regression',
        metric='rmse',
        learning_rate=0.005,
        num_leaves=6,
        seed=SEED,
    )
    regression_xgb_params = dict(
        objective='reg:squarederror',
        eval_metric='rmse',
        learning_rate=0.005,
        max_depth=4,
        random_state=SEED,
    )
    # CatBoost takes its iteration budget through the parameter dict.
    regression_cat_params = dict(
        loss_function='RMSE',
        learning_rate=0.005,
        iterations=num_boost_round,
        depth=4,
        random_seed=SEED,
    )

In [4]:
# ====================================================
# Seed everything
# ====================================================
def seed_everything(seed):
    """Seed Python's `random` and NumPy's global RNG and export PYTHONHASHSEED.

    :param seed: value passed to both seeders and written (as a string) to the
                 PYTHONHASHSEED environment variable
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
seed_everything(CFG.SEED)

In [5]:
SOLUTION_NULL_FILLER = -999999


def rank_correlation_sharpe_ratio(merged_df: pd.DataFrame) -> float:
    """
    Compute, for every row, the correlation between the ranks of the predictions
    and the ranks of the targets, then return mean / standard deviation of those
    per-row correlations (a Sharpe ratio).

    :param merged_df: DataFrame holding prediction columns (names starting with
                      'prediction_') and target columns (names starting with 'target_')
    :return: Sharpe ratio of the per-row rank correlations
    :raises ValueError: If a row has no non-null target at all
    :raises ZeroDivisionError: If a row's targets or predictions are constant, or if
                               the per-row correlations have zero standard deviation
    """
    target_cols = [name for name in merged_df.columns if name.startswith('target_')]
    prediction_cols = [name for name in merged_df.columns if name.startswith('prediction_')]

    def _compute_rank_correlation(row):
        # Only targets that are present in this row take part in the correlation.
        live_targets = [name for name in target_cols if not pd.isnull(row[name])]
        if len(live_targets) == 0:
            raise ValueError('No non-null target values found')
        # Keep the predictions whose counterpart target is present.
        live_predictions = [
            name for name in prediction_cols
            if name.replace('prediction', 'target') in live_targets
        ]

        target_values = row[live_targets]
        prediction_values = row[live_predictions]
        if target_values.std(ddof=0) == 0 or prediction_values.std(ddof=0) == 0:
            raise ZeroDivisionError('Denominator is zero, unable to compute rank correlation.')

        ranked_predictions = prediction_values.rank(method='average')
        ranked_targets = target_values.rank(method='average')
        return np.corrcoef(ranked_predictions, ranked_targets)[0, 1]

    daily_rank_corrs = merged_df.apply(_compute_rank_correlation, axis=1)
    spread = daily_rank_corrs.std(ddof=0)
    if spread == 0:
        raise ZeroDivisionError('Denominator is zero, unable to compute Sharpe ratio.')
    return float(daily_rank_corrs.mean() / spread)


def score(solution: pd.DataFrame, submission: pd.DataFrame, row_id_column_name: str) -> float:
    """
    Calculates the rank correlation between predictions and target values,
    and returns its Sharpe ratio (mean / standard deviation).

    The caller's DataFrames are left untouched, so the function can be called
    repeatedly with the same objects.

    :param solution: ground-truth frame; SOLUTION_NULL_FILLER marks missing targets
    :param submission: prediction frame with the same columns as `solution`
    :param row_id_column_name: name of the id column present in both frames
    :return: Sharpe ratio of the per-row rank correlations
    :raises KeyError: If the id column is missing from either frame
    """
    # `drop` returns new frames; `del frame[col]` would remove the id column from
    # the caller's objects and make a second call fail with KeyError.
    solution = solution.drop(columns=[row_id_column_name])
    submission = submission.drop(columns=[row_id_column_name])
    assert all(solution.columns == submission.columns)

    submission = submission.rename(columns={col: col.replace('target_', 'prediction_') for col in submission.columns})

    # Not all securities trade on all dates, but solution files cannot contain nulls.
    # The filler value allows us to handle trading halts, holidays, & delistings.
    solution = solution.replace(SOLUTION_NULL_FILLER, None)
    return rank_correlation_sharpe_ratio(pd.concat([solution, submission], axis='columns'))

In [6]:
def lightgbm_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train one LightGBM booster with early stopping and predict the validation rows.

    Hyper-parameters, the boosting-round budget, the early-stopping patience and the
    logging period all come from CFG. Both the training and the validation set are
    passed as `valid_sets`, so both are evaluated during training.

    :param x_train: training features
    :param y_train: training target
    :param x_valid: validation features
    :param y_valid: validation target
    :return: tuple (trained booster, predictions for `x_valid`)
    """
    train_set = lgb.Dataset(x_train, y_train)
    valid_set = lgb.Dataset(x_valid, y_valid)

    callbacks = [
        lgb.early_stopping(stopping_rounds=CFG.early_stopping_round, verbose=CFG.verbose),
        lgb.log_evaluation(CFG.verbose),
    ]
    booster = lgb.train(
        params=CFG.regression_lgb_params,
        train_set=train_set,
        num_boost_round=CFG.num_boost_round,
        valid_sets=[train_set, valid_set],
        callbacks=callbacks,
    )
    # Predictions for the held-out rows, later stored as out-of-fold predictions.
    return booster, booster.predict(x_valid)
def xgboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Train one XGBoost booster with early stopping and predict the validation rows.

    Hyper-parameters, the boosting-round budget, the early-stopping patience and the
    logging period all come from CFG.

    :param x_train: training features
    :param y_train: training target
    :param x_valid: validation features
    :param y_valid: validation target
    :return: tuple (trained booster, predictions for `x_valid`)
    """
    xgb_train = xgb.DMatrix(data=x_train, label=y_train)
    xgb_valid = xgb.DMatrix(data=x_valid, label=y_valid)
    model = xgb.train(
        CFG.regression_xgb_params,
        dtrain=xgb_train,
        num_boost_round=CFG.num_boost_round,
        # With several entries in `evals`, XGBoost uses the last one for early stopping.
        evals=[(xgb_train, 'train'), (xgb_valid, 'eval')],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose_eval=CFG.verbose,
    )
    # `xgb.train` returns the booster from the LAST round, not the best one.
    # Restrict prediction to the best iteration explicitly so the validation
    # predictions do not depend on the installed XGBoost version's default.
    best_iteration = getattr(model, 'best_iteration', None)
    if best_iteration is None:
        valid_pred = model.predict(xgb_valid)
    else:
        valid_pred = model.predict(xgb_valid, iteration_range=(0, best_iteration + 1))
    return model, valid_pred
def catboost_training(x_train: pd.DataFrame, y_train: pd.DataFrame, x_valid: pd.DataFrame, y_valid: pd.DataFrame):
    """Fit one CatBoostRegressor with early stopping and predict the validation rows.

    Model parameters (including the iteration budget), the early-stopping patience
    and the logging period come from CFG; `use_best_model=True` is passed to `fit`.

    :param x_train: training features
    :param y_train: training target
    :param x_valid: validation features
    :param y_valid: validation target
    :return: tuple (fitted regressor, predictions for `x_valid`)
    """
    regressor = CatBoostRegressor(**CFG.regression_cat_params)
    train_pool = Pool(data=x_train, label=y_train)
    valid_pool = Pool(data=x_valid, label=y_valid)
    regressor.fit(
        train_pool,
        eval_set=[valid_pool],
        early_stopping_rounds=CFG.early_stopping_round,
        verbose=CFG.verbose,
        use_best_model=True,
    )
    # Predictions for the held-out rows, later stored as out-of-fold predictions.
    return regressor, regressor.predict(x_valid)

def gradient_boosting_model_cv_training(method: str, train_df: pd.DataFrame, features: list, target_cols: list):
    """Run cross-validated training for one boosting library and persist the artifacts.

    For every fold id 1..CFG.N_SPLIT the rows whose 'cv_flag' equals the fold id are
    held out, a model is trained on the remaining rows, the model is pickled into
    CFG.MODEL_DATA_PATH (plus a feature-importance CSV for LightGBM), and the
    held-out predictions are collected. The out-of-fold predictions are written to
    the 'pred' column of `train_df` and saved as a .npy file in CFG.OOF_DATA_PATH.

    :param method: one of 'lightgbm', 'xgboost', 'catboost'
    :param train_df: frame containing `features`, a 'target' column and a 'cv_flag' column
    :param features: names of the feature columns
    :param target_cols: not used by this function; kept for interface compatibility
    :raises ValueError: If `method` is not a supported library name
    """
    supported_methods = ('lightgbm', 'xgboost', 'catboost')
    if method not in supported_methods:
        # Fail early with a clear message instead of an UnboundLocalError after
        # the first fold has already been prepared.
        raise ValueError(f'Unknown method {method!r}; expected one of {supported_methods}')

    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train_df))
    for fold in range(CFG.N_SPLIT):
        fold_id = fold + 1
        print('-'*50)
        print(f'{method} training fold {fold_id}')
        # Compute the held-out mask once and reuse it for every selection below.
        valid_mask = (train_df['cv_flag'] == fold_id).to_numpy()
        x_train = train_df.loc[~valid_mask, features]
        y_train = train_df.loc[~valid_mask, 'target']
        valid_df = train_df[valid_mask].copy()
        x_valid = valid_df[features]
        y_valid = valid_df['target']
        if method == 'lightgbm':
            model, valid_pred = lightgbm_training(x_train, y_train, x_valid, y_valid)
            ## 2. feature importance
            importance_df = pd.DataFrame(model.feature_importance(), index=features, columns=['importance']).reset_index()
            importance_df.to_csv(CFG.MODEL_DATA_PATH / f'{method}_fold{fold_id}_{CFG.PREFIX}_importance.csv', index=False)
        elif method == 'xgboost':
            model, valid_pred = xgboost_training(x_train, y_train, x_valid, y_valid)
        else:
            model, valid_pred = catboost_training(x_train, y_train, x_valid, y_valid)

        # Save best model; the context manager guarantees the file is flushed and closed.
        with open(CFG.MODEL_DATA_PATH / f'{method}_fold{fold_id}_{CFG.PREFIX}.pkl', 'wb') as model_file:
            pickle.dump(model, model_file)
        # Add to out of folds array
        oof_predictions[valid_mask] = valid_pred
        del x_train, x_valid, y_train, y_valid, model, valid_pred, valid_df
        gc.collect()

    train_df['pred'] = oof_predictions
    # Persist the out of folds predictions (np.save appends the .npy suffix)
    np.save(CFG.OOF_DATA_PATH / f'oof_{method}_{CFG.PREFIX}', oof_predictions)

In [7]:
# Read the competition CSVs with polars, then convert to pandas for the
# downstream code. (Plain string literals: the f-prefix had no placeholders.)
train_df = pl.read_csv(CFG.DATA_PATH / 'train.csv').to_pandas()
train_labels_df = pl.read_csv(CFG.DATA_PATH / 'train_labels.csv').to_pandas()

In [8]:
# Every column of train.csv except the first is used as a model feature
# (presumably the first column is the row/date key -- confirm against the data).
original_features = list(train_df.columns[1:])

In [9]:
# Every column of train_labels.csv except the first is treated as a target
# (presumably the first column is the row/date key -- confirm against the data).
target_cols = list(train_labels_df.columns[1:])

In [10]:
# Assign each row a fold id in 1..CFG.N_SPLIT by quantile-binning the row index
# (labels=False yields bin numbers starting at 0, hence the +1). With the
# default row index this makes every fold one contiguous block of rows.
train_df['cv_flag'] = pd.qcut(train_df.index, CFG.N_SPLIT, labels=False) + 1

In [11]:
# Reshape to long format: one copy of the feature frame per target column,
# tagged with the target's position ('target_id') and its values ('target').
training_df = []
for j, target_col in enumerate(target_cols):
    y = train_labels_df[target_col].values
    # Keep rows whose label is a finite number of magnitude at most 1e10.
    mask = np.isfinite(y) & (np.abs(y) <= 1e10)
    temp_train_df = train_df.assign(target_id=j, target=y)
    training_df.append(temp_train_df.loc[mask].copy())
# Stack the per-target frames and renumber the rows from 0.
training_df = pd.concat(training_df, ignore_index=True)

In [12]:
# Run the cross-validated training once per library in CFG.METHOD_LIST.
# 'target_id' is appended to the feature list so the single shared model can
# tell the targets apart. A copy of the frame is passed because the training
# function writes a 'pred' column into the frame it receives.
for method in CFG.METHOD_LIST:
    gradient_boosting_model_cv_training(method, training_df.copy(), original_features + ['target_id'], target_cols)

--------------------------------------------------
lightgbm training fold 1
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.246962 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 140983
[LightGBM] [Info] Number of data points in the train set: 484761, number of used features: 553
[LightGBM] [Info] Start training from score -0.000052
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[39]	training's rmse: 0.0299882	valid_1's rmse: 0.0350345
--------------------------------------------------
lightgbm training fold 2
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 1.321417 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 142196
[LightGBM] [Info] Number of data points in the train set: 483219, number of used features: 558
[LightGBM] [Info] Start training from score -0.000072
