In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **Machine Learning Final Project**
Below is the code we used for our submission to the "Optiver: Trading at the Close" competition. In total, we tried three models, listed here with their best scores:

**1. XGBoost (5.5072)**: <https://www.kaggle.com/code/re6125015ncku/final-test/notebook>

**2. CatBoost (5.3443)**: <https://www.kaggle.com/re6125015ncku/catboost>

**3. LightGBM (5.3360)**: <https://www.kaggle.com/code/re6125015ncku/lightgbm/notebook>

All of these models use feature engineering (described below), and cross-validation was used both to determine the best parameters for each model and to validate the results.

The code below combines the three models into a single notebook. The model we chose in the end is LightGBM, which returned the lowest MAE.

 # **Feature Engineering**
 This part is adapted from:
 1. The remarkable work of Angle: <https://www.kaggle.com/code/lblhandsome/optiver-robust-best-single-model/notebook>
 2. The explanation from Zulqarnain Ali: <https://www.kaggle.com/code/zulqarnainali/explained-singel-model-optiver>

In [None]:
# Importing necessary libraries
import gc
import os
import time
import warnings
from itertools import combinations
from warnings import simplefilter

import joblib
import lightgbm as lgb  # LightGBM gradient boosting framework
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
import xgboost as xgb
from catboost import CatBoostRegressor, EFeaturesSelectionAlgorithm, EFstrType, EShapCalcType
from numba import njit, prange
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV, KFold, TimeSeriesSplit, cross_val_score

# Silence all warnings for the rest of the notebook; the second call explicitly
# ignores pandas PerformanceWarning as well.
warnings.filterwarnings("ignore")
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [None]:
# Run configuration
is_offline = False    # True: hold out date_id > split_day as a validation set
is_train = True       # True: build features and train the models
is_infer = True       # True: run the competition inference loop
split_day = 435       # Last date_id of the offline training split
max_lookback = np.nan

# Load the training data and keep only rows that have a target value.
df = (
    pd.read_csv("/kaggle/input/optiver-trading-at-the-close/train.csv")
    .dropna(subset=["target"])
    .reset_index(drop=True)
)
df.shape

In [None]:
def weighted_average(a):
    """Return one weight per element of ``a``, doubling towards the end.

    Only ``len(a)`` is used. The last element gets weight 1/2, the one before
    it 1/4, and so on; the first element shares the weight of the second one,
    so the weights of a non-empty input add up to 1.
    """
    n = len(a)
    # max(position, 2) gives the first slot the same exponent as the second.
    return [1 / (2 ** (n + 1 - max(position, 2))) for position in range(1, n + 1)]

In [None]:
def reduce_mem_usage(df, verbose=0):
    """
    Downcast the numeric columns of a dataframe in place to reduce memory usage.

    Integer columns are cast to the smallest of int8/int16/int32/int64 whose
    range strictly contains the column's min and max. Every other non-object
    column is cast to float32. Object columns are left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink; its columns are reassigned in place.
    verbose : int or bool, default 0
        When truthy, print the memory usage before and after the conversion.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with downcast columns.
    """
    # Calculate the initial memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2

    int_candidates = (np.int8, np.int16, np.int32, np.int64)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type != object:
            if str(col_type)[:3] == "int":
                c_min = df[col].min()
                c_max = df[col].max()
                # Pick the first (smallest) integer type that strictly contains the range.
                for candidate in int_candidates:
                    limits = np.iinfo(candidate)
                    if c_min > limits.min and c_max < limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            else:
                # float16 is deliberately not used: all remaining columns become float32.
                df[col] = df[col].astype(np.float32)

    if verbose:
        # Plain print keeps the report visible in the notebook output without
        # requiring a configured logger.
        print(f"Memory usage of dataframe is {start_mem:.2f} MB")
        end_mem = df.memory_usage().sum() / 1024**2
        print(f"Memory usage after optimization is: {end_mem:.2f} MB")
        decrease = 100 * (start_mem - end_mem) / start_mem
        print(f"Decreased by {decrease:.2f}%")

    return df

In [None]:
# Numba-compiled kernel that computes the triplet imbalance for every row and
# every column triplet; the outer loop over triplets runs with prange.
@njit(parallel=True)
def compute_triplet_imbalance(df_values, comb_indices):
    """Compute (max - mid) / (mid - min) for each row and each column triplet.

    df_values is indexed as df_values[row, column]; comb_indices is a sequence
    of (a, b, c) column positions into df_values. Returns an array of shape
    (num_rows, num_combinations). Entries where mid == min are NaN.
    """
    num_rows = df_values.shape[0]
    num_combinations = len(comb_indices)
    imbalance_features = np.empty((num_rows, num_combinations))

    # Loop through all combinations of triplets
    for i in prange(num_combinations):
        a, b, c = comb_indices[i]
        
        # Loop through rows of the DataFrame
        for j in range(num_rows):
            max_val = max(df_values[j, a], df_values[j, b], df_values[j, c])
            min_val = min(df_values[j, a], df_values[j, b], df_values[j, c])
            # The middle value is what remains of the sum once min and max are removed.
            mid_val = df_values[j, a] + df_values[j, b] + df_values[j, c] - min_val - max_val
            
            # Prevent division by zero
            if mid_val == min_val:
                imbalance_features[j, i] = np.nan
            else:
                imbalance_features[j, i] = (max_val - mid_val) / (mid_val - min_val)

    return imbalance_features


def calculate_triplet_imbalance_numba(price, df):
    """Build the triplet-imbalance features for every 3-combination of ``price``.

    ``price`` is a list of column names of ``df``. The result is a new
    DataFrame with one column per triplet, named ``"{a}_{b}_{c}_imb2"``, and a
    default integer index.
    """
    triplets = list(combinations(price, 3))

    # Numba works on plain arrays, so translate column names to positions.
    values = df[price].values
    position_triplets = [
        (price.index(a), price.index(b), price.index(c)) for a, b, c in triplets
    ]

    imbalance = compute_triplet_imbalance(values, position_triplets)

    column_names = [f"{a}_{b}_{c}_imb2" for a, b, c in triplets]
    return pd.DataFrame(imbalance, columns=column_names)

In [None]:
def imbalance_features(df):
    """Add order-book imbalance, momentum, lag and rolling features to ``df``.

    Reads the raw competition columns (prices, sizes, ``stock_id``,
    ``imbalance_buy_sell_flag``) and the module-level ``weights`` mapping
    (stock_id -> weight). New columns are appended to ``df``.

    The grouped rolling statistics are computed with pandas, so no extra
    dataframe library is needed. The frame is re-indexed to 0..n-1 before that
    step, so the returned frame always has a default integer index.
    Infinite values in the result are replaced by 0.

    Returns
    -------
    pandas.DataFrame
        The input columns plus all engineered features.
    """
    prices = ["reference_price", "far_price", "near_price", "ask_price", "bid_price", "wap"]
    sizes = ["matched_size", "bid_size", "ask_size", "imbalance_size"]

    # Basic book quantities
    df["volume"] = df.eval("ask_size + bid_size")
    df["mid_price"] = df.eval("(ask_price + bid_price) / 2")
    df["liquidity_imbalance"] = df.eval("(bid_size-ask_size)/(bid_size+ask_size)")
    df["matched_imbalance"] = df.eval("(imbalance_size-matched_size)/(matched_size+imbalance_size)")
    df["size_imbalance"] = df.eval("bid_size / ask_size")

    # Pairwise normalised price differences
    for c in combinations(prices, 2):
        df[f"{c[0]}_{c[1]}_imb"] = df.eval(f"({c[0]} - {c[1]})/({c[0]} + {c[1]})")

    # Triplet imbalances for a price subset and for all sizes
    for c in [['ask_price', 'bid_price', 'wap', 'reference_price'], sizes]:
        triplet_feature = calculate_triplet_imbalance_numba(c, df)
        df[triplet_feature.columns] = triplet_feature.values

    df["stock_weights"] = df["stock_id"].map(weights)
    df["weighted_wap"] = df["stock_weights"] * df["wap"]
    df['wap_momentum'] = df.groupby('stock_id')['weighted_wap'].pct_change(periods=6)

    df["imbalance_momentum"] = df.groupby(['stock_id'])['imbalance_size'].diff(periods=1) / df['matched_size']
    df["price_spread"] = df["ask_price"] - df["bid_price"]
    df["spread_intensity"] = df.groupby(['stock_id'])['price_spread'].diff()
    df['price_pressure'] = df['imbalance_size'] * (df['ask_price'] - df['bid_price'])
    df['market_urgency'] = df['price_spread'] * df['liquidity_imbalance']
    df['depth_pressure'] = (df['ask_size'] - df['bid_size']) * (df['far_price'] - df['near_price'])

    df['spread_depth_ratio'] = (df['ask_price'] - df['bid_price']) / (df['bid_size'] + df['ask_size'])
    # Sign of the 5-row change in mid price (+1 / -1, and 0 for no change or NaN).
    # NOTE(review): this diff is taken over consecutive rows of the frame, not
    # per stock_id like the other lag features - confirm this is intended.
    df['mid_price_movement'] = np.sign(df['mid_price'].diff(periods=5)).fillna(0).astype(np.int64)

    df['micro_price'] = ((df['bid_price'] * df['ask_size']) + (df['ask_price'] * df['bid_size'])) / (df['bid_size'] + df['ask_size'])
    df['relative_spread'] = (df['ask_price'] - df['bid_price']) / df['wap']

    # Row-wise statistics across all prices and all sizes
    for func in ["mean", "std", "skew", "kurt"]:
        df[f"all_prices_{func}"] = df[prices].agg(func, axis=1)
        df[f"all_sizes_{func}"] = df[sizes].agg(func, axis=1)

    # Per-stock lags and returns
    for col in ['matched_size', 'imbalance_size', 'reference_price', 'imbalance_buy_sell_flag']:
        for window in [1, 3, 5, 10]:
            df[f"{col}_shift_{window}"] = df.groupby('stock_id')[col].shift(window)
            df[f"{col}_ret_{window}"] = df.groupby('stock_id')[col].pct_change(window)

    # Per-stock differences
    for col in ['ask_price', 'bid_price', 'ask_size', 'bid_size', 'weighted_wap', 'price_spread']:
        for window in [1, 3, 5, 10]:
            df[f"{col}_diff_{window}"] = df.groupby("stock_id")[col].diff(window)

    for window in [3, 5, 10]:
        df[f'price_change_diff_{window}'] = df[f'bid_price_diff_{window}'] - df[f'ask_price_diff_{window}']
        df[f'size_change_diff_{window}'] = df[f'bid_size_diff_{window}'] - df[f'ask_size_diff_{window}']

    # Per-stock rolling mean/std of the diff features.
    # A unique 0..n-1 index lets the grouped results be aligned back by index.
    df = df.reset_index(drop=True)
    rolling_columns = ['ask_price', 'bid_price', 'ask_size', 'bid_size']
    for window in [3, 5, 10]:
        diff_columns = [f"{col}_diff_{window}" for col in rolling_columns]
        rolled = df.groupby("stock_id")[diff_columns].rolling(window)
        # Drop the stock_id level so only the original row index remains.
        rolled_mean = rolled.mean().reset_index(level=0, drop=True)
        rolled_std = rolled.std().reset_index(level=0, drop=True)
        for col, diff_col in zip(rolling_columns, diff_columns):
            df[f'rolling_diff_{col}_{window}'] = rolled_mean[diff_col]
            df[f'rolling_std_diff_{col}_{window}'] = rolled_std[diff_col]
    gc.collect()

    df['mid_price*volume'] = df['mid_price_movement'] * df['volume']
    df['harmonic_imbalance'] = df.eval('2 / ((1 / bid_size) + (1 / ask_size))')

    # Divisions by zero above can produce +/-inf; neutralise them.
    df = df.replace([np.inf, -np.inf], 0)

    return df

def other_features(df):
    """Add time-based features and per-stock global statistics to ``df``.

    Columns are added to ``df`` in place and the same frame is returned.
    The per-stock statistics come from the module-level
    ``global_stock_id_feats`` mapping (name -> Series indexed by stock_id).
    """
    seconds_in_bucket = df["seconds_in_bucket"]

    df["dow"] = df["date_id"] % 5  # Day of the week (assumes 5 trading days per week)
    df["seconds"] = seconds_in_bucket % 60
    df["minute"] = seconds_in_bucket // 60
    df['time_to_market_close'] = 540 - seconds_in_bucket

    stock_ids = df["stock_id"]
    for stat_name, per_stock_values in global_stock_id_feats.items():
        df[f"global_{stat_name}"] = stock_ids.map(per_stock_values.to_dict())

    return df

def generate_all_features(df):
    """Run the full feature pipeline and return only the model feature columns.

    ``row_id``, ``time_id`` and ``target`` are dropped before feature
    generation; ``date_id`` is used by the pipeline but removed from the
    returned frame.
    """
    dropped_inputs = ("row_id", "time_id", "target")
    df = df[[name for name in df.columns if name not in dropped_inputs]]

    df = imbalance_features(df)
    gc.collect()
    df = other_features(df)
    gc.collect()

    non_features = ("row_id", "target", "time_id", "date_id")
    return df[[name for name in df.columns if name not in non_features]]

In [None]:
# Per-stock weights, listed in stock_id order (position 0 -> stock_id 0).
# NOTE(review): presumably the index weights of each stock - confirm the source.
_weights_in_stock_id_order = (
    0.004, 0.001, 0.002, 0.006, 0.004, 0.004, 0.002, 0.006, 0.006, 0.002, 0.002, 0.008,
    0.006, 0.002, 0.008, 0.006, 0.002, 0.006, 0.004, 0.002, 0.004, 0.001, 0.006, 0.004,
    0.002, 0.002, 0.004, 0.002, 0.004, 0.004, 0.001, 0.001, 0.002, 0.002, 0.006, 0.004,
    0.004, 0.004, 0.006, 0.002, 0.002, 0.04 , 0.002, 0.002, 0.004, 0.04 , 0.002, 0.001,
    0.006, 0.004, 0.004, 0.006, 0.001, 0.004, 0.004, 0.002, 0.006, 0.004, 0.006, 0.004,
    0.006, 0.004, 0.002, 0.001, 0.002, 0.004, 0.002, 0.008, 0.004, 0.004, 0.002, 0.004,
    0.006, 0.002, 0.004, 0.004, 0.002, 0.004, 0.004, 0.004, 0.001, 0.002, 0.002, 0.008,
    0.02 , 0.004, 0.006, 0.002, 0.02 , 0.002, 0.002, 0.006, 0.004, 0.002, 0.001, 0.02,
    0.006, 0.001, 0.002, 0.004, 0.001, 0.002, 0.006, 0.006, 0.004, 0.006, 0.001, 0.002,
    0.004, 0.006, 0.006, 0.001, 0.04 , 0.006, 0.002, 0.004, 0.002, 0.002, 0.006, 0.002,
    0.002, 0.004, 0.006, 0.006, 0.002, 0.002, 0.008, 0.006, 0.004, 0.002, 0.006, 0.002,
    0.004, 0.006, 0.002, 0.004, 0.001, 0.004, 0.002, 0.004, 0.008, 0.006, 0.008, 0.002,
    0.004, 0.002, 0.001, 0.004, 0.004, 0.004, 0.006, 0.008, 0.004, 0.001, 0.001, 0.002,
    0.006, 0.004, 0.001, 0.002, 0.006, 0.004, 0.006, 0.008, 0.002, 0.002, 0.004, 0.002,
    0.04 , 0.002, 0.002, 0.004, 0.002, 0.002, 0.006, 0.02 , 0.004, 0.002, 0.006, 0.02,
    0.001, 0.002, 0.006, 0.004, 0.006, 0.004, 0.004, 0.004, 0.004, 0.002, 0.004, 0.04,
    0.002, 0.008, 0.002, 0.004, 0.001, 0.004, 0.006, 0.004,
)
# Mapping stock_id -> weight, used with Series.map in the feature pipeline.
weights = dict(enumerate(_weights_in_stock_id_order))

In [None]:
# Pick the training frame according to the offline/online switch
if not is_offline:
    # Online mode: train on every labelled row
    df_train = df
    print("Online mode")
else:
    # Offline mode: days up to split_day train the model, later days validate it
    df_train = df[df["date_id"] <= split_day]
    df_valid = df[df["date_id"] > split_day]
    print("Offline mode")
    print(f"train : {df_train.shape}, valid : {df_valid.shape}")

# The raw frame is no longer needed under the name `df`
del df
gc.collect()

In [None]:
if is_train:
    # Per-stock statistics of the training data, consumed by other_features()
    by_stock = df_train.groupby("stock_id")
    global_stock_id_feats = {
        "median_size": by_stock["bid_size"].median() + by_stock["ask_size"].median(),
        "std_size": by_stock["bid_size"].std() + by_stock["ask_size"].std(),
        "ptp_size": by_stock["bid_size"].max() - by_stock["bid_size"].min(),
        "median_price": by_stock["bid_price"].median() + by_stock["ask_price"].median(),
        "std_price": by_stock["bid_price"].std() + by_stock["ask_price"].std(),
        "ptp_price": by_stock["bid_price"].max() - by_stock["ask_price"].min(),
    }

    # Training features are needed in both modes; validation features only offline
    df_train_feats = generate_all_features(df_train)
    if is_offline:
        print("Build Train Feats Finished.")
        df_valid_feats = generate_all_features(df_valid)
        print("Build Valid Feats Finished.")
        df_valid_feats = reduce_mem_usage(df_valid_feats)
    else:
        print("Build Online Train Feats Finished.")

    df_train_feats = reduce_mem_usage(df_train_feats)

# **XGBoost**

For results and visualization, please go to https://www.kaggle.com/code/re6125015ncku/final-test/notebook

In [None]:
# Train procedure: hold out the days after (split_day - 45) as a validation set
if is_train:
    offline_split = df_train['date_id']>(split_day - 45)
    X_train = df_train_feats[~offline_split]
    X_val = df_train_feats[offline_split]
    y_train = df_train['target'][~offline_split]
    y_val = df_train['target'][offline_split]
    # df_train is deliberately NOT deleted here: the CatBoost and LightGBM
    # sections further down still read df_train['target'] and df_train['date_id'].
    gc.collect()
    
X_train.shape, X_val.shape

In [None]:
## Hyperparameter tuning with GridSearchCV and TimeSeriesSplit, because the data
## is a time series (each fold validates on rows that come after its training rows)
if is_train:
    cv_split = TimeSeriesSplit(n_splits=5)

    param_grid = {
        # Tuned parameters
        'learning_rate': [0.1],
        'n_estimators': [200],
        'subsample': [0.6, 0.7],
        'colsample_bytree': [0.6, 0.7],
        'reg_alpha':[0.5],
        'gamma':[0.5],

        ##fixed_dict
        'booster': ['gbtree'],
        'max_depth': [8],
        'min_child_weight': [3],
        'grow_policy': ['depthwise'],
        'objective': ['reg:absoluteerror'],
        'num_class': [1],
        'device' : ['gpu'],
        'eval_metric' : ['mae'],
        'random_state' : [42],
        'tree_method' : ['hist']
    }

    # Create the XGBoost model object
    xgb_model = xgb.XGBRegressor()
    # Rank candidates by (negated) MAE, the competition metric, instead of the
    # estimator's default R^2 score. This matches the CatBoost and LightGBM searches.
    grid_search = GridSearchCV(xgb_model, param_grid, cv=cv_split, scoring='neg_mean_absolute_error')
    # Fit the GridSearchCV object to the training data
    grid_search.fit(X_train, y_train)
    # Print the best set of hyperparameters and the corresponding score
    # (the score is the negative MAE, so values closer to 0 are better)
    print("Best set of hyperparameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)

    best_params = grid_search.best_params_

In [None]:
## Parameters adjusted by hand, starting from the values returned in best_params
## (version 4 of this configuration scored 5.5131).
## Alternative: xgb.XGBRegressor(**best_params) uses the grid-search result unchanged.
xgb_params = dict(
    booster='gbtree',
    colsample_bytree=1.0,
    device='gpu',
    eval_metric='mae',
    gamma=0.5,
    grow_policy='depthwise',
    learning_rate=0.1,
    max_depth=4,
    min_child_weight=3,
    n_estimators=200,
    num_class=1,
    objective='reg:absoluteerror',
    random_state=42,
    reg_alpha=0.5,
    subsample=0.7,
    tree_method='hist',
)
clf = xgb.XGBRegressor(**xgb_params)
clf

In [None]:
# Early stopping is configured on the estimator: passing early_stopping_rounds
# to fit() is deprecated in XGBoost 2.0 and rejected by later releases.
# The last entry of eval_set (the validation split) drives the stopping decision.
clf.set_params(early_stopping_rounds=100)
clf.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], verbose=0)

In [None]:
# Score the fitted XGBoost model on the held-out validation days
y_pred = clf.predict(X_val)
mae = mean_absolute_error(y_pred, y_val)
print(f"Mean Absolute Error: {mae}")

# **CatBoost**

For results and visualization, please go to https://www.kaggle.com/re6125015ncku/catboost

In [None]:
## Hyperparameter tuning with GridSearchCV and TimeSeriesSplit, because the data
## is a time series
if is_train:
    train_data = df_train_feats
    train_labels = df_train['target']
    cv_split = TimeSeriesSplit(n_splits=5)

    # Values that are actually searched over
    searched_params = {
        'learning_rate': [1.0, 0.1, 0.01],
        'depth': [4, 6, 8, 10],
        'l2_leaf_reg': [10, 30, 50],
    }
    # Values held constant for every candidate
    constant_params = {
        'iterations': [1200],
        'bootstrap_type': ['Bernoulli'],
        'subsample': [0.66],
        'od_type': ['Iter'],
        'od_wait': [30],
        'allow_writing_files': [False],
    }
    param_grid = {**searched_params, **constant_params}

    ctb_model = CatBoostRegressor(loss_function='MAE', eval_metric='MAE', task_type='GPU')
    grid_search = GridSearchCV(
        ctb_model,
        param_grid,
        cv=cv_split,
        scoring='neg_mean_absolute_error',
        verbose=1,
    )

    # verbose=0 is forwarded to CatBoost's fit to keep the per-iteration log quiet
    grid_search.fit(train_data, train_labels, verbose=0)

    # Report the winning candidate (the score is the negative MAE)
    print("Best set of hyperparameters: ", grid_search.best_params_)
    print("Best score: ", grid_search.best_score_)
    best_params = grid_search.best_params_

In [None]:
# Train procedure: feature selection, validation fit, then a final fit on all rows
if is_train:
    # Names of all engineered features; generate_all_features() only keeps its
    # own list locally, so it is rebuilt here from the feature frame.
    feature_name = list(df_train_feats.columns)

    offline_split = df_train['date_id']>(split_day - 45)
    df_offline_train = df_train_feats[~offline_split]
    df_offline_valid = df_train_feats[offline_split]
    df_offline_train_target = df_train['target'][~offline_split]
    df_offline_valid_target = df_train['target'][offline_split]
    df_train_target = df_train["target"]
    # df_train and df_train_feats are kept alive: the LightGBM section below reads both.
    gc.collect()
    
    ## We adjust the parameter based on the value return from the best_params
    ctb_params = dict(iterations=1200,
                      learning_rate=1.0,
                      depth=8,
                      l2_leaf_reg=30,
                      bootstrap_type='Bernoulli',
                      subsample=0.66,
                      loss_function='MAE',
                      eval_metric = 'MAE',
                      metric_period=100,
                      od_type='Iter',
                      od_wait=30,
                      task_type='GPU',
                      allow_writing_files=False,
                      )
    
    print("Feature Elimination Performing.")
    ctb_model = CatBoostRegressor(**ctb_params)
    summary = ctb_model.select_features(
        df_offline_train[feature_name], df_offline_train_target,
        eval_set=[(df_offline_valid[feature_name], df_offline_valid_target)],
        features_for_select=feature_name,
        num_features_to_select=len(feature_name)-24,    # Drop the 24 least useful features
        steps=3,
        algorithm=EFeaturesSelectionAlgorithm.RecursiveByShapValues,
        shap_calc_type=EShapCalcType.Regular,
        train_final_model=False,
        plot=True,
    )
    
    print("Valid Model Training on Selected Features Subset.")
    ctb_model = CatBoostRegressor(**ctb_params)
    ctb_model.fit(
        df_offline_train[summary['selected_features_names']], df_offline_train_target,
        eval_set=[(df_offline_valid[summary['selected_features_names']], df_offline_valid_target)],
        use_best_model=True,
    )
    
    del df_offline_train, df_offline_valid, df_offline_train_target, df_offline_valid_target
    gc.collect()
    
    print("Infer Model Training on Selected Features Subset.")
    infer_params = ctb_params.copy()
    # Refit on all rows using the iteration count found best on the validation split
    infer_params["iterations"] = ctb_model.best_iteration_
    infer_ctb_model = CatBoostRegressor(**infer_params)
    infer_ctb_model.fit(df_train_feats[summary['selected_features_names']], df_train_target)
    print("Infer Model Training on Selected Features Subset Complete.")
    
    if is_offline:   
        # Offline predictions
        df_valid_target = df_valid["target"]
        offline_predictions = infer_ctb_model.predict(df_valid_feats[summary['selected_features_names']])
        offline_score = mean_absolute_error(offline_predictions, df_valid_target)
        print(f"Offline Score {np.round(offline_score, 4)}")
        del df_valid, df_valid_feats
        gc.collect()
    
    gc.collect()

In [None]:
## Bar plot of the CatBoost feature importances, to make it easy to see which
## features influence the model's predictions the most.
feat_importances = infer_ctb_model.get_feature_importance(prettified=True)

fig, ax = plt.subplots(figsize=(12, 20))
sns.barplot(x="Importances", y="Feature Id", data=feat_importances, ax=ax)
ax.set_title('CatBoost features importance:')
fig.tight_layout()

In [None]:
## Check for top interactions in the model.
feat_interactions = infer_ctb_model.get_feature_importance(type=EFstrType.Interaction, prettified=True)
# Take an independent copy of the ten strongest interactions so that later
# edits to this table do not write into a slice of feat_interactions.
top_interactions = feat_interactions.head(10).copy()
top_interactions

In [None]:
## Replace the feature indices by feature names, making it easier to identify
## which feature interactions are the most influential in the model's predictions.
## The table is rebuilt from feat_interactions each time, so re-running this
## cell gives the same result instead of failing on already-renamed columns.
selected_feature_names = summary['selected_features_names']
top_interactions = feat_interactions[:10].copy()
for index_column in ['First Feature Index', 'Second Feature Index']:
    top_interactions[index_column] = top_interactions[index_column].apply(
        lambda idx: selected_feature_names[int(idx)]
    )
top_interactions.columns = ['First Feature', 'Second Feature', 'Interaction']
top_interactions

# **LightGBM**

For results and visualization, please go to https://www.kaggle.com/code/re6125015ncku/lightgbm/notebook

In [None]:
## Hyperparameter tuning with GridSearchCV and TimeSeriesSplit, because the data
## is a time series.
## NOTE: the full grid below is very large (1728 candidates x 5 folds, each
## with 5000 trees); shrink the lists before running it end to end.
train_data = df_train_feats
train_labels = df_train['target']
cv_split = TimeSeriesSplit(n_splits=5)

param_grid = {
    'num_leaves': [128, 256, 512],
    'subsample': [0.4, 0.6, 0.8, 1.0],
    'colsample_bytree': [0.4, 0.6, 0.8, 1.0],
    'learning_rate': [0.001, 0.01, 0.1, 1.0],
    'max_depth': [3, 11, 24],
    'reg_alpha': [0.1, 0.2, 0.3],
    'reg_lambda': [0.2],
    'objective': ['mae'],
    'n_estimators': [5000],
    'device': ['gpu'],
    'n_jobs': [-1],
    'importance_type': ['gain'],
    'verbosity': [-1],    # keep LightGBM quiet during the search
}

# The loss comes from 'objective' in the grid; LightGBM has no
# loss_function / eval_metric / task_type constructor arguments.
lgb_model = lgb.LGBMRegressor()
# Create the GridSearchCV object
grid_search = GridSearchCV(lgb_model, param_grid, cv=cv_split, scoring='neg_mean_absolute_error', verbose=1)

# Fit the GridSearchCV object to the training data
grid_search.fit(train_data, train_labels)

# Print the best set of hyperparameters and the corresponding score
# (the score is the negative MAE, so values closer to 0 are better)
print("Best set of hyperparameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)
best_params = grid_search.best_params_

In [None]:
## LightGBM parameters, adjusted starting from the values returned in best_params.
## The value in each trailing comment is the alternative that was also considered.
lgb_params = dict(
    objective="mae",
    n_estimators=5000,
    num_leaves=256,        # 512
    subsample=0.6,         # 0.4
    colsample_bytree=0.8,  # 0.6
    learning_rate=0.01,    # 0.001
    max_depth=11,          # 24
    n_jobs=-1,
    device="gpu",
    verbosity=-1,
    importance_type="gain",
    reg_alpha=0.2,         # 0.1
    reg_lambda=3.25,
)

feature_columns = list(df_train_feats.columns)
print(f"Features = {len(feature_columns)}")

num_folds = 5
fold_size = 480 // num_folds    # days per fold (date_id is split into blocks of this size)
gap = 5                         # days left unused between training and validation data

models = []
models_cbt = []
scores = []

model_save_path = 'modelitos_para_despues'
os.makedirs(model_save_path, exist_ok=True)

date_ids = df_train['date_id'].values

for i in range(num_folds):
    # Fold i trains on the block [start, end) and validates on the next block.
    start = i * fold_size
    end = start + fold_size
    valid_end = end + fold_size
    if i < num_folds - 1:
        purged_start = end - 2
        # Training rows: the fold's own block (minus the last two days) plus
        # everything that lies at least `gap` days AFTER the validation block.
        # The validation block itself must never be part of the training mask,
        # otherwise early stopping and the fold MAE are measured on seen rows.
        train_indices = ((date_ids >= start) & (date_ids < purged_start)) | (date_ids >= valid_end + gap)
    else:
        train_indices = (date_ids >= start) & (date_ids < end)

    # NOTE(review): for the last fold this window starts at date_id 480, so it
    # only contains whatever days exist from there on - confirm it is non-empty.
    test_indices = (date_ids >= end) & (date_ids < valid_end)

    gc.collect()

    df_fold_train = df_train_feats[train_indices]
    df_fold_train_target = df_train['target'][train_indices]
    df_fold_valid = df_train_feats[test_indices]
    df_fold_valid_target = df_train['target'][test_indices]

    print(f"Fold {i+1} Model Training")

    lgb_model = lgb.LGBMRegressor(**lgb_params)
    lgb_model.fit(
        df_fold_train[feature_columns],
        df_fold_train_target,
        eval_set=[(df_fold_valid[feature_columns], df_fold_valid_target)],
        callbacks=[
            lgb.callback.early_stopping(stopping_rounds=100),
            lgb.callback.log_evaluation(period=100),
        ],
    )

    # Keep the fitted model in memory and also persist its booster to disk
    models.append(lgb_model)
    model_filename = os.path.join(model_save_path, f'doblez_{i+1}.txt')
    lgb_model.booster_.save_model(model_filename)
    print(f"Model for fold {i+1} saved to {model_filename}")

    fold_predictions = lgb_model.predict(df_fold_valid[feature_columns])
    fold_score = mean_absolute_error(fold_predictions, df_fold_valid_target)
    scores.append(fold_score)
    print(f":LGB Fold {i+1} MAE: {fold_score}")

    del df_fold_train, df_fold_train_target, df_fold_valid, df_fold_valid_target
    gc.collect()

# Average number of boosting rounds the fold models needed before early stopping.
# A model without a recorded best iteration contributes its full n_estimators.
fold_best_iterations = [
    model.best_iteration_ if model.best_iteration_ else model.n_estimators
    for model in models
]
average_best_iteration = max(1, int(np.mean(fold_best_iterations)))

# The final model has no validation set (hence no early stopping), so its
# number of trees is fixed to the average found by the folds instead of
# always growing the full lgb_params["n_estimators"].
final_model_params = lgb_params.copy()
final_model_params["n_estimators"] = average_best_iteration

num_model = 1

for i in range(num_model):
    final_model = lgb.LGBMRegressor(**final_model_params)
    final_model.fit(
        df_train_feats[feature_columns],
        df_train['target'],
        callbacks=[
            lgb.callback.log_evaluation(period=100),
        ],
    )
    # Appended last, so weighted_average() gives the full-data model the largest weight
    models.append(final_model)

# **Submission**

In [None]:
def zero_sum(prices, volumes):
    """Shift ``prices`` so they sum to zero, spreading the shift by sqrt(volume).

    Each element is reduced by its share of ``sum(prices)``, where the shares
    are proportional to ``sqrt(volumes)``.
    """
    root_volume = np.sqrt(volumes)
    per_unit_shift = np.sum(prices) / np.sum(root_volume)
    return prices - root_volume * per_unit_shift

# Competition inference loop: for every batch served by the optiver2023 API,
# build features from a rolling cache of recent rows, predict with the
# weighted LightGBM ensemble and submit the predictions.
if is_infer:
    import optiver2023
    env = optiver2023.make_env()
    iter_test = env.iter_test()
    counter = 0
    # Predictions are clipped to this range before submission
    y_min, y_max = -64, 64
    qps, predictions = [], []
    # Rolling history of test rows, needed for the lag/diff/rolling features
    cache = pd.DataFrame()

    # One weight per model in `models`; later models get larger weights
    lgb_model_weights = weighted_average(models)
    
    for (test, revealed_targets, sample_prediction) in iter_test:
        now_time = time.time()
        cache = pd.concat([cache, test], ignore_index=True, axis=0)
        if counter > 0:
            # Keep only the 21 most recent rows per stock, restored to chronological order
            cache = cache.groupby(['stock_id']).tail(21).sort_values(by=['date_id', 'seconds_in_bucket', 'stock_id']).reset_index(drop=True)
        # Features are computed on the whole cache; only the rows of the current batch are kept
        feat = generate_all_features(cache)[-len(test):]
        print(f"Feat Shape is: {feat.shape}")
        
        # Weighted sum of the individual model predictions
        lgb_predictions = np.zeros(len(test))
        for model, weight in zip(models, lgb_model_weights):
            lgb_predictions += weight * model.predict(feat[feature_columns])

        predictions = lgb_predictions
        
        # Centre the batch predictions on zero, then clip to [y_min, y_max]
        final_predictions = predictions - np.mean(predictions)
        clipped_predictions = np.clip(final_predictions, y_min, y_max)
        sample_prediction['target'] = clipped_predictions
        env.predict(sample_prediction)
        counter += 1
        # Wall-clock seconds spent on this batch
        qps.append(time.time() - now_time)
        if counter % 10 == 0:
            print(counter, 'qps:', np.mean(qps))

    # NOTE(review): 1.146 presumably converts mean seconds per batch into total
    # hours for the full test set - confirm where the constant comes from.
    time_cost = 1.146 * np.mean(qps)
    print(f"The code will take approximately {np.round(time_cost, 4)} hours to reason about")

In [None]:
# Histogram of the targets in the last submitted batch, restricted to [-10, 10]
sample_prediction.hist(column='target', bins=100, range=[-10,10])

In [None]:
# Save the last submitted batch to disk (the index is written as the first column)
# and display it as the cell output.
sample_prediction.to_csv('preds.csv')
sample_prediction