# LightGBM Model

Having established the baseline model using logistic regression. Now we try to see how well a gradient-boosted tree model performs, in this case we use Light Gradient-Boosting Machine or LightGBM.
***

### Imports

In [1]:
import sys
import joblib
from pathlib import Path

PROJECT_ROOT = Path.cwd().parent
sys.path.append(str(PROJECT_ROOT))

import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
from sklearn.model_selection import RandomizedSearchCV
from src.time_splits import time_aware_splits

### Load Data

In [2]:
project_root = Path().resolve().parents[0]
sys.path.append(str(project_root))

df_dataset = pd.read_csv(Path(project_root) / "data" / "processed" / "dataset.csv")

### Prepare the Data

In [3]:
X = df_dataset.drop(columns=["date", "winner"])
# One-time hot encode
X_encoded = pd.get_dummies(X, columns=["division", "r_stance", "b_stance"], drop_first=False)
y = df_dataset["winner"]
print(X_encoded.dtypes.unique())

[dtype('float64') dtype('int64') dtype('bool')]


### Split the Data

In [4]:
X_train, X_val, X_test, y_train, y_val, y_test = time_aware_splits(X_encoded, y)

### Find Optimal Hyperparameters

In [8]:
param_dist = {
    "num_leaves": [15, 31, 47, 63],
    "max_depth": [4, 5, 6, 7, 8],
    "min_data_in_leaf": [150, 200, 250, 300],
    "learning_rate": [0.05, 0.07, 0.09, 0.11],
    "feature_fraction": [0.6, 0.8, 1.0],
    "bagging_fraction": [0.6, 0.8, 1.0],
    "boosting_type": ["gbdt", "dart"],
    "lambda_l1": [0, 0.5, 1.0],
    "lambda_l2": [0, 0.5, 1.0],
    "bagging_freq": [1, 5, 10],
    "feature_fraction_bynode": [0.6, 0.8, 1.0]
}

base_model = lgb.LGBMClassifier(
    objective="binary",
    n_estimators=300,
    random_state=42
)

rand_search = RandomizedSearchCV(
    estimator=base_model,
    param_distributions=param_dist,
    n_iter=200,
    scoring="roc_auc",
    verbose=2,
    random_state=42,
    n_jobs=-1
)

rand_search.fit(X_train, y_train)

best_model = rand_search.best_estimator_
y_val_proba = best_model.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, y_val_proba)

print("\nBest Hyperparameters:", rand_search.best_params_)
print("Validation AUC:", val_auc)


Fitting 5 folds for each of 200 candidates, totalling 1000 fits
[LightGBM] [Info] Number of positive: 4765, number of negative: 4766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009396 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41595
[LightGBM] [Info] Number of data points in the train set: 9531, number of used features: 201
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499948 -> initscore=-0.000210
[LightGBM] [Info] Start training from score -0.000210

Best Hyperparameters: {'num_leaves': 15, 'min_data_in_leaf': 200, 'max_depth': 6, 'learning_rate': 0.11, 'lambda_l2': 0.5, 'lambda_l1': 1.0, 'feature_fraction_bynode': 0.6, 'feature_fraction': 0.6, 'boosting_type': 'dart', 'bagging_freq': 5, 'bagging_fraction': 0.8}
Validation AUC: 0.6409663888858066


In [None]:
NUM_LEAVES = [15, 31, 47, 63] 
MAX_DEPTH = [4, 5, 6, 7, 8] 
MIN_DATA_IN_LEAF = [50, 100, 150, 200, 250, 300] 
LEARNING_RATE = [0.05, 0.07, 0.09, 0.11]

best_auc = -10 
best_hyperparam = {} 
best_model = None
for num_leaf in NUM_LEAVES: 
    for depth in MAX_DEPTH: 
        for min_data_leaf in MIN_DATA_IN_LEAF: 
            for rate in LEARNING_RATE: 
                model = lgb.LGBMClassifier( 
                    objective="binary", 
                    learning_rate=rate, 
                    n_estimators=500, 
                    num_leaves=num_leaf, 
                    min_data_in_leaf=min_data_leaf, 
                    max_depth=depth, 
                    feature_fraction=0.8, 
                    bagging_fraction=0.8, 
                    bagging_freq=1, 
                    random_state=42 ) 
                
                model.fit( 
                    X_train, 
                    y_train, 
                    eval_set=[(X_val, y_val)], 
                    eval_metric="auc", 
                    callbacks=[lgb.early_stopping(stopping_rounds=50)] ) 
                
                y_val_pred = model.predict_proba(X_val)[:, 1] 
                auc = roc_auc_score(y_val, y_val_pred) 
                if auc > best_auc:
                     best_auc = auc 
                     best_model = model
                     best_params = { 
                         "num_leaves": num_leaf, 
                         "depth" : depth, 
                         "min_data_in_leaf": min_data_leaf, 
                         "learning_rate" : rate } 

print(best_auc, best_params)

[LightGBM] [Info] Number of positive: 4765, number of negative: 4766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.007553 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41597
[LightGBM] [Info] Number of data points in the train set: 9531, number of used features: 202
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499948 -> initscore=-0.000210
[LightGBM] [Info] Start training from score -0.000210
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[106]	valid_0's auc: 0.643374	valid_0's binary_logloss: 0.661833
[LightGBM] [Info] Number of positive: 4765, number of negative: 4766
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.008427 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 41597
[LightGBM] [Info] Number of data points in the train set: 9531, number of used f

### Model Evaluation

In [13]:
X_train_final = np.vstack([X_train, X_val])
y_train_final = np.hstack([y_train, y_val])

# best_model.set_params(n_estimators=500)
best_model.fit(X_train_final, y_train_final)

y_test_proba = best_model.predict_proba(X_test)[:, 1]
test_auc = roc_auc_score(y_test, y_test_proba)

print("Test AUC:", test_auc)

AttributeError: 'NoneType' object has no attribute 'fit'

### Save the Model

### Final Observations