# LightGBM

In [1]:
import time
import ast
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    log_loss
)

import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme(style="whitegrid")

# Evaluation helper
from evaluation import *
from data_utils import *

In [2]:
# Load and verify data
train_df, test_df, y, class_names = load_and_prepare_data()

# Classical ML path
X_train, X_val, X_test, y_train, y_val, vecs = prepare_tfidf_pipeline(train_df, test_df, y)

print("TF-IDF Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)

TF-IDF Train shape: (45981, 60000)
Validation shape: (11496, 60000)


In [3]:
from lightgbm import LGBMClassifier
import lightgbm as lgb

# LightGBM params & training
lgbm_params = dict(
    objective="multiclass",
    num_class=3,
    n_estimators=1200,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=127,
    min_child_samples=20,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=0.1,
    boosting_type="gbdt",
    random_state=42,
    n_jobs=-1,
    force_col_wise=True
)

lgbm_model = LGBMClassifier(**lgbm_params)

t0 = time.time()

lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    eval_metric="multi_logloss",
    callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
)

t1 = time.time()
print(f"⏱️ LightGBM training time: {(t1 - t0):.2f} seconds ({(t1 - t0)/60:.2f} minutes)")

[LightGBM] [Info] Total Bins 3296908
[LightGBM] [Info] Number of data points in the train set: 45981, number of used features: 56218
[LightGBM] [Info] Start training from score -1.052457
[LightGBM] [Info] Start training from score -1.073231
[LightGBM] [Info] Start training from score -1.174353
⏱️ LightGBM training time: 558.17 seconds (9.30 minutes)


In [4]:
# Predictions (LightGBM)
y_pred_val_lgbm  = lgbm_model.predict(X_val)
y_proba_val_lgbm = lgbm_model.predict_proba(X_val)

# Test predictions + submission
y_pred_test_lgbm  = lgbm_model.predict(X_test)
y_proba_test_lgbm = lgbm_model.predict_proba(X_test)



In [5]:
print("\n================ LIGHTGBM EVALUATION ================\n")
# Metrics
_ = eval_metrics(y_val, y_pred_val_lgbm)
eval_classification_report(y_val, y_pred_val_lgbm, class_names)
# ROC-AUC
_ = eval_roc_auc(y_val, y_proba_val_lgbm)
# Log-loss
_ = eval_log_loss(y_val, y_proba_val_lgbm)
_ = eval_log_loss_per_class(y_val, y_proba_val_lgbm)



*** GLOBAL METRICS ***
Accuracy (Global)      : 0.4616
Precision (Macro Avg)  : 0.4603
Recall (Macro Avg)     : 0.4575
F1-Score (Macro Avg)   : 0.4558

*** PER-CLASS EVALUATION ***
Class                Precision    Recall  F1-Score   Support
------------------------------------------------------------
winner_model_a            0.46      0.52      0.49      4013
winner_model_b            0.47      0.50      0.49      3931
winner_tie                0.45      0.35      0.39      3552
------------------------------------------------------------
Macro Avg                 0.46      0.46      0.46     34488
Weighted Avg              0.46      0.46      0.46     34488

*** ROC-AUC EVALUATION ***
ROC-AUC (OvR) : 0.6366

*** LOG-LOSS EVALUATION ***
Log-loss      : 1.0398

*** LOG-LOSS PER CLASS ***
Class 0: 0.9910  (n=4013)
Class 1: 1.0068  (n=3931)
Class 2: 1.1314  (n=3552)


In [6]:
# Confusion Matrix + Plot
cm_lgbm = eval_confusion_matrix(y_val, y_pred_val_lgbm, n_classes=y_proba_val_lgbm.shape[1])
plot_confusion_matrix(cm_lgbm, class_names, title="Confusion Matrix — LightGBM", save_path="results/confusion_matrix/confusion_matrix_lgbm.png")


Confusion Matrix (rows=true, cols=pred):
 [[2103 1151  759]
 [1205 1976  750]
 [1280 1044 1228]]
Saved plot to: images/confusion_matrix/confusion_matrix_lgbm.png


In [7]:
# ROC Curves
plot_roc_curves(y_val, y_proba_val_lgbm, class_names, title_prefix="LightGBM ROC", save_path="results/roc/roc_lgbm.png")

Saved plot to: images/roc/roc_lgbm.png


In [8]:
save_roc_to_csv(y_val, y_proba_val_lgbm, "LightGBM", fold_idx=1)

Saved ROC data for class 0 (AUC=0.6461) → results/roc/LightGBM_fold1_class0.csv
Saved ROC data for class 1 (AUC=0.6524) → results/roc/LightGBM_fold1_class1.csv
Saved ROC data for class 2 (AUC=0.6114) → results/roc/LightGBM_fold1_class2.csv


In [9]:
submission_lgbm = build_submission(
    test_df=test_df,
    y_pred_test=y_pred_test_lgbm,
    y_proba_test=y_proba_test_lgbm,
    model_name="lgbm"
)


Saved: results/submission/submission_lgbm.csv
