# Logistic Regression

In [1]:
import time
import ast
import numpy as np
import pandas as pd
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    classification_report,
    confusion_matrix,
    roc_auc_score,
    log_loss
)

import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's "whitegrid" theme once, up front, as the notebook-wide plot style.
sns.set_theme(style="whitegrid")

# Evaluation helper
from evaluation import *
from data_utils import *

In [2]:
# Load the data through the project helper; it returns the train/test frames,
# the label vector `y`, and the list of class names used in later reports.
train_df, test_df, y, class_names = load_and_prepare_data()

# Classical ML path: TF-IDF feature matrices plus a train/validation split.
# `vecs` is not referenced again in the cells shown here
# (presumably the fitted vectorizers — TODO confirm against data_utils).
X_train, X_val, X_test, y_train, y_val, vecs = prepare_tfidf_pipeline(train_df, test_df, y)

# Actually verify the split instead of only printing it: fail fast with a clear
# message rather than a less obvious error deep inside model fitting/scoring.
assert X_train.shape[1] == X_val.shape[1], "train/validation feature dimensions differ"
assert X_train.shape[0] == len(y_train), "X_train and y_train have different row counts"
assert X_val.shape[0] == len(y_val), "X_val and y_val have different row counts"

print("TF-IDF Train shape:", X_train.shape)
print("Validation shape:", X_val.shape)

TF-IDF Train shape: (45981, 60000)
Validation shape: (11496, 60000)


In [3]:
from sklearn.linear_model import LogisticRegression

# Logistic Regression params & training
#
# - solver="saga": stochastic solver, so `random_state` is pinned below;
#   without a seed the fitted coefficients (and every metric reported further
#   down) can change from one Restart & Run All to the next.
# - `multi_class` is intentionally not passed: the parameter is deprecated in
#   recent scikit-learn releases, and for a problem with more than two classes
#   solved with saga the default behaviour is already the multinomial loss.
lr_model = LogisticRegression(
    solver="saga",
    max_iter=3000,
    C=1.0,
    n_jobs=-1,
    random_state=42,
    verbose=0,
)

# perf_counter() is monotonic and high-resolution, which makes it the right
# clock for measuring an elapsed interval (time.time() can jump if the system
# clock is adjusted).
t0 = time.perf_counter()

lr_model.fit(X_train, y_train)

t1 = time.perf_counter()
print(f"⏱️ Logistic Regression training time: {(t1 - t0):.2f} seconds ({(t1 - t0)/60:.2f} minutes)")



⏱️ Logistic Regression training time: 9.95 seconds (0.17 minutes)


In [4]:
# --- Validation split: class-probability matrix and hard label predictions ---
y_proba_val_lr = lr_model.predict_proba(X_val)
y_pred_val_lr = lr_model.predict(X_val)

# --- Test split: the same two outputs, consumed later by the submission cell ---
y_proba_test_lr = lr_model.predict_proba(X_test)
y_pred_test_lr = lr_model.predict(X_test)

In [5]:
print("\n================ LOGISTIC REGRESSION EVAL ================\n")
# Everything below scores the validation split. The eval_* helpers are not
# defined in this notebook (presumably they come from the `evaluation` star
# import) and print their own report sections; their return values are
# discarded into `_`.

# Global metrics computed from the hard label predictions
_ = eval_metrics(y_val, y_pred_val_lr)
# Per-class precision / recall / F1 table.
# NOTE(review): in the recorded output the "Macro Avg" / "Weighted Avg" rows
# show Support = 34488, which is 3x the validation size (11496 rows) — looks
# like the helper sums the support once per class; verify in the helper module.
eval_classification_report(y_val, y_pred_val_lr, class_names)
# ROC-AUC from the predicted class probabilities
_ = eval_roc_auc(y_val, y_proba_val_lr)
# Log-loss: overall, then broken down per class
_ = eval_log_loss(y_val, y_proba_val_lr)
_ = eval_log_loss_per_class(y_val, y_proba_val_lr)



*** GLOBAL METRICS ***
Accuracy (Global)      : 0.4214
Precision (Macro Avg)  : 0.4195
Recall (Macro Avg)     : 0.4194
F1-Score (Macro Avg)   : 0.4192

*** PER-CLASS EVALUATION ***
Class                Precision    Recall  F1-Score   Support
------------------------------------------------------------
winner_model_a            0.44      0.46      0.45      4013
winner_model_b            0.43      0.43      0.43      3931
winner_tie                0.39      0.37      0.38      3552
------------------------------------------------------------
Macro Avg                 0.42      0.42      0.42     34488
Weighted Avg              0.42      0.42      0.42     34488

*** ROC-AUC EVALUATION ***
ROC-AUC (OvR) : 0.6015

*** LOG-LOSS EVALUATION ***
Log-loss      : 1.1287

*** LOG-LOSS PER CLASS ***
Class 0: 1.0743  (n=4013)
Class 1: 1.1050  (n=3931)
Class 2: 1.2162  (n=3552)


In [6]:
# Build the validation confusion matrix, then render and save it as a figure.
# The number of classes is read off the width of the probability matrix.
# NOTE(review): the recorded output reports the plot under "images/..." while
# save_path below points at "results/..." — the output may be stale or the
# helper may remap the directory; confirm which location is intended.
cm_lr = eval_confusion_matrix(
    y_val,
    y_pred_val_lr,
    n_classes=y_proba_val_lr.shape[1],
)
plot_confusion_matrix(
    cm_lr,
    class_names,
    title="Confusion Matrix — Logistic Regression",
    save_path="results/confusion_matrix/confusion_matrix_logreg.png",
)


Confusion Matrix (rows=true, cols=pred):
 [[1837 1181  995]
 [1211 1704 1016]
 [1136 1113 1303]]
Saved plot to: images/confusion_matrix/confusion_matrix_logreg.png


In [7]:
# ROC curves on the validation split, drawn from the predicted probabilities.
# NOTE(review): the recorded output says the figure went to "images/roc/..."
# although save_path is "results/roc/..." — confirm the intended directory.
plot_roc_curves(
    y_val,
    y_proba_val_lr,
    class_names,
    title_prefix="LogReg ROC",
    save_path="results/roc/roc_logreg.png",
)

Saved plot to: images/roc/roc_logreg.png


In [8]:
# Persist the validation ROC data to CSV; the recorded output shows one file
# per class, named with the "LogReg" tag and the fold index.
save_roc_to_csv(
    y_val,
    y_proba_val_lr,
    "LogReg",
    fold_idx=1,
)

Saved ROC data for class 0 (AUC=0.6131) → results/roc/LogReg_fold1_class0.csv
Saved ROC data for class 1 (AUC=0.6055) → results/roc/LogReg_fold1_class1.csv
Saved ROC data for class 2 (AUC=0.5858) → results/roc/LogReg_fold1_class2.csv


In [9]:
# Build the submission from the test-set predictions made earlier
# (hard labels and class probabilities). The recorded output shows the helper
# writing results/submission/submission_logreg.csv; its return value is kept
# in `submission_lr` but is not displayed or used again in the cells shown here.
submission_lr = build_submission(
    test_df=test_df,
    y_pred_test=y_pred_test_lr,
    y_proba_test=y_proba_test_lr,
    model_name="logreg"
)


Saved: results/submission/submission_logreg.csv
