In [2]:
!pip install imbalanced-learn

Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl.metadata (8.8 kB)
Collecting sklearn-compat<1,>=0.1 (from imbalanced-learn)
  Using cached sklearn_compat-0.1.3-py3-none-any.whl.metadata (18 kB)
Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Using cached sklearn_compat-0.1.3-py3-none-any.whl (18 kB)
Installing collected packages: sklearn-compat, imbalanced-learn

   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   -------------------- ------------------- 1/2 [imbalanced-learn]
   ---------------------------------------- 2/2 [imbalanced-learn]

Successfully installed imbalanced-learn-0.13.0 sklearn-compat-0.1.3


In [4]:
#!/usr/bin/env python
# coding: utf-8
#
# LightGBM + SMOTE Champion Model Bake-Off v2 (Robust)
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE

# Raise Optuna's log threshold to WARNING so per-trial INFO messages are hidden.
optuna.logging.set_verbosity(optuna.logging.WARNING)
# Suppress every UserWarning (and subclass) for the whole process.
# NOTE(review): this is a blanket filter; it also hides warnings from pandas,
# scikit-learn and LightGBM that might point at real data problems.
warnings.filterwarnings("ignore", category=UserWarning)

class LGBM_SMOTE_Champion_Finder:
    """Bake-off between an expert-configured and an Optuna-tuned LightGBM model.

    Both candidates are trained on SMOTE-oversampled, standardised sliding
    windows and scored on the last slice of the windowed samples.

    ``config`` keys read by this class: ``csv_path``, ``id_col``,
    ``quarter_col``, ``target_col``, ``meta_cols``, ``lags``, ``seed``,
    ``sliding_win_size``, ``retrain_interval``, ``optuna_trials`` and
    ``lightgbm_expert_params``.

    NOTE(review): samples are generated entity by entity (grouped by
    ``id_col``), so the positional 60/20/20 split and the "sliding window"
    used during evaluation follow entity order, not calendar order. Confirm
    that this is the intended evaluation protocol.
    """

    def __init__(self, config: Dict[str, Any]):
        """Load the CSV, build lagged samples and split them into three sets.

        Args:
            config: Settings dictionary (see the class docstring for the keys).
        """
        self.config = config
        self.df = self._load_data(config['csv_path'])
        # Every column that is not a meta column is treated as a feature.
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        self.X_all, self.y_all = self._make_windows()

        (self.X_tune, self.y_tune,
         self.X_val, self.y_val,
         self.X_test, self.y_test) = self._split_data()

    def _load_data(self, path: "str | Path") -> pd.DataFrame:
        """Read the CSV once and return the cleaned frame.

        Cleaning steps: drop duplicated column names, parse the quarter column
        as datetime, sort by (id, quarter), drop rows containing any NaN and
        keep only the meta columns plus numeric columns.

        Args:
            path: Anything ``pandas.read_csv`` accepts (path or file-like).

        Returns:
            The cleaned DataFrame; kept columns stay in their original file
            order so the feature layout is identical on every run.

        Raises:
            KeyError: If any column listed in ``config['meta_cols']`` is absent.
        """
        print("─" * 60 + "\n1. Loading and cleaning data...")
        cfg = self.config

        # Parse the file a single time (the input may be a non-rewindable stream).
        df = pd.read_csv(path)
        df = df.loc[:, ~df.columns.duplicated()].copy()

        req = set(cfg['meta_cols'])
        if missing := req - set(df.columns):
            raise KeyError(f"Missing cols: {missing}")

        df[cfg['quarter_col']] = pd.to_datetime(df[cfg['quarter_col']])
        df = df.sort_values([cfg['id_col'], cfg['quarter_col']]).dropna()

        num_cols = set(df.select_dtypes(include=[np.number]).columns)
        # Filter in column order instead of building the list from a set:
        # set iteration order changes between interpreter sessions, which
        # would silently reshuffle the feature columns.
        keep = [c for c in df.columns if c in req or c in num_cols]
        return df[keep]

    def _make_windows(self) -> "tuple[np.ndarray, np.ndarray]":
        """Turn each entity's rows into flattened lag windows.

        For every entity and every row index ``i >= lags`` one sample is
        produced: the features of the ``lags`` rows before ``i`` (flattened)
        paired with the target of row ``i``.

        Rows with missing values were removed during loading, so the rows of a
        window are consecutive *remaining* rows of the entity.

        Returns:
            ``(X, y)`` as numpy arrays, ordered entity by entity.
        """
        print("2. Preparing sequence data...")
        X, y = [], []
        cfg = self.config
        lags = cfg['lags']
        for _, g in self.df.groupby(cfg['id_col']):
            g = g.sort_values(cfg['quarter_col'])
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(lags, len(g)):
                X.append(arr[i - lags:i].ravel())
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)

    def _split_data(self):
        """Split the samples positionally into tune / validation / test parts.

        Returns:
            ``(X_tune, y_tune, X_val, y_val, X_test, y_test)`` covering the
            first 60%, the next 20% and the final 20% of the samples.
        """
        print("3. Splitting data into Tune (60%), Validation (20%), and Test (20%) sets...")
        n = len(self.y_all)
        tune_end = int(n * 0.6)
        val_end = int(n * 0.8)

        X_tune, y_tune = self.X_all[:tune_end], self.y_all[:tune_end]
        X_val, y_val = self.X_all[tune_end:val_end], self.y_all[tune_end:val_end]
        X_test, y_test = self.X_all[val_end:], self.y_all[val_end:]

        print(f"   Tune set size: {len(y_tune)}")
        print(f"   Validation set size: {len(y_val)}")
        print(f"   Test set size: {len(y_test)}")
        return X_tune, y_tune, X_val, y_val, X_test, y_test

    @staticmethod
    def _smote_k_neighbors(y, max_k: int = 5) -> "int | None":
        """Pick a ``k_neighbors`` value SMOTE can use on the labels ``y``.

        The value is derived from the size of the smallest class rather than
        from the number of positives, so it stays valid whichever class is the
        minority and whatever the label dtype is.

        Args:
            y: Label array.
            max_k: Upper bound for the returned value.

        Returns:
            ``min(max_k, smallest_class_size - 1)`` as a plain ``int``, or
            ``None`` when fewer than two classes are present or the smallest
            class has fewer than two samples.
        """
        _, counts = np.unique(np.asarray(y), return_counts=True)
        if counts.size < 2:
            return None
        n_minority = int(counts.min())
        if n_minority < 2:
            return None
        return min(max_k, n_minority - 1)

    def _prepare_tuning_data(self):
        """Build (once) the resampled, standardised data shared by all trials.

        The tune/validation arrays and the seed do not change between Optuna
        trials, so the SMOTE and scaling results are cached on the instance.
        ``run`` clears the cache before starting a new study.

        Returns:
            ``(X_tune_std, y_tune_resampled, X_val_std)`` or ``None`` when
            SMOTE cannot be applied to the tune set.
        """
        if not hasattr(self, '_tuning_data'):
            k_neighbors = self._smote_k_neighbors(self.y_tune)
            if k_neighbors is None:
                self._tuning_data = None
            else:
                smote = SMOTE(random_state=self.config['seed'], k_neighbors=k_neighbors)
                X_resampled, y_resampled = smote.fit_resample(self.X_tune, self.y_tune)
                # The scaler is fitted on the original tune samples only.
                scaler = StandardScaler().fit(self.X_tune)
                self._tuning_data = (
                    scaler.transform(X_resampled),
                    y_resampled,
                    scaler.transform(self.X_val),
                )
        return self._tuning_data

    def _objective(self, trial: "optuna.Trial") -> float:
        """Optuna objective: best validation F1 over a grid of thresholds.

        Args:
            trial: The Optuna trial used to sample hyperparameters.

        Returns:
            The highest F1 on the validation set over thresholds
            0.10, 0.15, ..., 0.85, or ``0.0`` when SMOTE cannot run.
        """
        params = {
            'objective': 'binary', 'metric': 'binary_logloss', 'verbosity': -1,
            'random_state': self.config['seed'],
            'n_estimators': trial.suggest_int('n_estimators', 200, 800, step=100),
            'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.1, log=True),
            'num_leaves': trial.suggest_int('num_leaves', 20, 150),
            'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
        }

        data = self._prepare_tuning_data()
        if data is None:
            # SMOTE is impossible on the tune set: report the worst score.
            return 0.0
        X_tune_std, y_tune_resampled, X_val_std = data

        model = lgb.LGBMClassifier(**params).fit(X_tune_std, y_tune_resampled)
        y_probs = model.predict_proba(X_val_std)[:, 1]

        best_f1 = 0
        for threshold in np.arange(0.1, 0.9, 0.05):
            preds = (y_probs > threshold).astype(int)
            # zero_division=0 matches the test-set evaluation and keeps the
            # score at 0 (without a warning) when nothing is predicted positive.
            best_f1 = max(best_f1, f1_score(self.y_val, preds, zero_division=0))

        return best_f1

    def _evaluate_on_test_set(self, params: Dict[str, Any], model_name: str) -> "Dict[str, float] | None":
        """Walk through the test set, retraining periodically, and report metrics.

        Every ``retrain_interval`` steps (and at every step while no model
        exists yet) a model is refitted on the most recent
        ``sliding_win_size`` samples preceding the current test point; test
        points already passed are part of that history. When a scheduled
        retrain has to be skipped, the previously fitted model keeps scoring
        instead of the test point being dropped.

        NOTE(review): the classification threshold is selected with the test
        labels themselves, so the reported F1 / precision / recall are
        optimistic estimates.

        Args:
            params: Keyword arguments for ``lgb.LGBMClassifier``.
            model_name: Label used in the printed report.

        Returns:
            A dict with ``threshold``, ``f1``, ``auc``, ``gmean``,
            ``precision``, ``recall`` and ``n_scored``; ``None`` when no test
            point could be scored.
        """
        print(f"\n--- Evaluating '{model_name}' on the Final Test Set ---")

        cfg = self.config
        win_size, retrain_interval = cfg['sliding_win_size'], cfg['retrain_interval']

        # Stack everything once; training windows are then plain slices
        # instead of a fresh copy of the whole history at every retrain.
        X_stream = np.vstack([self.X_tune, self.X_val, self.X_test])
        y_stream = np.concatenate([self.y_tune, self.y_val, self.y_test])
        n_history = len(self.y_tune) + len(self.y_val)

        all_probs, all_trues = [], []
        model, scaler = None, None

        for i in range(len(self.X_test)):
            if model is None or i % retrain_interval == 0:
                print(f"  Retraining at test step {i}...")

                seen = n_history + i  # samples strictly before test point i
                X_train_window = X_stream[:seen][-win_size:]
                y_train_window = y_stream[:seen][-win_size:]

                k_neighbors = self._smote_k_neighbors(y_train_window)
                if k_neighbors is None:
                    # Fall through: an earlier model (if any) still scores
                    # this point rather than the point being discarded.
                    print("    Skipping training: not enough positive samples for SMOTE.")
                else:
                    smote = SMOTE(random_state=cfg['seed'], k_neighbors=k_neighbors)
                    X_resampled, y_resampled = smote.fit_resample(X_train_window, y_train_window)

                    # The scaler is fitted on the original window only.
                    scaler = StandardScaler().fit(X_train_window)
                    X_train_std = scaler.transform(X_resampled)
                    model = lgb.LGBMClassifier(**params).fit(X_train_std, y_resampled)

            if model is None:
                # No model has been fitted so far: this point cannot be scored.
                continue

            X_test_point_std = scaler.transform(self.X_test[i].reshape(1, -1))
            all_probs.append(model.predict_proba(X_test_point_std)[:, 1][0])
            all_trues.append(self.y_test[i])

        if not all_probs:
            print("\n  No test point could be scored (no model was ever trained); skipping metrics.")
            return None

        print("\n  Tuning classification threshold and calculating all metrics...")

        y_true = np.asarray(all_trues)
        y_score = np.asarray(all_probs)

        # ROC AUC is undefined when only one class was observed.
        if np.unique(y_true).size < 2:
            final_auc = float('nan')
        else:
            final_auc = roc_auc_score(y_true, y_score)

        best_f1, best_thresh, best_prec, best_rec = 0, 0, 0, 0
        for threshold in np.arange(0.1, 0.9, 0.01):
            preds = (y_score > threshold).astype(int)
            current_f1 = f1_score(y_true, preds, zero_division=0)
            if current_f1 > best_f1:
                best_f1, best_thresh = current_f1, threshold
                best_prec = precision_score(y_true, preds, zero_division=0)
                best_rec = recall_score(y_true, preds, zero_division=0)

        # NOTE(review): this is sqrt(precision * recall); "G-Mean" is often
        # defined as sqrt(sensitivity * specificity). Confirm which is meant.
        final_gmean = np.sqrt(best_prec * best_rec) if best_prec > 0 and best_rec > 0 else 0

        print(f"\n[{model_name}] Final Test Set Performance:")
        print(f"  Best Threshold = {best_thresh:.2f}")
        print(f"  F1-Score       = {best_f1:.4f}")
        print(f"  AUC            = {final_auc:.4f}")
        print(f"  G-Mean         = {final_gmean:.4f}")
        print(f"  Precision      = {best_prec:.4f}")
        print(f"  Recall         = {best_rec:.4f}")

        return {
            'threshold': float(best_thresh),
            'f1': float(best_f1),
            'auc': float(final_auc),
            'gmean': float(final_gmean),
            'precision': float(best_prec),
            'recall': float(best_rec),
            'n_scored': len(all_probs),
        }

    def run(self):
        """Orchestrates the entire bake-off process.

        Round 1 evaluates the expert parameters; round 2 tunes with Optuna on
        the tune/validation sets and evaluates the winner.

        Returns:
            ``{'expert': ..., 'optuna': ...}`` holding the metric dicts (or
            ``None``) produced by the two test-set evaluations.
        """
        print("\n" + "═" * 60)
        print("Bake-Off Round 1: Evaluating Expert-Tuned Baseline Model with SMOTE")
        print("═" * 60)
        expert_params = self.config['lightgbm_expert_params']
        expert_metrics = self._evaluate_on_test_set(expert_params, "Expert-Tuned LGBM + SMOTE")

        print("\n" + "═" * 60)
        print("Bake-Off Round 2: Finding and Evaluating Optuna-Tuned Model with SMOTE")
        print("═" * 60)
        print("4. Starting Optuna optimization process...")
        # Recompute the shared tuning data from the current arrays.
        self.__dict__.pop('_tuning_data', None)
        # Seed the sampler so the search is reproducible with the same config.
        sampler = optuna.samplers.TPESampler(seed=self.config['seed'])
        study = optuna.create_study(direction='maximize', sampler=sampler)
        study.optimize(self._objective, n_trials=self.config['optuna_trials'], show_progress_bar=True)

        print("\nOptuna process finished!")
        print(f"🏆 Best F1-score on Validation Set: {study.best_value:.4f}")
        print(f"🏆 Best Hyperparameters Found: {study.best_params}")

        # Tuned values override the expert defaults; untouched keys are kept.
        optuna_params = {**self.config['lightgbm_expert_params'], **study.best_params}
        optuna_metrics = self._evaluate_on_test_set(optuna_params, "Optuna-Tuned LGBM + SMOTE")
        print("\n" + "═" * 60)
        print("Bake-Off Complete!")

        return {'expert': expert_metrics, 'optuna': optuna_metrics}


if __name__ == "__main__":
    # Settings for one bake-off run; every key is read by the finder class.
    CONFIG = dict(
        # Input file and the roles of its non-feature columns.
        csv_path=r'cvm_indicators_dataset_2011-2021.csv',
        id_col="ID",
        quarter_col="QUARTER",
        target_col="LABEL",
        meta_cols=["ID", "QUARTER", "LABEL"],
        # Lag-window length and the shared random seed.
        lags=4,
        seed=42,
        # Test-time retraining schedule and Optuna search budget.
        sliding_win_size=200,
        retrain_interval=500,
        optuna_trials=50,
        # Hand-picked LightGBM baseline; also the base for the tuned model.
        lightgbm_expert_params=dict(
            objective="binary",
            metric="auc",
            random_state=42,
            n_estimators=500,
            learning_rate=0.05,
            verbose=-1,
        ),
    )

    # Building the finder loads, windows and splits the data; run() does the rest.
    champion_finder = LGBM_SMOTE_Champion_Finder(config=CONFIG)
    champion_finder.run()

────────────────────────────────────────────────────────────
1. Loading and cleaning data...
2. Preparing sequence data...
3. Splitting data into Tune (60%), Validation (20%), and Test (20%) sets...
   Tune set size: 12256
   Validation set size: 4086
   Test set size: 4086

════════════════════════════════════════════════════════════
Bake-Off Round 1: Evaluating Expert-Tuned Baseline Model with SMOTE
════════════════════════════════════════════════════════════

--- Evaluating 'Expert-Tuned LGBM + SMOTE' on the Final Test Set ---
  Retraining at test step 0...
  Retraining at test step 500...
  Retraining at test step 1000...
    Skipping training: not enough positive samples for SMOTE.
  Retraining at test step 1500...
    Skipping training: not enough positive samples for SMOTE.
  Retraining at test step 2000...
  Retraining at test step 2500...
  Retraining at test step 3000...
  Retraining at test step 3500...
    Skipping training: not enough positive samples for SMOTE.
  Retraini

  0%|          | 0/50 [00:00<?, ?it/s]


Optuna process finished!
🏆 Best F1-score on Validation Set: 0.5351
🏆 Best Hyperparameters Found: {'n_estimators': 400, 'learning_rate': 0.019551686884351806, 'num_leaves': 150, 'reg_alpha': 8.032889994875912, 'reg_lambda': 8.219147923973554e-05}

--- Evaluating 'Optuna-Tuned LGBM + SMOTE' on the Final Test Set ---
  Retraining at test step 0...
  Retraining at test step 500...
  Retraining at test step 1000...
    Skipping training: not enough positive samples for SMOTE.
  Retraining at test step 1500...
    Skipping training: not enough positive samples for SMOTE.
  Retraining at test step 2000...
  Retraining at test step 2500...
  Retraining at test step 3000...
  Retraining at test step 3500...
    Skipping training: not enough positive samples for SMOTE.
  Retraining at test step 4000...
    Skipping training: not enough positive samples for SMOTE.

  Tuning classification threshold and calculating all metrics...

[Optuna-Tuned LGBM + SMOTE] Final Test Set Performance:
  Best Thr