In [1]:
#!/usr/bin/env python
# coding: utf-8
#
# Naive Bayes Final Exam: Standard vs. GSCV-Tuned
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score

warnings.filterwarnings("ignore", category=UserWarning)

class Naive_Bayes_Final_Exam:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        self.X_all, self.y_all = self._make_windows()
        
        self.X_tune, self.y_tune, \
        self.X_test, self.y_test = self._split_data()

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("─" * 60 + "\n1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        df = df.dropna()
        num_cols = df.select_dtypes(include=[np.number]).columns
        return df[list(req | set(num_cols))]

    def _make_windows(self) -> (np.ndarray, np.ndarray):
        print("2. Preparing sequence data...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            g = g.sort_values(cfg['quarter_col'])
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                X.append(arr[i - cfg['lags']:i].ravel())
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)

    def _split_data(self):
        """Splits data chronologically into Tune and Test sets."""
        print("3. Splitting data into Tune (60%) and Test (40%) sets...")
        n = len(self.y_all)
        tune_end = int(n * 0.6)
        
        X_tune, y_tune = self.X_all[:tune_end], self.y_all[:tune_end]
        X_test, y_test = self.X_all[tune_end:], self.y_all[tune_end:]
        
        print(f"   Tune set size: {len(y_tune)}")
        print(f"   Test set size: {len(y_test)}")
        return X_tune, y_tune, X_test, y_test

    def _evaluate_on_test_set(self, params: Dict[str, Any], model_name: str):
        """
        Evaluates a Naive Bayes model on the final test set 
        using a sliding window (retraining every step) and full metrics.
        """
        print(f"\n--- Evaluating '{model_name}' on the Final Test Set ---")
        
        win_size = self.config['sliding_win_size']
        X_history = self.X_tune
        y_history = self.y_tune
        
        all_probs, all_trues = [], []

        for i in range(len(self.X_test)):
            # Define the current training window from all data seen so far
            X_train_current = np.vstack([X_history, self.X_test[:i]])
            y_train_current = np.concatenate([y_history, self.y_test[:i]])
            X_train_window = X_train_current[-win_size:]
            y_train_window = y_train_current[-win_size:]

            # Skip if the window is invalid
            if len(np.unique(y_train_window)) < 2:
                if i % 500 == 0: print(f"  Skipping at step {i}: window contains only one class.")
                continue
            
            # Since Naive Bayes is very fast, we retrain at every step
            scaler = StandardScaler().fit(X_train_window)
            X_train_std = scaler.transform(X_train_window)
            model = GaussianNB(**params).fit(X_train_std, y_train_window)

            X_test_point = self.X_test[i].reshape(1, -1)
            X_test_point_std = scaler.transform(X_test_point)
            
            y_prob = model.predict_proba(X_test_point_std)[:, 1][0]
            all_probs.append(y_prob)
            all_trues.append(self.y_test[i])
            
        print("\n  Tuning classification threshold and calculating all metrics...")
        
        final_auc = roc_auc_score(all_trues, all_probs)
        best_f1, best_thresh, best_prec, best_rec = 0, 0, 0, 0
        
        for threshold in np.arange(0.1, 0.9, 0.01):
            preds = (np.array(all_probs) > threshold).astype(int)
            current_f1 = f1_score(all_trues, preds, zero_division=0)
            if current_f1 > best_f1:
                best_f1, best_thresh = current_f1, threshold
                best_prec = precision_score(all_trues, preds, zero_division=0)
                best_rec = recall_score(all_trues, preds, zero_division=0)
        
        final_gmean = np.sqrt(best_prec * best_rec) if best_prec > 0 and best_rec > 0 else 0

        print(f"\n[{model_name}] Final Test Set Performance:")
        print(f"  Best Threshold = {best_thresh:.2f}")
        print(f"  F1-Score       = {best_f1:.4f}")
        print(f"  AUC            = {final_auc:.4f}")
        print(f"  G-Mean         = {final_gmean:.4f}")
        print(f"  Precision      = {best_prec:.4f}")
        print(f"  Recall         = {best_rec:.4f}")

    def run(self):
        """Orchestrates the entire Naive Bayes final exam."""
        print("Starting Naive Bayes Final Exam...")
        
        # --- Model 1: Standard Baseline NB ---
        print("\n" + "═" * 60)
        print("Round 1: Evaluating Standard Baseline Naive Bayes")
        print("═" * 60)
        standard_params = {} # GaussianNB has no major params to set for a baseline
        self._evaluate_on_test_set(standard_params, "Standard Naive Bayes")

        # --- Model 2: GSCV-Tuned NB ---
        print("\n" + "═" * 60)
        print("Round 2: Finding and Evaluating GSCV-Tuned Naive Bayes")
        print("═" * 60)
        print("4. Starting GridSearchCV to find best params...")
        
        pipeline = Pipeline([
            ('scaler', StandardScaler()),
            ('clf', GaussianNB())
        ])
        
        # The main parameter to tune for GaussianNB is var_smoothing
        param_grid = {
            'clf__var_smoothing': np.logspace(0, -9, num=10)
        }
        
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=self.config['seed'])
        gs = GridSearchCV(pipeline, param_grid, scoring='f1', cv=cv, n_jobs=-1, verbose=1)
        gs.fit(self.X_tune, self.y_tune)
        
        print("\nGridSearchCV process finished!")
        print(f"🏆 Best F1-score on Tune Set: {gs.best_score_:.4f}")
        print(f"🏆 Best Hyperparameters Found: {gs.best_params_}")
        
        gscv_params = {k.replace('clf__', ''): v for k, v in gs.best_params_.items()}

        self._evaluate_on_test_set(gscv_params, "GSCV-Tuned Naive Bayes")
        print("\n" + "═" * 60)
        print("Naive Bayes Final Exam Complete!")


if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        
        "sliding_win_size": 200,
        # retrain_interval is not needed here as we retrain every step
    }

    exam_runner = Naive_Bayes_Final_Exam(config=CONFIG)
    exam_runner.run()

────────────────────────────────────────────────────────────
1. Loading and cleaning data...
2. Preparing sequence data...
3. Splitting data into Tune (60%) and Test (40%) sets...
   Tune set size: 12256
   Test set size: 8172
Starting Naive Bayes Final Exam...

════════════════════════════════════════════════════════════
Round 1: Evaluating Standard Baseline Naive Bayes
════════════════════════════════════════════════════════════

--- Evaluating 'Standard Naive Bayes' on the Final Test Set ---
  Skipping at step 0: window contains only one class.
  Skipping at step 500: window contains only one class.
  Skipping at step 1000: window contains only one class.
  Skipping at step 3000: window contains only one class.
  Skipping at step 4500: window contains only one class.
  Skipping at step 5500: window contains only one class.
  Skipping at step 6500: window contains only one class.
  Skipping at step 7000: window contains only one class.
  Skipping at step 7500: window contains only on