In [3]:
#!/usr/bin/env python
# coding: utf-8
#
# Statistical Test Data Generation (Part 1): Naive Bayes
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

warnings.filterwarnings("ignore", category=UserWarning)

class Naive_Bayes_Score_Generator:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        self.X_all, self.y_all = self._make_windows()
        print(f"✅ Data prepared. Total sequences: {len(self.y_all)}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        return df.dropna()

    def _make_windows(self) -> Tuple[np.ndarray, np.ndarray]:
        print(f"2. Preparing flattened data windows...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                X.append(arr[i - cfg['lags']:i].ravel())
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)

    def run_and_save_scores(self):
        """Runs the Naive Bayes model and saves its cumulative AUC scores."""
        print("\n3. Setting up the official 'Test Stream'...")
        cfg = self.config
        # Use first 20% of data as a 'warm-up' or history buffer
        history_end = int(len(self.y_all) * 0.2)
        X_history, y_history = self.X_all[:history_end], self.y_all[:history_end]
        X_test, y_test = self.X_all[history_end:], self.y_all[history_end:]
        print(f"   History buffer size: {len(y_history)}")
        print(f"   Official Test Stream size: {len(y_test)}")

        print("\n4. Running Naive Bayes on the test stream...")
        win_size = cfg['nb_win_size']
        all_probs, all_trues = [], []
        
        for i in range(len(X_test)):
            current_train_X = np.vstack([X_history, X_test[:i]])[-win_size:]
            current_train_y = np.concatenate([y_history, y_test[:i]])[-win_size:]
            if len(np.unique(current_train_y)) < 2:
                all_probs.append(0.5)  # Neutral probability if model can't be trained
                all_trues.append(y_test[i])
                continue
            
            scaler = StandardScaler().fit(current_train_X)
            model = GaussianNB().fit(scaler.transform(current_train_X), current_train_y)
            y_prob = model.predict_proba(scaler.transform(X_test[i].reshape(1, -1)))[:, 1][0]
            all_probs.append(y_prob)
            all_trues.append(y_test[i])

        print("   Evaluation stream complete.")
        
        print("\n5. Calculating and saving cumulative AUC scores...")
        eval_step = cfg['eval_step']
        auc_scores = []
        for i in range(eval_step, len(all_trues), eval_step):
            if len(np.unique(all_trues[:i])) < 2:
                auc_scores.append(0.5)
                continue
            auc = roc_auc_score(all_trues[:i], all_probs[:i])
            auc_scores.append(auc)
            
        # Save the scores to a file
        results_df = pd.DataFrame({'nb_auc_scores': auc_scores})
        results_df.to_csv("nb_auc_scores.csv", index=False)
        print(f"   ✅ Successfully saved {len(auc_scores)} AUC scores to 'nb_auc_scores.csv'")


if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        
        # Config for Naive Bayes
        "nb_win_size": 200,
        
        # Config for evaluation
        "eval_step": 500, # Get a score every 500 samples
    }
    
    score_generator = Naive_Bayes_Score_Generator(config=CONFIG)
    score_generator.run_and_save_scores()

1. Loading and cleaning data...
2. Preparing flattened data windows...
✅ Data prepared. Total sequences: 20428

3. Setting up the official 'Test Stream'...
   History buffer size: 4085
   Official Test Stream size: 16343

4. Running Naive Bayes on the test stream...
   Evaluation stream complete.

5. Calculating and saving cumulative AUC scores...
   ✅ Successfully saved 32 AUC scores to 'nb_auc_scores.csv'


In [4]:
#!/usr/bin/env python
# coding: utf-8
#
# Statistical Test Data Generation (Part 2): ARF+ADWIN
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score
from river import ensemble, tree, drift

warnings.filterwarnings("ignore", category=UserWarning)

class ARF_Score_Generator:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        print(f"✅ Data prepared. Total samples to stream: {len(self.df)}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        return df.dropna()

    def _get_chunk_auc_scores(self, y_trues: List[int], y_probs: List[float], step: int) -> List[float]:
        """Calculates cumulative AUC scores at each interval."""
        auc_scores = []
        for i in range(step, len(y_trues), step):
            if len(np.unique(y_trues[:i])) < 2:
                auc_scores.append(0.5)
                continue
            auc = roc_auc_score(y_trues[:i], y_probs[:i])
            auc_scores.append(auc)
        return auc_scores

    def run_and_save_scores(self):
        """Runs the ARF+ADWIN model and saves its cumulative AUC scores."""
        print("\n3. Setting up the official 'Test Stream'...")
        cfg = self.config
        
        # Use first 20% of data as a 'warm-up' or history buffer
        history_end = int(len(self.df) * 0.2)
        df_history = self.df.iloc[:history_end]
        df_test = self.df.iloc[history_end:]
        print(f"   History buffer size: {len(df_history)}")
        print(f"   Official Test Stream size: {len(df_test)}")

        print("\n4. Running ARF+ADWIN on the test stream...")
        
        params = cfg['arf_champion_params']
        base_model = tree.HoeffdingTreeClassifier(grace_period=params['grace_period'], delta=params['delta'])
        forest = ensemble.BaggingClassifier(model=base_model, n_models=params['n_models'], seed=cfg['seed'])
        
        # Pre-train the model on the history buffer for a fair start
        print("   Warming up model on history buffer...")
        for _, row in df_history.iterrows():
            forest.learn_one(row[self.feat_cols].to_dict(), int(row[cfg['target_col']]))
            
        all_probs, all_trues = [], []
        # Evaluate on the official test stream
        for _, row in df_test.iterrows():
            x_dict = row[self.feat_cols].to_dict()
            y_true = int(row[cfg['target_col']])
            
            # Prequential: Test then Train
            y_prob = forest.predict_proba_one(x_dict).get(1, 0.5)
            all_probs.append(y_prob)
            all_trues.append(y_true)
            
            forest.learn_one(x_dict, y_true)

        print("   Evaluation stream complete.")
        
        print("\n5. Calculating and saving cumulative AUC scores...")
        eval_step = cfg['eval_step']
        auc_scores = self._get_chunk_auc_scores(all_trues, all_probs, eval_step)
            
        # Save the scores to a file
        results_df = pd.DataFrame({'arf_auc_scores': auc_scores})
        results_df.to_csv("arf_auc_scores.csv", index=False)
        print(f"   ✅ Successfully saved {len(auc_scores)} AUC scores to 'arf_auc_scores.csv'")

if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "seed": 42,
        
        # Using the champion hyperparameters we found previously
        "arf_champion_params": {'n_models': 5, 'grace_period': 281, 'delta': 2.25e-06},
        
        # Config for evaluation (MUST be the same as the Naive Bayes script)
        "eval_step": 500, 
    }
    
    score_generator = ARF_Score_Generator(config=CONFIG)
    score_generator.run_and_save_scores()

1. Loading and cleaning data...
✅ Data prepared. Total samples to stream: 23834

3. Setting up the official 'Test Stream'...
   History buffer size: 4766
   Official Test Stream size: 19068

4. Running ARF+ADWIN on the test stream...
   Warming up model on history buffer...
   Evaluation stream complete.

5. Calculating and saving cumulative AUC scores...
   ✅ Successfully saved 38 AUC scores to 'arf_auc_scores.csv'


In [5]:
#!/usr/bin/env python
# coding: utf-8
#
# Final Statistical Test v2: Wilcoxon Test with Perfectly Aligned Data Streams
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from river import ensemble, tree
from scipy.stats import wilcoxon

warnings.filterwarnings("ignore", category=UserWarning)

class Statistical_Test_Runner:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        # We only need to load the dataframe once to prepare the windows
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        self.X_flat, self.y_flat = self._make_windows()
        print(f"✅ Unified data stream prepared. Total sequences: {len(self.y_flat)}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        return df.dropna()

    def _make_windows(self) -> Tuple[np.ndarray, np.ndarray]:
        print(f"2. Preparing unified flattened data windows...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                X.append(arr[i - cfg['lags']:i].ravel())
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)

    def _get_chunk_auc_scores(self, y_trues: List[int], y_probs: List[float], step: int) -> List[float]:
        """Calculates cumulative AUC scores at each interval."""
        auc_scores = []
        for i in range(step, len(y_trues) + 1, step):
            if len(np.unique(y_trues[:i])) < 2:
                auc_scores.append(0.5)
                continue
            auc = roc_auc_score(y_trues[:i], y_probs[:i])
            auc_scores.append(auc)
        return auc_scores

    def run_naive_bayes_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running Naive Bayes on the unified test stream ---")
        cfg, win_size = self.config, self.config['nb_win_size']
        all_probs, all_trues = [], []
        
        for i in range(len(X_test)):
            current_train_X = np.vstack([X_history, X_test[:i]])[-win_size:]
            current_train_y = np.concatenate([y_history, y_test[:i]])[-win_size:]
            if len(np.unique(current_train_y)) < 2:
                all_probs.append(0.5); all_trues.append(y_test[i]); continue
            
            scaler = StandardScaler().fit(current_train_X)
            model = GaussianNB().fit(scaler.transform(current_train_X), current_train_y)
            y_prob = model.predict_proba(scaler.transform(X_test[i].reshape(1, -1)))[:, 1][0]
            all_probs.append(y_prob)
            all_trues.append(y_test[i])
            
        return self._get_chunk_auc_scores(all_trues, all_probs, cfg['eval_step'])

    def run_arf_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running ARF+ADWIN on the unified test stream ---")
        cfg, params = self.config, self.config['arf_champion_params']
        base_model = tree.HoeffdingTreeClassifier(grace_period=params['grace_period'], delta=params['delta'])
        forest = ensemble.BaggingClassifier(model=base_model, n_models=params['n_models'], seed=cfg['seed'])
        
        # 🔥 FIX: Pre-train the ARF model on the same history buffer as Naive Bayes
        print("   Warming up ARF model on history buffer...")
        for i in range(len(X_history)):
            forest.learn_one(dict(enumerate(X_history[i])), y_history[i])
            
        all_probs, all_trues = [], []
        # 🔥 FIX: Evaluate on the exact same X_test stream as Naive Bayes
        for i in range(len(X_test)):
            x_dict = dict(enumerate(X_test[i]))
            y_true = y_test[i]
            y_prob = forest.predict_proba_one(x_dict).get(1, 0.5)
            all_probs.append(y_prob)
            all_trues.append(y_true)
            forest.learn_one(x_dict, y_true)
        
        return self._get_chunk_auc_scores(all_trues, all_probs, cfg['eval_step'])

    def run_test(self):
        """Orchestrates the comparison and runs the Wilcoxon test."""
        print("\n3. Setting up the unified 'Test Stream' from sequence data...")
        history_end = int(len(self.y_flat) * 0.2)
        X_history, y_history = self.X_flat[:history_end], self.y_flat[:history_end]
        X_test, y_test = self.X_flat[history_end:], self.y_flat[history_end:]
        print(f"   History buffer size: {len(y_history)}")
        print(f"   Official Test Stream size: {len(y_test)}")
        
        nb_auc_scores = self.run_naive_bayes_stream(X_history, y_history, X_test, y_test)
        arf_auc_scores = self.run_arf_stream(X_history, y_history, X_test, y_test)
        
        print("\n" + "═" * 60)
        print("4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores")
        print("═" * 60)
        
        if len(nb_auc_scores) != len(arf_auc_scores):
            print(f"❌ ERROR: Score lists have different lengths. NB: {len(nb_auc_scores)}, ARF: {len(arf_auc_scores)}. Cannot perform test.")
            return

        print(f"   Paired Scores for Naive Bayes: {[f'{s:.3f}' for s in nb_auc_scores[:5]]} ...")
        print(f"   Paired Scores for ARF+ADWIN:   {[f'{s:.3f}' for s in arf_auc_scores[:5]]} ...")
        
        statistic, p_value = wilcoxon(nb_auc_scores, arf_auc_scores, alternative='greater')
        
        print(f"\n   Number of paired observations: {len(nb_auc_scores)}")
        print(f"   Wilcoxon T-statistic: {statistic:.4f}")
        print(f"   P-value: {p_value:.6f}")
        
        alpha = 0.05
        print(f"\n   Significance level (alpha): {alpha}")
        if p_value < alpha:
            print("✅ **Conclusion**: The result is statistically significant.")
            print("   We can reject the null hypothesis. The superior performance of Naive Bayes is not due to random chance.")
        else:
            print("ℹ️ **Conclusion**: The result is not statistically significant.")
            print("   We cannot reject the null hypothesis that the models perform equally.")

if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        "nb_win_size": 500,
        "arf_champion_params": {'n_models': 5, 'grace_period': 281, 'delta': 2.25e-06},
        "eval_step": 250,
    }
    
    tester = Statistical_Test_Runner(config=CONFIG)
    tester.run_test()

1. Loading and cleaning data...
2. Preparing unified flattened data windows...
✅ Unified data stream prepared. Total sequences: 20428

3. Setting up the unified 'Test Stream' from sequence data...
   History buffer size: 4085
   Official Test Stream size: 16343

--- Running Naive Bayes on the unified test stream ---

--- Running ARF+ADWIN on the unified test stream ---
   Warming up ARF model on history buffer...

════════════════════════════════════════════════════════════
4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores
════════════════════════════════════════════════════════════
   Paired Scores for Naive Bayes: ['0.611', '0.555', '0.536', '0.558', '0.546'] ...
   Paired Scores for ARF+ADWIN:   ['0.295', '0.313', '0.442', '0.486', '0.545'] ...

   Number of paired observations: 32
   Wilcoxon T-statistic: 74.0000
   P-value: 0.999919

   Significance level (alpha): 0.05
ℹ️ **Conclusion**: The result is not statistically significant.
   We cannot reject the null hypot

In [6]:
#!/usr/bin/env python
# coding: utf-8
#
# Final Statistical Test: Naive Bayes (Adaptability) vs. LSTM (Deep Knowledge)
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from scipy.stats import wilcoxon
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings("ignore", category=UserWarning)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMWithAttention(nn.Module):
    """An LSTM with a basic attention mechanism."""
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attn_layer = nn.Linear(hidden_size, 1)
        self.output_layer = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attn_weights = torch.softmax(self.attn_layer(lstm_out), dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)
        return self.sigmoid(self.output_layer(context))

class Final_Showdown:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        
        self.X_flat, self.y_flat = self._make_windows(flatten=True)
        self.X_seq, self.y_seq = self._make_windows(flatten=False)
        self.config['n_features_seq'] = self.X_seq.shape[2]
        
        print(f"✅ Data prepared. Total sequences: {len(self.y_flat)}")
        print(f"✅ Using device: {DEVICE}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        return df.dropna()

    def _make_windows(self, flatten: bool) -> Tuple[np.ndarray, np.ndarray]:
        print(f"2. Preparing {'flattened' if flatten else 'sequential'} data windows...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                win = arr[i - cfg['lags']:i]
                X.append(win.ravel() if flatten else win)
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)
    
    def _get_chunk_auc_scores(self, y_trues: List[int], y_probs: List[float], step: int) -> List[float]:
        auc_scores = []
        for i in range(step, len(y_trues) + 1, step):
            if len(np.unique(y_trues[:i])) < 2:
                auc_scores.append(0.5)
                continue
            auc = roc_auc_score(y_trues[:i], y_probs[:i])
            auc_scores.append(auc)
        return auc_scores

    def run_naive_bayes_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running Naive Bayes on the test stream ---")
        cfg, win_size = self.config, self.config['nb_win_size']
        all_probs, all_trues = [], []
        
        for i in range(len(X_test)):
            current_train_X = np.vstack([X_history, X_test[:i]])[-win_size:]
            current_train_y = np.concatenate([y_history, y_test[:i]])[-win_size:]
            if len(np.unique(current_train_y)) < 2:
                all_probs.append(0.5); all_trues.append(y_test[i]); continue
            
            scaler = StandardScaler().fit(current_train_X)
            model = GaussianNB().fit(scaler.transform(current_train_X), current_train_y)
            y_prob = model.predict_proba(scaler.transform(X_test[i].reshape(1, -1)))[:, 1][0]
            all_probs.append(y_prob)
            all_trues.append(y_test[i])
            
        return self._get_chunk_auc_scores(all_trues, all_probs, cfg['eval_step'])

    def run_lstm_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running LSTM on the test stream ---")
        cfg, params = self.config, self.config['lstm_champion_params']
        
        print("   Training LSTM model on full history...")
        scaler = StandardScaler().fit(X_history.reshape(-1, cfg['n_features_seq']))
        X_history_std = scaler.transform(X_history.reshape(-1, cfg['n_features_seq'])).reshape(X_history.shape)
        
        model_arch_params = {k: v for k, v in params.items() if k != 'lr'}
        learning_rate = params['lr']
        model = LSTMWithAttention(input_size=cfg['n_features_seq'], **model_arch_params).to(DEVICE)
        
        loader = DataLoader(TensorDataset(torch.tensor(X_history_std, dtype=torch.float32), torch.tensor(y_history, dtype=torch.float32).unsqueeze(1)),
                          batch_size=cfg['batch_size'], shuffle=True)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        loss_fn = nn.BCELoss()
        
        for epoch in range(cfg['epochs']):
            for xb, yb in loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                loss = loss_fn(model(xb), yb); optimizer.zero_grad(); loss.backward(); optimizer.step()
        
        print("   Evaluating static LSTM on test stream...")
        model.eval()
        with torch.no_grad():
            X_test_std = scaler.transform(X_test.reshape(-1, cfg['n_features_seq'])).reshape(X_test.shape)
            test_probs = model(torch.tensor(X_test_std, dtype=torch.float32).to(DEVICE)).cpu().numpy().flatten()
        
        return self._get_chunk_auc_scores(y_test.tolist(), test_probs.tolist(), cfg['eval_step'])

    def run_test(self):
        """Orchestrates the comparison and runs the Wilcoxon test."""
        print("\n3. Setting up the official 'Test Stream' (final 20% of data)...")
        test_split = int(len(self.y_flat) * 0.8)
        
        # Data for Naive Bayes (flattened)
        X_history_flat, y_history_flat = self.X_flat[:test_split], self.y_flat[:test_split]
        X_test_flat, y_test_flat = self.X_flat[test_split:], self.y_flat[test_split:]
        
        # Data for LSTM (sequential)
        X_history_seq, y_history_seq = self.X_seq[:test_split], self.y_seq[:test_split]
        X_test_seq, y_test_seq = self.X_seq[test_split:], self.y_seq[test_split:]
        
        print(f"   History buffer size: {len(y_history_flat)}")
        print(f"   Official Test Stream size: {len(y_test_flat)}")
        
        nb_auc_scores = self.run_naive_bayes_stream(X_history_flat, y_history_flat, X_test_flat, y_test_flat)
        lstm_auc_scores = self.run_lstm_stream(X_history_seq, y_history_seq, X_test_seq, y_test_seq)
        
        print("\n" + "═" * 60)
        print("4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores")
        print("═" * 60)
        
        if len(nb_auc_scores) != len(lstm_auc_scores):
            print(f"❌ ERROR: Score lists have different lengths. NB: {len(nb_auc_scores)}, LSTM: {len(lstm_auc_scores)}. Cannot perform test.")
            return

        print(f"   Paired Scores for Naive Bayes: {[f'{s:.3f}' for s in nb_auc_scores[:5]]} ...")
        print(f"   Paired Scores for LSTM:        {[f'{s:.3f}' for s in lstm_auc_scores[:5]]} ...")
        
        statistic, p_value = wilcoxon(nb_auc_scores, lstm_auc_scores, alternative='greater')
        
        print(f"\n   Number of paired observations: {len(nb_auc_scores)}")
        print(f"   Wilcoxon T-statistic: {statistic:.4f}")
        print(f"   P-value: {p_value:.6f}")
        
        alpha = 0.05
        print(f"\n   Significance level (alpha): {alpha}")
        if p_value < alpha:
            print("✅ **Conclusion**: The result is statistically significant.")
            print("   We can reject the null hypothesis. The superior performance of Naive Bayes is not due to random chance.")
        else:
            print("ℹ️ **Conclusion**: The result is not statistically significant.")
            print("   We cannot reject the null hypothesis that the models perform equally.")

if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        
        "nb_win_size": 200,
        "lstm_champion_params": {'hidden_size': 32, 'num_layers': 1, 'dropout': 0.40, 'lr': 0.0013},
        "epochs": 25, "batch_size": 128,
        
        "eval_step": 200, # Get a paired score every 200 samples
    }
    
    tester = Final_Showdown(config=CONFIG)
    tester.run_test()

1. Loading and cleaning data...
2. Preparing flattened data windows...
2. Preparing sequential data windows...
✅ Data prepared. Total sequences: 20428
✅ Using device: cuda

3. Setting up the official 'Test Stream' (final 20% of data)...
   History buffer size: 16342
   Official Test Stream size: 4086

--- Running Naive Bayes on the test stream ---

--- Running LSTM on the test stream ---
   Training LSTM model on full history...
   Evaluating static LSTM on test stream...

════════════════════════════════════════════════════════════
4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores
════════════════════════════════════════════════════════════
   Paired Scores for Naive Bayes: ['0.500', '0.500', '0.387', '0.738', '0.736'] ...
   Paired Scores for LSTM:        ['0.500', '0.500', '0.795', '0.571', '0.594'] ...

   Number of paired observations: 20
   Wilcoxon T-statistic: 153.0000
   P-value: 0.001643

   Significance level (alpha): 0.05
✅ **Conclusion**: The result is stati

In [1]:
#!/usr/bin/env python
# coding: utf-8
#
# Final Statistical Test v2: NB vs. LSTM with Effect Size
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
# 🔥 Key Import for statistical testing and effect size
import pingouin as pg
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings("ignore", category=UserWarning)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMWithAttention(nn.Module):
    """An LSTM with a basic attention mechanism."""
    def __init__(self, input_size, hidden_size, num_layers, dropout):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.attn_layer = nn.Linear(hidden_size, 1)
        self.output_layer = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        lstm_out, _ = self.lstm(x)
        attn_weights = torch.softmax(self.attn_layer(lstm_out), dim=1)
        context = torch.sum(attn_weights * lstm_out, dim=1)
        return self.sigmoid(self.output_layer(context))

class Final_Showdown:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        
        self.X_flat, self.y_flat = self._make_windows(flatten=True)
        self.X_seq, self.y_seq = self._make_windows(flatten=False)
        self.config['n_features_seq'] = self.X_seq.shape[2]
        
        print(f"✅ Data prepared. Total sequences: {len(self.y_flat)}")
        print(f"✅ Using device: {DEVICE}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        return df.dropna()

    def _make_windows(self, flatten: bool) -> Tuple[np.ndarray, np.ndarray]:
        print(f"2. Preparing {'flattened' if flatten else 'sequential'} data windows...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                win = arr[i - cfg['lags']:i]
                X.append(win.ravel() if flatten else win)
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)
    
    def _get_chunk_auc_scores(self, y_trues: List[int], y_probs: List[float], step: int) -> List[float]:
        auc_scores = []
        for i in range(step, len(y_trues) + 1, step):
            if len(np.unique(y_trues[:i])) < 2:
                auc_scores.append(0.5)
                continue
            auc = roc_auc_score(y_trues[:i], y_probs[:i])
            auc_scores.append(auc)
        return auc_scores

    def run_naive_bayes_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running Naive Bayes on the test stream ---")
        cfg, win_size = self.config, self.config['nb_win_size']
        all_probs, all_trues = [], []
        
        for i in range(len(X_test)):
            current_train_X = np.vstack([X_history, X_test[:i]])[-win_size:]
            current_train_y = np.concatenate([y_history, y_test[:i]])[-win_size:]
            if len(np.unique(current_train_y)) < 2:
                all_probs.append(0.5); all_trues.append(y_test[i]); continue
            
            scaler = StandardScaler().fit(current_train_X)
            model = GaussianNB().fit(scaler.transform(current_train_X), current_train_y)
            y_prob = model.predict_proba(scaler.transform(X_test[i].reshape(1, -1)))[:, 1][0]
            all_probs.append(y_prob)
            all_trues.append(y_test[i])
            
        return self._get_chunk_auc_scores(all_trues, all_probs, cfg['eval_step'])

    def run_lstm_stream(self, X_history, y_history, X_test, y_test) -> List[float]:
        print("\n--- Running LSTM on the test stream ---")
        cfg, params = self.config, self.config['lstm_champion_params']
        
        print("   Training LSTM model on full history...")
        scaler = StandardScaler().fit(X_history.reshape(-1, cfg['n_features_seq']))
        X_history_std = scaler.transform(X_history.reshape(-1, cfg['n_features_seq'])).reshape(X_history.shape)
        
        model_arch_params = {k: v for k, v in params.items() if k != 'lr'}
        learning_rate = params['lr']
        model = LSTMWithAttention(input_size=cfg['n_features_seq'], **model_arch_params).to(DEVICE)
        
        loader = DataLoader(TensorDataset(torch.tensor(X_history_std, dtype=torch.float32), torch.tensor(y_history, dtype=torch.float32).unsqueeze(1)),
                          batch_size=cfg['batch_size'], shuffle=True)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        loss_fn = nn.BCELoss()
        
        for epoch in range(cfg['epochs']):
            for xb, yb in loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                loss = loss_fn(model(xb), yb); optimizer.zero_grad(); loss.backward(); optimizer.step()
        
        print("   Evaluating static LSTM on test stream...")
        model.eval()
        with torch.no_grad():
            X_test_std = scaler.transform(X_test.reshape(-1, cfg['n_features_seq'])).reshape(X_test.shape)
            test_probs = model(torch.tensor(X_test_std, dtype=torch.float32).to(DEVICE)).cpu().numpy().flatten()
        
        return self._get_chunk_auc_scores(y_test.tolist(), test_probs.tolist(), cfg['eval_step'])

    def run_test(self):
        """Orchestrates the comparison and runs the Wilcoxon test."""
        print("\n3. Setting up the official 'Test Stream' (final 20% of data)...")
        test_split = int(len(self.y_flat) * 0.8)
        
        X_history_flat, y_history_flat = self.X_flat[:test_split], self.y_flat[:test_split]
        X_test_flat, y_test_flat = self.X_flat[test_split:], self.y_flat[test_split:]
        
        X_history_seq, y_history_seq = self.X_seq[:test_split], self.y_seq[:test_split]
        X_test_seq, y_test_seq = self.X_seq[test_split:], self.y_seq[test_split:]
        
        print(f"   History buffer size: {len(y_history_flat)}")
        print(f"   Official Test Stream size: {len(y_test_flat)}")
        
        nb_auc_scores = self.run_naive_bayes_stream(X_history_flat, y_history_flat, X_test_flat, y_test_flat)
        lstm_auc_scores = self.run_lstm_stream(X_history_seq, y_history_seq, X_test_seq, y_test_seq)
        
        print("\n" + "═" * 60)
        print("4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores")
        print("═" * 60)
        
        if len(nb_auc_scores) != len(lstm_auc_scores):
            print(f"❌ ERROR: Score lists have different lengths. NB: {len(nb_auc_scores)}, LSTM: {len(lstm_auc_scores)}. Cannot perform test.")
            return

        # 🔥 FIX: Using pingouin for a more comprehensive statistical test
        # The pingouin library makes getting the p-value and effect size very easy.
        stats_df = pg.wilcoxon(nb_auc_scores, lstm_auc_scores, alternative='greater')
        
        p_value = stats_df['p-val'].iloc[0]
        # The rank-biserial correlation is a great effect size 'r' for Wilcoxon
        effect_size_r = stats_df['RBC'].iloc[0] 
        
        print(stats_df) # Print the full statistical results table
        
        print(f"\n   Number of paired observations: {len(nb_auc_scores)}")
        print(f"   P-value: {p_value:.6f}")
        print(f"   Effect Size (r): {effect_size_r:.4f}")
        
        alpha = 0.05
        print(f"\n   Significance level (alpha): {alpha}")
        if p_value < alpha:
            print("✅ **Conclusion**: The result is statistically significant.")
            print("   We can reject the null hypothesis. The superior performance of Naive Bayes is not due to random chance.")
        else:
            print("ℹ️ **Conclusion**: The result is not statistically significant.")
            print("   We cannot reject the null hypothesis that the models perform equally.")

if __name__ == "__main__":
    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        
        "nb_win_size": 200,
        "lstm_champion_params": {'hidden_size': 32, 'num_layers': 1, 'dropout': 0.40, 'lr': 0.0013},
        "epochs": 25, "batch_size": 128,
        
        "eval_step": 200,
    }
    
    # You might need to install pingouin: pip install pingouin
    tester = Final_Showdown(config=CONFIG)
    tester.run_test()

1. Loading and cleaning data...
2. Preparing flattened data windows...
2. Preparing sequential data windows...
✅ Data prepared. Total sequences: 20428
✅ Using device: cuda

3. Setting up the official 'Test Stream' (final 20% of data)...
   History buffer size: 16342
   Official Test Stream size: 4086

--- Running Naive Bayes on the test stream ---

--- Running LSTM on the test stream ---
   Training LSTM model on full history...
   Evaluating static LSTM on test stream...

════════════════════════════════════════════════════════════
4. Performing Wilcoxon Signed-Rank Test on Cumulative AUC Scores
════════════════════════════════════════════════════════════
          W-val alternative     p-val       RBC    CLES
Wilcoxon  153.0     greater  0.001762  0.789474  0.7125

   Number of paired observations: 20
   P-value: 0.001762
   Effect Size (r): 0.7895

   Significance level (alpha): 0.05
✅ **Conclusion**: The result is statistically significant.
   We can reject the null hypothesis. The