In [None]:
#!/usr/bin/env python
# coding: utf-8
#
# Final Analysis: Visualizing Concept Drift - v2 (Corrected Plotting)
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any, List, Tuple

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
from river import ensemble, tree, drift
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

warnings.filterwarnings("ignore", category=UserWarning)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class LSTMWithAttention(nn.Module):
    """LSTM encoder whose per-step outputs are pooled by learned attention.

    The LSTM is built with ``batch_first=True``, so batched inputs are laid
    out as ``(batch, time, features)``. A linear layer scores every time
    step, the scores are softmax-normalised along the time axis, and the
    weighted sum of LSTM outputs is passed through a linear layer followed
    by a sigmoid, producing one value in (0, 1) per sequence.
    """

    def __init__(self, input_size, hidden_size, num_layers, dropout):
        """Create the LSTM, the attention scorer and the output head.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the LSTM hidden state.
            num_layers: Number of stacked LSTM layers.
            dropout: Dropout value forwarded to ``nn.LSTM``.
        """
        super().__init__()
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        # One scalar relevance score per time step.
        self.attn_layer = nn.Linear(hidden_size, 1)
        # Maps the pooled context vector to a single output unit.
        self.output_layer = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return the sigmoid output for each sequence in ``x``."""
        hidden_states, _ = self.lstm(x)
        step_scores = self.attn_layer(hidden_states)
        # Normalise across dim=1 (the time axis for batched input).
        step_weights = torch.softmax(step_scores, dim=1)
        pooled = (step_weights * hidden_states).sum(dim=1)
        logits = self.output_layer(pooled)
        return self.sigmoid(logits)

class Final_Drift_Visualizer:
    """Run three modelling strategies on one dataset and plot cumulative F1.

    The strategies are a Gaussian Naive Bayes model refit on a sliding
    window for every prediction, an online bagging ensemble of Hoeffding
    trees that is rebuilt whenever ADWIN signals drift, and an LSTM that is
    trained once and then only evaluated. Each ``run_*`` method returns a
    ``(sample_counts, f1_scores)`` pair that ``plot_final_results`` draws.
    """

    def __init__(self, config: Dict[str, Any]):
        """Load the CSV and pre-compute flattened and sequential lag windows.

        Args:
            config: Settings dictionary. It is stored by reference and the
                key ``'n_features_seq'`` is added to it here.
        """
        self.config = config
        self.df = self._load_data(config['csv_path'])
        # Every non-meta column is treated as a model feature.
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]

        self.X_flat, self.y_flat = self._make_windows(flatten=True)
        self.X_seq, self.y_seq = self._make_windows(flatten=False)
        # Last axis of the (n_windows, lags, n_features) array.
        self.config['n_features_seq'] = self.X_seq.shape[2]

        print(f"✅ Data prepared. Total sequences: {len(self.y_flat)}")
        print(f"✅ Using device: {DEVICE}")

    def _load_data(self, path: str | Path) -> pd.DataFrame:
        """Read the CSV, validate required columns, sort and drop NaN rows.

        Args:
            path: Anything ``pandas.read_csv`` accepts.

        Returns:
            The frame sorted by ``id_col`` then ``quarter_col`` (parsed as
            datetimes), without rows that contain missing values.

        Raises:
            KeyError: If any column listed in ``meta_cols`` is absent.
        """
        print("1. Loading and cleaning data...")
        cfg = self.config
        # Parse the file once; the duplicate-column mask is derived from the
        # same frame instead of re-reading the CSV a second time.
        raw = pd.read_csv(path)
        df = raw.loc[:, ~raw.columns.duplicated()].copy()

        missing = set(cfg['meta_cols']) - set(df.columns)
        if missing:
            raise KeyError(f"Missing cols: {missing}")

        df[cfg['quarter_col']] = pd.to_datetime(df[cfg['quarter_col']])
        df = df.sort_values([cfg['id_col'], cfg['quarter_col']])
        return df.dropna()

    def _make_windows(self, flatten: bool) -> Tuple[np.ndarray, np.ndarray]:
        """Build lagged feature windows per entity.

        For every group of ``id_col`` and every row index ``i >= lags``, the
        ``lags`` rows preceding ``i`` form the input and the target of row
        ``i`` is the label.

        Args:
            flatten: If true each window is raveled to 1-D; otherwise it is
                kept as a ``(lags, n_features)`` array.

        Returns:
            ``(X, y)`` as NumPy arrays built from the collected windows.
        """
        print(f"2. Preparing {'flattened' if flatten else 'sequential'} data windows...")
        cfg = self.config
        lags = cfg['lags']
        X, y = [], []
        for _, group in self.df.groupby(cfg['id_col']):
            feats = group[self.feat_cols].to_numpy()
            labels = group[cfg['target_col']].to_numpy()
            for i in range(lags, len(group)):
                window = feats[i - lags:i]
                X.append(window.ravel() if flatten else window)
                y.append(labels[i])
        return np.asarray(X), np.asarray(y)

    def _get_rolling_f1(self, y_trues: List[int], y_preds: List[int], step: int) -> Tuple[np.ndarray, np.ndarray]:
        """Compute F1 over growing prefixes of the prediction stream.

        Despite the name, each score is cumulative: the checkpoint at ``i``
        scores the first ``i`` predictions, for ``i = step, 2*step, ...``.

        Args:
            y_trues: True labels in stream order.
            y_preds: Predicted labels aligned with ``y_trues``.
            step: Distance between checkpoints.

        Returns:
            ``(indices, f1_scores)`` as NumPy arrays of equal length.
        """
        indices, f1_scores = [], []
        # The end bound is inclusive so that a stream whose length is an
        # exact multiple of ``step`` still gets its final checkpoint.
        for i in range(step, len(y_trues) + 1, step):
            f1_scores.append(f1_score(y_trues[:i], y_preds[:i], zero_division=0))
            indices.append(i)
        return np.array(indices), np.array(f1_scores)

    def run_naive_bayes(self) -> Tuple[np.ndarray, np.ndarray]:
        """Evaluate Gaussian Naive Bayes refit before every prediction.

        The first 20% of the flattened windows only seed the training
        history. For each later instance, a scaler and a model are fit on
        the ``nb_win_size`` most recent earlier instances, and the positive
        class is predicted when its probability reaches ``nb_threshold``.
        When the training window holds fewer than two classes the
        prediction defaults to 0.

        Returns:
            ``(indices, f1_scores)`` from ``_get_rolling_f1``.
        """
        print("\n--- Running Naive Bayes (per-instance retraining) ---")
        X, y = self.X_flat, self.y_flat
        cfg = self.config
        win_size = cfg['nb_win_size']
        history_end = int(len(y) * 0.2)

        preds, trues = [], []
        for pos in range(history_end, len(X)):
            # Everything before ``pos`` is history; slicing the original
            # arrays selects the same trailing window as stacking the whole
            # history on every iteration, without the repeated copying.
            start = max(0, pos - win_size) if win_size > 0 else 0
            train_X, train_y = X[start:pos], y[start:pos]

            trues.append(y[pos])
            if len(np.unique(train_y)) < 2:
                # GaussianNB cannot give a class-1 probability from one class.
                preds.append(0)
                continue

            scaler = StandardScaler().fit(train_X)
            model = GaussianNB().fit(scaler.transform(train_X), train_y)
            y_prob = model.predict_proba(scaler.transform(X[pos].reshape(1, -1)))[:, 1][0]
            preds.append(int(y_prob >= cfg['nb_threshold']))
        return self._get_rolling_f1(trues, preds, cfg['eval_step'])

    def run_arf(self) -> Tuple[np.ndarray, np.ndarray]:
        """Evaluate an online bagging ensemble that is rebuilt on drift.

        Rows of ``self.df`` are streamed in frame order using the raw
        feature columns (not the lag windows). Each row is predicted first
        and learned afterwards. The 0/1 prediction error feeds an ADWIN
        detector; when it reports drift the ensemble is replaced by a fresh
        one before learning the current row.

        Returns:
            ``(indices, f1_scores)`` from ``_get_rolling_f1``.
        """
        print("\n--- Running ARF+ADWIN (online learning) ---")
        cfg = self.config
        params = cfg['arf_champion_params']
        base_model = tree.HoeffdingTreeClassifier(
            grace_period=params['grace_period'],
            delta=params['delta'],
            split_criterion='hellinger',
        )

        def new_forest():
            # Single place that defines the ensemble, used initially and on reset.
            return ensemble.BaggingClassifier(model=base_model, n_models=params['n_models'], seed=cfg['seed'])

        forest = new_forest()
        detector = drift.ADWIN()
        trues, preds = [], []
        for _, row in self.df.iterrows():
            x, y = row[self.feat_cols].to_dict(), int(row[cfg['target_col']])
            y_pred = forest.predict_one(x)
            # Rows for which the ensemble yields no prediction are not scored.
            if y_pred is not None:
                preds.append(y_pred)
                trues.append(y)
                detector.update(int(y_pred != y))
                if detector.drift_detected:
                    forest = new_forest()
            forest.learn_one(x, y)
        return self._get_rolling_f1(trues, preds, cfg['eval_step'])

    def run_lstm(self) -> Tuple[np.ndarray, np.ndarray]:
        """Train the LSTM once on the first 80% of windows and score the rest.

        Features are standardised with statistics fit on the training part
        only. The model is trained with Adam and binary cross-entropy for
        ``epochs`` passes, then the remaining windows are scored in a single
        forward pass and thresholded at ``lstm_threshold``.

        Returns:
            ``(indices, f1_scores)`` from ``_get_rolling_f1``.
        """
        print("\n--- Running LSTM (train-once) ---")
        X, y, cfg = self.X_seq, self.y_seq, self.config
        n_feat = cfg['n_features_seq']
        train_end = int(len(y) * 0.8)
        X_train, y_train = X[:train_end], y[:train_end]
        X_test, y_test = X[train_end:], y[train_end:]

        # StandardScaler works on 2-D data: collapse (window, step) into rows,
        # scale per feature, then restore the original 3-D shape.
        scaler = StandardScaler().fit(X_train.reshape(-1, n_feat))
        X_train_std = scaler.transform(X_train.reshape(-1, n_feat)).reshape(X_train.shape)
        X_test_std = scaler.transform(X_test.reshape(-1, n_feat)).reshape(X_test.shape)

        print("   Training LSTM model...")
        all_lstm_params = cfg['lstm_champion_params']
        # 'lr' belongs to the optimiser; every other key is a constructor argument.
        model_arch_params = {k: v for k, v in all_lstm_params.items() if k != 'lr'}
        learning_rate = all_lstm_params['lr']
        model = LSTMWithAttention(input_size=n_feat, **model_arch_params).to(DEVICE)

        train_set = TensorDataset(
            torch.tensor(X_train_std, dtype=torch.float32),
            torch.tensor(y_train, dtype=torch.float32).unsqueeze(1),
        )
        train_loader = DataLoader(train_set, batch_size=cfg['batch_size'], shuffle=True)
        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
        loss_fn = nn.BCELoss()
        for _ in range(cfg['epochs']):
            for xb, yb in train_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                loss = loss_fn(model(xb), yb)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model.eval()
        with torch.no_grad():
            test_probs = model(torch.tensor(X_test_std, dtype=torch.float32).to(DEVICE)).cpu().numpy().flatten()
        preds = (test_probs >= cfg['lstm_threshold']).astype(int)
        return self._get_rolling_f1(y_test.tolist(), preds.tolist(), cfg['eval_step'])

    def plot_final_results(self, all_results: Dict[str, Any]):
        """Draw every model's F1 curve on one figure, save it and show it.

        Args:
            all_results: Maps a model label to its ``(indices, scores)``
                pair. Each label must also be a key of
                ``config['plot_styles']``.
        """
        print("\n--- Generating Final Championship Comparison Plot ---")
        plt.style.use('seaborn-v0_8-whitegrid')
        fig, ax = plt.subplots(figsize=(14, 8))

        max_f1_observed = 0.0
        for name, (indices, scores) in all_results.items():
            if len(scores) > 0 and scores.max() > max_f1_observed:
                max_f1_observed = scores.max()
            details = self.config['plot_styles'][name]
            ax.plot(indices, scores, label=name, marker=details['marker'], color=details['color'],
                    linestyle=details['linestyle'], markersize=5, linewidth=2)

        ax.set_title('Model Performance Over Time: A Comparison of Adaptive Strategies', fontsize=18, weight='bold', pad=20)
        ax.set_xlabel('Number of Samples Evaluated in Stream', fontsize=14, labelpad=10)
        ax.set_ylabel('Cumulative F1-Score', fontsize=14, labelpad=10)
        legend = ax.legend(title='Model Philosophy', fontsize=12, title_fontsize=13)
        plt.setp(legend.get_title(), weight='bold')
        ax.grid(True, which='both', linestyle='--', linewidth=0.5)

        # Leave 10% headroom above the best score, capped at 1.0; fall back
        # to the full [0, 1] range when nothing scored above zero.
        ax.set_ylim(0, min(1.0, max_f1_observed * 1.1) if max_f1_observed > 0 else 1.0)

        plt.tight_layout()
        # One variable feeds both savefig and the message so the reported
        # file name always matches the file actually written.
        out_path = "CHAMPIONSHIP_FINAL_PLOT.png"
        plt.savefig(out_path, dpi=300)
        print(f"\n✅ Grand Finale plot saved as '{out_path}'")
        plt.show()

    def run_all(self):
        """Orchestrates the entire analysis."""
        all_results = {}
        all_results['Naive Bayes (Per-Instance Adaptation)'] = self.run_naive_bayes()
        all_results['ARF+ADWIN (Online Ensemble)'] = self.run_arf()
        all_results['LSTM (Static Knowledge)'] = self.run_lstm()

        self.plot_final_results(all_results)


if __name__ == "__main__":
    # Line styling per model; the keys are the labels run_all() stores its
    # results under, which plot_final_results() uses for the lookup.
    plot_styles = {
        'Naive Bayes (Per-Instance Adaptation)': {'color': 'blue', 'marker': 'o', 'linestyle': '-'},
        'ARF+ADWIN (Online Ensemble)': {'color': 'green', 'marker': 's', 'linestyle': '--'},
        'LSTM (Static Knowledge)': {'color': 'red', 'marker': 'x', 'linestyle': ':'},
    }

    # Hyper-parameters for the Hoeffding-tree ensemble.
    arf_params = {'n_models': 5, 'grace_period': 281, 'delta': 2.25e-06}

    # LSTM hyper-parameters; 'lr' is consumed by the optimiser, the rest by
    # the LSTMWithAttention constructor.
    lstm_params = {'hidden_size': 32, 'num_layers': 1, 'dropout': 0.40, 'lr': 0.0013}

    CONFIG = {
        # Input file and the role of its non-feature columns.
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID",
        "quarter_col": "QUARTER",
        "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        # Window length (rows per entity) and random seed.
        "lags": 4,
        "seed": 42,
        # Spacing between cumulative-F1 checkpoints.
        "eval_step": 500,

        # Naive Bayes: training window size and decision threshold.
        "nb_win_size": 200,
        "nb_threshold": 0.10,
        "arf_champion_params": arf_params,
        "lstm_champion_params": lstm_params,
        "lstm_threshold": 0.15,

        # LSTM training loop.
        "epochs": 25,
        "batch_size": 128,

        "plot_styles": plot_styles,
    }

    visualizer = Final_Drift_Visualizer(config=CONFIG)
    visualizer.run_all()

1. Loading and cleaning data...
2. Preparing flattened data windows...
2. Preparing sequential data windows...
✅ Data prepared. Total sequences: 20428
✅ Using device: cuda

--- Running Naive Bayes (per-instance retraining) ---
