In [2]:
#!/usr/bin/env python
# coding: utf-8
#
# BiGRU Champion Model Bake-Off with Optuna - v2 (Corrected)
#
from __future__ import annotations
import warnings
from pathlib import Path
from typing import Dict, Any

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import optuna
from sklearn.metrics import f1_score, precision_score, recall_score, roc_auc_score
from sklearn.preprocessing import StandardScaler

# Suppress Optuna's trial info messages and other warnings
optuna.logging.set_verbosity(optuna.logging.WARNING)
warnings.filterwarnings("ignore", category=UserWarning)
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class BiGRUWithAttention(nn.Module):
    """A Bidirectional GRU with a basic attention mechanism."""
    def __init__(self, input_size, hidden_size=64, num_layers=2, dropout=0.3):
        super().__init__()
        self.gru = nn.GRU(
            input_size, hidden_size, num_layers,
            batch_first=True, dropout=dropout, bidirectional=True
        )
        self.attn_layer = nn.Linear(hidden_size * 2, 1) # x2 for bidirectional
        self.output_layer = nn.Linear(hidden_size * 2, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        gru_out, _ = self.gru(x)
        attn_weights = torch.softmax(self.attn_layer(gru_out), dim=1)
        context = torch.sum(attn_weights * gru_out, dim=1)
        output = self.sigmoid(self.output_layer(context))
        return output

class BiGRU_Champion_Finder:
    def __init__(self, config: Dict[str, Any]):
        self.config = config
        self.df = self._load_data(config['csv_path'])
        self.feat_cols = [c for c in self.df.columns if c not in config['meta_cols']]
        self.X_all, self.y_all = self._make_windows()
        
        self.X_tune, self.y_tune, \
        self.X_val, self.y_val, \
        self.X_test, self.y_test = self._split_data()
        
        # Standardization: Fit ONLY on tune set, transform others
        self.scaler = StandardScaler().fit(self.X_tune.reshape(-1, self.config['n_features']))
        self.X_tune_std = self._scale_data(self.X_tune)
        self.X_val_std = self._scale_data(self.X_val)
        self.X_test_std = self._scale_data(self.X_test)

    # 🔥 FIX: Re-inserting the missing helper methods
    def _load_data(self, path: str | Path) -> pd.DataFrame:
        print("─" * 60 + "\n1. Loading and cleaning data...")
        df = pd.read_csv(path).loc[:, ~pd.read_csv(path).columns.duplicated()]
        req = set(self.config['meta_cols'])
        if missing := req - set(df.columns): raise KeyError(f"Missing cols: {missing}")
        df[self.config['quarter_col']] = pd.to_datetime(df[self.config['quarter_col']])
        df.sort_values([self.config['id_col'], self.config['quarter_col']], inplace=True)
        df = df.dropna()
        num_cols = df.select_dtypes(include=[np.number]).columns
        return df[list(req | set(num_cols))]
    
    def _make_windows(self) -> (np.ndarray, np.ndarray):
        print("2. Preparing sequence data...")
        X, y = [], []
        cfg = self.config
        for _, g in self.df.groupby(cfg['id_col']):
            g = g.sort_values(cfg['quarter_col'])
            arr, lbl = g[self.feat_cols].to_numpy(), g[cfg['target_col']].to_numpy()
            for i in range(cfg['lags'], len(g)):
                # Keep the windowed format for RNNs
                X.append(arr[i - cfg['lags']:i])
                y.append(lbl[i])
        return np.asarray(X), np.asarray(y)

    def _split_data(self):
        print("3. Splitting data into Tune (60%), Validation (20%), and Test (20%) sets...")
        n = len(self.y_all)
        tune_end = int(n * 0.6)
        val_end = int(n * 0.8)
        
        X_tune, y_tune = self.X_all[:tune_end], self.y_all[:tune_end]
        X_val, y_val = self.X_all[tune_end:val_end], self.y_all[tune_end:val_end]
        X_test, y_test = self.X_all[val_end:], self.y_all[val_end:]
        
        print(f"   Tune set size: {len(y_tune)}")
        print(f"   Validation set size: {len(y_val)}")
        print(f"   Test set size: {len(y_test)}")
        return X_tune, y_tune, X_val, y_val, X_test, y_test

    def _scale_data(self, X):
        """Helper to scale 3D sequence data correctly."""
        return self.scaler.transform(X.reshape(-1, self.config['n_features'])).reshape(X.shape)

    def _objective(self, trial: optuna.Trial) -> float:
        """The objective function for Optuna to maximize."""
        cfg = self.config
        params = {
            'hidden_size': trial.suggest_categorical('hidden_size', [32, 64, 128]),
            'num_layers': trial.suggest_int('num_layers', 1, 3),
            'dropout': trial.suggest_float('dropout', 0.1, 0.5),
            'lr': trial.suggest_float('lr', 1e-4, 1e-2, log=True)
        }
        
        model = BiGRUWithAttention(
            input_size=cfg['n_features'], 
            hidden_size=params['hidden_size'],
            num_layers=params['num_layers'],
            dropout=params['dropout']
        ).to(DEVICE)
        
        train_loader = DataLoader(TensorDataset(torch.tensor(self.X_tune_std, dtype=torch.float32), 
                                                torch.tensor(self.y_tune, dtype=torch.float32).unsqueeze(1)),
                                  batch_size=cfg['batch_size'], shuffle=True)
        
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        # 🔥 Using pos_weight for class imbalance
        pos_weight = torch.tensor([cfg['pos_weight_value']], device=DEVICE)
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        model.train()
        for epoch in range(cfg['epochs']):
            for xb, yb in train_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                preds = model(xb)
                loss = loss_fn(preds, yb)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

        model.eval()
        val_probs = []
        with torch.no_grad():
            val_tensor = torch.tensor(self.X_val_std, dtype=torch.float32).to(DEVICE)
            # Use sigmoid here because BCEWithLogitsLoss doesn't have it internally
            val_probs = torch.sigmoid(model(val_tensor)).cpu().numpy().flatten()
            
        best_f1 = 0
        for threshold in np.arange(0.1, 0.9, 0.05):
            preds = (val_probs > threshold).astype(int)
            best_f1 = max(best_f1, f1_score(self.y_val, preds))
        
        return best_f1

    def _evaluate_champion_model(self, params: Dict[str, Any]):
        """Trains the champion BiGRU on all history and evaluates on the test set."""
        print("\n--- Training and Evaluating Champion BiGRU Model ---")
        cfg = self.config
        
        X_train_final = np.vstack([self.X_tune_std, self.X_val_std])
        y_train_final = np.concatenate([self.y_tune, self.y_val])

        print(f"Final training on {len(y_train_final)} samples...")
        
        final_loader = DataLoader(TensorDataset(torch.tensor(X_train_final, dtype=torch.float32), 
                                                torch.tensor(y_train_final, dtype=torch.float32).unsqueeze(1)),
                                  batch_size=cfg['batch_size'], shuffle=True)

        model = BiGRUWithAttention(
            input_size=cfg['n_features'],
            hidden_size=params['hidden_size'],
            num_layers=params['num_layers'],
            dropout=params['dropout']
        ).to(DEVICE)
        optimizer = torch.optim.Adam(model.parameters(), lr=params['lr'])
        pos_weight = torch.tensor([cfg['pos_weight_value']], device=DEVICE)
        loss_fn = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

        model.train()
        for epoch in range(cfg['epochs_final']): # Use more epochs for final training
            for xb, yb in final_loader:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                loss = loss_fn(model(xb), yb)
                optimizer.zero_grad(); loss.backward(); optimizer.step()
        print("Final training complete.")
        
        model.eval()
        with torch.no_grad():
            test_tensor = torch.tensor(self.X_test_std, dtype=torch.float32).to(DEVICE)
            test_probs = torch.sigmoid(model(test_tensor)).cpu().numpy().flatten()
        
        print("\n  Tuning classification threshold and calculating all metrics on test set...")
        final_auc = roc_auc_score(self.y_test, test_probs)
        best_f1, best_thresh, best_prec, best_rec = 0, 0, 0, 0
        for threshold in np.arange(0.1, 0.9, 0.01):
            preds = (test_probs > threshold).astype(int)
            current_f1 = f1_score(self.y_test, preds, zero_division=0)
            if current_f1 > best_f1:
                best_f1, best_thresh = current_f1, threshold
                best_prec = precision_score(self.y_test, preds, zero_division=0)
                best_rec = recall_score(self.y_test, preds, zero_division=0)
        final_gmean = np.sqrt(best_prec * best_rec) if best_prec > 0 and best_rec > 0 else 0

        print(f"\n[Optuna-Tuned BiGRU] Final Test Set Performance:")
        print(f"  Best Threshold = {best_thresh:.2f}")
        print(f"  F1-Score       = {best_f1:.4f}")
        print(f"  AUC            = {final_auc:.4f}")
        print(f"  G-Mean         = {final_gmean:.4f}")
        print(f"  Precision      = {best_prec:.4f}")
        print(f"  Recall         = {best_rec:.4f}")

    def run(self):
        """Orchestrates the entire BiGRU bake-off process."""
        print("\n" + "═" * 60)
        print("Starting BiGRU Championship Bake-Off")
        print("═" * 60)
        
        print("4. Starting Optuna optimization process...")
        study = optuna.create_study(direction='maximize')
        study.optimize(self._objective, n_trials=self.config['optuna_trials'], show_progress_bar=True)
        
        print(f"\nOptuna process finished!")
        print(f"🏆 Best F1-score on Validation Set: {study.best_value:.4f}")
        print(f"🏆 Best Hyperparameters Found: {study.best_params}")
        
        self._evaluate_champion_model(study.best_params)
        print("\nBiGRU Bake-Off Complete!")


if __name__ == "__main__":
    temp_df = pd.read_csv(r'cvm_indicators_dataset_2011-2021.csv')
    n_features = len(temp_df.columns) - 3 

    CONFIG = {
        "csv_path": r'cvm_indicators_dataset_2011-2021.csv',
        "id_col": "ID", "quarter_col": "QUARTER", "target_col": "LABEL",
        "meta_cols": ["ID", "QUARTER", "LABEL"],
        "lags": 4, "seed": 42,
        "n_features": n_features,
        
        "optuna_trials": 30,

        "epochs": 15,
        "epochs_final": 25, # More epochs for the final champion model
        "batch_size": 128,
        "pos_weight_value": 35 # Calculated from (num_negative / num_positive)
    }

    champion_finder = BiGRU_Champion_Finder(config=CONFIG)
    champion_finder.run()

────────────────────────────────────────────────────────────
1. Loading and cleaning data...
2. Preparing sequence data...
3. Splitting data into Tune (60%), Validation (20%), and Test (20%) sets...
   Tune set size: 12256
   Validation set size: 4086
   Test set size: 4086

════════════════════════════════════════════════════════════
Starting BiGRU Championship Bake-Off
════════════════════════════════════════════════════════════
4. Starting Optuna optimization process...


  0%|          | 0/30 [00:00<?, ?it/s]


Optuna process finished!
🏆 Best F1-score on Validation Set: 0.4384
🏆 Best Hyperparameters Found: {'hidden_size': 64, 'num_layers': 1, 'dropout': 0.27658312824415443, 'lr': 0.00037063679258648243}

--- Training and Evaluating Champion BiGRU Model ---
Final training on 16342 samples...
Final training complete.

  Tuning classification threshold and calculating all metrics on test set...

[Optuna-Tuned BiGRU] Final Test Set Performance:
  Best Threshold = 0.72
  F1-Score       = 0.3074
  AUC            = 0.7474
  G-Mean         = 0.3150
  Precision      = 0.2526
  Recall         = 0.3927

BiGRU Bake-Off Complete!
