# In [None]:
# WAE-GAN Evaluation Notebook

import os
import pickle as pkl
import numpy as np
import torch as th
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample

from ArgumentParser import parse_arguments
from train_cycles_adversarial import load_parameters, main, calculate_test_losses

# Helper: load the results of one fold and extract ground-truth labels + scores
def load_scores_and_labels(pkl_path):
    """Load one pickled result file and build label and score arrays.

    The pickle is expected to hold a mapping with ``'train'`` and ``'test'``
    entries, each containing a ``'reconstruction'`` sequence of scores.
    Train entries are labelled 0 and test entries are labelled 1, in that
    order.

    Args:
        pkl_path: Path of the pickle file to read.

    Returns:
        A ``(y_true, y_score)`` tuple of numpy arrays of equal length, with
        the train entries first and the test entries after them.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only open
    # result files produced by a trusted training run.
    with open(pkl_path, "rb") as f:
        result = pkl.load(f)

    # Convert before joining: `+` concatenates lists but adds numpy arrays
    # element-wise, so the stored container type must not decide the result.
    train_scores = np.asarray(result['train']['reconstruction'])
    test_scores = np.asarray(result['test']['reconstruction'])

    y_true = np.concatenate([
        np.zeros(len(train_scores), dtype=int),
        np.ones(len(test_scores), dtype=int),
    ])
    y_score = np.concatenate([train_scores, test_scores])
    return y_true, y_score

# Helper: compute the bootstrap confidence interval of the ROC-AUC
def bootstrap_roc_auc(y_true, y_score, n_bootstrap=1000, seed=42):
    """Estimate the ROC-AUC and a 95% bootstrap interval.

    Samples are drawn with replacement ``n_bootstrap`` times; resamples that
    contain a single class are skipped because ROC-AUC is undefined for them.

    Args:
        y_true: Binary ground-truth labels (any array-like).
        y_score: Scores aligned with ``y_true`` (any array-like).
        n_bootstrap: Number of resamples to draw.
        seed: Seed for the random generator, so results are reproducible.

    Returns:
        A ``(mean, lower, upper)`` tuple: the mean ROC-AUC over the valid
        resamples and the values at the 2.5% and 97.5% positions of the
        sorted resample scores.

    Raises:
        ValueError: If the inputs are empty or differ in length, or if no
            resample contained both classes.
    """
    # Fancy indexing below needs arrays; plain lists would raise TypeError.
    y_true = np.asarray(y_true)
    y_score = np.asarray(y_score)

    n_samples = len(y_true)
    if n_samples == 0:
        raise ValueError("y_true is empty; cannot bootstrap ROC-AUC")
    if len(y_score) != n_samples:
        raise ValueError(
            f"y_true and y_score differ in length: {n_samples} vs {len(y_score)}"
        )

    rng = np.random.RandomState(seed)
    scores = []
    for _ in range(n_bootstrap):
        indices = rng.randint(0, n_samples, n_samples)
        if len(np.unique(y_true[indices])) < 2:
            continue
        scores.append(roc_auc_score(y_true[indices], y_score[indices]))

    if not scores:
        # Without this guard the indexing below fails with an opaque IndexError.
        raise ValueError(
            "no bootstrap resample contained both classes; "
            "ROC-AUC interval is undefined"
        )

    sorted_scores = np.sort(scores)
    lower = sorted_scores[int(0.025 * len(sorted_scores))]
    upper = sorted_scores[int(0.975 * len(sorted_scores))]
    return np.mean(scores), lower, upper

# Helper: standardized metrics

def compute_metrics(y_true, y_pred):
    """Score predicted labels against the ground truth.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels aligned with ``y_true``.

    Returns:
        A dict with the keys ``"accuracy"``, ``"precision"``, ``"recall"``
        and ``"f1"``, each mapped to the value of the matching scikit-learn
        scorer.
    """
    # Table of (result key, scorer); tuple order fixes the dict's key order.
    scorers = (
        ("accuracy", accuracy_score),
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    return {label: scorer(y_true, y_pred) for label, scorer in scorers}

# Evaluate every fold and aggregate the results
def evaluate_all_folds(
    base_path,
    folds=10,
    *,
    filename_template=(
        "final_complete_losses_WAE_LSTMDiscriminator_analog_feats_8_2_10_"
        "0.001_0.001_fan_id_00_10_0.001_0.001_fold{fold}.pkl"
    ),
    threshold_percentile=95,
):
    """Evaluate every fold's result file and aggregate the metrics.

    For each fold the scores are thresholded at ``threshold_percentile`` of
    that fold's own score distribution to obtain binary predictions, which
    are then scored with ``compute_metrics``. The ROC-AUC interval is
    bootstrapped over the labels and scores of all folds pooled together.

    Args:
        base_path: Directory containing the per-fold pickle files.
        folds: Number of folds; files for fold indices ``0..folds-1`` are read.
        filename_template: File name pattern with a ``{fold}`` placeholder
            for the fold index.
        threshold_percentile: Percentile of a fold's scores above which a
            sample is predicted as class 1.

    Returns:
        ``(mean_metrics, std_metrics, auc_mean, auc_low, auc_high)`` where
        the first two are dicts keyed by metric name (mean and standard
        deviation across folds) and the last three come from
        ``bootstrap_roc_auc`` on the pooled data.

    Raises:
        ValueError: If ``folds`` is smaller than 1.
    """
    if folds < 1:
        raise ValueError(f"folds must be at least 1, got {folds}")

    all_metrics = []
    pooled_true = []
    pooled_score = []

    for i in range(folds):
        pkl_path = os.path.join(base_path, filename_template.format(fold=i))
        y_true, y_score = load_scores_and_labels(pkl_path)
        y_pred = (y_score > np.percentile(y_score, threshold_percentile)).astype(int)

        all_metrics.append(compute_metrics(y_true, y_pred))
        pooled_true.append(np.asarray(y_true))
        pooled_score.append(np.asarray(y_score))

    metric_names = list(all_metrics[0])
    mean_metrics = {k: np.mean([m[k] for m in all_metrics]) for k in metric_names}
    std_metrics = {k: np.std([m[k] for m in all_metrics]) for k in metric_names}

    # Bootstrap over all folds, not just the last one left in the loop
    # variables. NOTE(review): pooling assumes scores are on a comparable
    # scale across folds -- confirm for the models that produced them.
    auc_mean, auc_low, auc_high = bootstrap_roc_auc(
        np.concatenate(pooled_true), np.concatenate(pooled_score)
    )

    return mean_metrics, std_metrics, auc_mean, auc_low, auc_high

# ▶ Run the evaluation on the per-fold result files stored under "results/".
base_result_path = "results/"
mean_metrics, std_metrics, auc_mean, auc_low, auc_high = evaluate_all_folds(base_result_path)

# Report each metric as "mean ± std" across folds, then the ROC-AUC estimate
# with the interval bounds returned by evaluate_all_folds (three decimals).
print("\n\u2728 Resultados agregados (10-fold cross-validation):")
for k in mean_metrics:
    print(f"{k.capitalize()}: {mean_metrics[k]:.3f} ± {std_metrics[k]:.3f}")
print(f"ROC-AUC: {auc_mean:.3f} (95% CI: {auc_low:.3f} - {auc_high:.3f})")
