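### Manual results verification

Aggregates tile-level MSI probabilities into slide-level scores (top-10 mean, mean, max, majority vote) and recomputes the slide-level metrics for one fold.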

In [1]:
import pandas as pd
import numpy as np
from utils import  custom_label_binarize
from sklearn.metrics import roc_curve, auc, f1_score, balanced_accuracy_score, accuracy_score, roc_auc_score

In [11]:
# Folder holding the exported ground truth and tile-level predictions of one fold.
FOLD_DIR = r"E:\KSA Project\Project_Pipeline\existing_approaches\CAIMAN_Fivecrop_4Folds\MSI_vs_MSS_T50R50\fold4"
# Split to verify: "val" or "test". Change this instead of (un)commenting read_csv lines.
SPLIT = "val"

# Ground-truth table; each row is treated downstream as one slide.
ground_truth = pd.read_csv(rf"{FOLD_DIR}\{SPLIT}_GT.csv")
# Tile-level predictions for the same split.
patch_predictions = pd.read_csv(rf"{FOLD_DIR}\{SPLIT}_tile_pred_AUC.csv")

In [12]:
# Merge patch predictions with ground truth labels.
# NOTE: slideidx is assumed to follow the row order of the ground-truth file.
ground_truth["slideidx"] = range(len(ground_truth))
merged_df = patch_predictions.merge(ground_truth, on="slideidx", how="left")

# A left merge leaves NaN labels for every patch whose slideidx has no
# ground-truth row. Fail here with a clear message instead of deep inside
# the metric code.
unlabelled = merged_df["label_id"].isna()
if unlabelled.any():
    missing_ids = sorted(merged_df.loc[unlabelled, "slideidx"].unique().tolist())
    raise ValueError(f"No ground-truth label for slideidx values: {missing_ids}")

# Group data by slide index
grouped = merged_df.groupby("slideidx")

# Number of highest-scoring patches averaged by the "top10" aggregation.
TOP_K = 10

# One entry per slide, appended in group-iteration order so that every column
# of the final frame (including slideidx) is guaranteed to line up.
slide_ids = []
top10_probs = []
avg_probs = []
max_probs = []
majority_probs = []
labels = []

# Aggregation methods
for slideidx, group in grouped:
    msi_probs = group["MSI_prob"].values
    slide_ids.append(slideidx)

    # Mean of the TOP_K largest MSI probabilities; the slice simply returns
    # every patch when a slide has fewer than TOP_K of them.
    top10_probs.append(np.mean(np.sort(msi_probs)[-TOP_K:]))

    # Mean MSI probability over all patches of the slide.
    avg_probs.append(np.mean(msi_probs))

    # Highest MSI probability of any patch.
    max_probs.append(np.max(msi_probs))

    # Majority vote expressed as the fraction of patches with MSI_prob > 0.5.
    majority_probs.append(np.mean(msi_probs > 0.5))

    # All patches of a slide carry the same merged label; take the first.
    labels.append(group["label_id"].iloc[0])

# Save aggregated results in a new DataFrame
aggregated_results = pd.DataFrame({
    "slideidx": slide_ids,
    "top10_msi_prob": top10_probs,
    "avg_msi_prob": avg_probs,
    "max_msi_prob": max_probs,
    "majority_msi_prob": majority_probs,
    "true_label": labels
})

# Helper: pick decision thresholds (Youden's J and max-F1) from the ROC curve.
def calculate_thresholds(y_true, y_probs):
    """
    Pick decision thresholds for a binary classifier from its ROC curve.

    Candidate thresholds are the ones returned by ``roc_curve``. Two
    selection rules are applied to them:

    * ``youdens_j`` - the threshold maximising ``tpr - fpr``.
    * ``max_f1``    - the threshold whose predictions
      (``y_probs >= threshold``) give the highest binary F1 score; on ties
      the first candidate in ``roc_curve`` order wins.

    Args:
        y_true (array-like): True binary labels.
        y_probs (array-like): Predicted scores/probabilities for the positive class.

    Returns:
        dict: ``{"youdens_j": threshold, "max_f1": threshold}``.
    """
    # Accept lists / pandas Series as well as arrays: the element-wise
    # comparison below needs array semantics.
    y_true = np.asarray(y_true)
    y_probs = np.asarray(y_probs)

    fpr, tpr, thresholds = roc_curve(y_true, y_probs)

    # 1. Youden's J statistic
    youdens_threshold = thresholds[np.argmax(tpr - fpr)]

    # 2. Maximizing F1-Score
    # The first roc_curve threshold is a sentinel above every score, so it
    # predicts no positives; zero_division=0 scores that case as 0 (the same
    # value the default yields) without emitting UndefinedMetricWarning.
    f1_per_threshold = [
        f1_score(y_true, (y_probs >= candidate).astype(int), zero_division=0)
        for candidate in thresholds
    ]
    # argmax returns the first maximum, matching a strict ">" running-best scan.
    best_f1_threshold = thresholds[int(np.argmax(f1_per_threshold))]

    return {
        "youdens_j": youdens_threshold,
        "max_f1": best_f1_threshold,
    }
# Calculate evaluation metrics for each aggregation method.
# NOTE(review): every threshold is selected and then scored on the same slides,
# so the threshold-dependent metrics below are optimistic estimates.
metrics = {}

# The labels are the same for every aggregation method; read them once.
true_labels = aggregated_results["true_label"].values

for method in ["top10", "avg", "max", "majority"]:
    probs = aggregated_results[f"{method}_msi_prob"].values

    # Calculate optimal thresholds
    thresholds = calculate_thresholds(true_labels, probs)

    # ROC AUC does not depend on the threshold: compute it once per method.
    method_auc = roc_auc_score(true_labels, probs)

    # Evaluate metrics for each threshold
    for name, threshold in thresholds.items():
        preds = (probs >= threshold).astype(int)
        metrics[f"{method}_{name}"] = {
            "threshold": threshold,
            "accuracy": accuracy_score(true_labels, preds),
            "balanced_accuracy": balanced_accuracy_score(true_labels, preds),
            "weighted_f1": f1_score(true_labels, preds, average="weighted"),
            "macro_f1": f1_score(true_labels, preds, average="macro"),
            "roc_auc": method_auc,
        }

# Print metrics (one section per "<aggregation>_<threshold rule>" combination)
for run_name, run_metrics in metrics.items():
    print(f"Metrics for {run_name} method:")
    for metric, value in run_metrics.items():
        print(f"  {metric}: {value:.4f}")

# Save the results to files.
# NOTE: the file names do not encode the split, so running this cell for a
# different split overwrites the previous outputs.
output_dir = r"E:\KSA Project\Project_Pipeline\existing_approaches\CAIMAN_Fivecrop_4Folds\MSI_vs_MSS_T50R50\fold4"
aggregated_results.to_csv(rf"{output_dir}\aggregated_slide_predictions.csv", index=False)
print("done")
# Transposed so that each row is one "<aggregation>_<threshold rule>" entry.
pd.DataFrame(metrics).T.to_csv(rf"{output_dir}\aggregated_slide_metrics.csv", index=True)
print("done")


Metrics for top10_youdens_j method:
  threshold: 0.8864
  accuracy: 0.8100
  balanced_accuracy: 0.7510
  weighted_f1: 0.8266
  macro_f1: 0.6974
  roc_auc: 0.7937
Metrics for top10_max_f1 method:
  threshold: 0.9857
  accuracy: 0.8900
  balanced_accuracy: 0.6882
  weighted_f1: 0.8754
  macro_f1: 0.7298
  roc_auc: 0.7937
Metrics for avg_youdens_j method:
  threshold: 0.0763
  accuracy: 0.8500
  balanced_accuracy: 0.8020
  weighted_f1: 0.8610
  macro_f1: 0.7513
  roc_auc: 0.8329
Metrics for avg_max_f1 method:
  threshold: 0.0763
  accuracy: 0.8500
  balanced_accuracy: 0.8020
  weighted_f1: 0.8610
  macro_f1: 0.7513
  roc_auc: 0.8329
Metrics for max_youdens_j method:
  threshold: 0.8956
  accuracy: 0.7000
  balanced_accuracy: 0.7686
  weighted_f1: 0.7426
  macro_f1: 0.6280
  roc_auc: 0.7663
Metrics for max_max_f1 method:
  threshold: 0.8956
  accuracy: 0.7000
  balanced_accuracy: 0.7686
  weighted_f1: 0.7426
  macro_f1: 0.6280
  roc_auc: 0.7663
Metrics for majority_youdens_j method:
  thre