In [23]:
# Imports
import optuna
import pandas as pd

import torch
import numpy as np
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, hamming_loss

# Threshold tuning

In [19]:
# Import predictions

# Load the CSV file
data = pd.read_csv('output_probs_labels.csv')

# Extract probabilities and labels as numpy arrays
probs = data.iloc[:, 0:9]  # Assuming all columns except the last are probabilities
labels = data.iloc[:, 9:18] # Assuming the last column contains labels

# Convert to numpy arrays
probs = probs.to_numpy()
labels = labels.to_numpy()

In [21]:

# probs: [num_samples, num_classes] – sigmoid outputs
# labels: [num_samples, num_classes] – binary labels
# device: torch.device

def tune_thresholds(probs, labels, num_classes):
    probs = np.array(probs)
    labels = np.array(labels)

    # Create a mask where labels are valid (not -1)
    valid_mask = (labels != -1)

    def objective(trial):
        thresholds = np.array([trial.suggest_float(f"thresh_{i}", 0.1, 0.9) for i in range(num_classes)])
        preds = (probs > thresholds).astype(int)

        # Mask invalid labels
        masked_preds = preds[valid_mask]
        masked_labels = labels[valid_mask]

        return f1_score(masked_labels, masked_preds, average="macro")

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    best_thresholds = [study.best_trial.params[f"thresh_{i}"] for i in range(num_classes)]
    return best_thresholds


In [22]:
best_thresholds = tune_thresholds(probs, labels, num_classes=9)
print("Optimal thresholds per class:", best_thresholds)

[I 2025-04-10 10:59:38,790] A new study created in memory with name: no-name-0a6ae7b3-8ced-41c3-88e1-15d6faec3755
[I 2025-04-10 10:59:38,810] Trial 0 finished with value: 0.42187819168951246 and parameters: {'thresh_0': 0.24497960445408218, 'thresh_1': 0.47019718307957725, 'thresh_2': 0.34537339120488175, 'thresh_3': 0.8492670484151638, 'thresh_4': 0.2921411211825372, 'thresh_5': 0.5402323698383705, 'thresh_6': 0.6442364727453451, 'thresh_7': 0.40935180191480447, 'thresh_8': 0.4466391425059346}. Best is trial 0 with value: 0.42187819168951246.
[I 2025-04-10 10:59:38,811] Trial 1 finished with value: 0.4020066036080583 and parameters: {'thresh_0': 0.5712085325640709, 'thresh_1': 0.5314428741109303, 'thresh_2': 0.7169625698025673, 'thresh_3': 0.45541769111340646, 'thresh_4': 0.7309855960268788, 'thresh_5': 0.3681825722531584, 'thresh_6': 0.40320592500746566, 'thresh_7': 0.2532487918284769, 'thresh_8': 0.2533512218155428}. Best is trial 0 with value: 0.42187819168951246.
[I 2025-04-10 10:

Optimal thresholds per class: [0.8390524090165663, 0.586388421787202, 0.7906251831029367, 0.4528015348405562, 0.14460909399916977, 0.6657532460300565, 0.2179021854659861, 0.6275462075758141, 0.8449389609779214]


In [34]:


def evaluate_metrics_with_mask(preds, targets, thresholds=None):
    """
    Calculate metrics for multi-label predictions with masking support.
    
    Args:
        preds (torch.Tensor): Raw logits or probabilities (B x C)
        targets (torch.Tensor): Ground truth labels (B x C), with -1 for missing
        thresholds (float or list/array of floats): Threshold(s) to binarize predictions. Default: 0.5 for all.
        
    Returns:
        dict with accuracy, F1, AUC-ROC, Hamming loss
    """
    preds = torch.sigmoid(torch.tensor(preds))
    
    # Create a binary mask for valid (non -1) entries
    mask = (targets != -1)
    
    # Apply thresholding
    if thresholds is None:
        thresholds = 0.5
    if isinstance(thresholds, float):
        thresholds = torch.full_like(preds, thresholds)
    elif isinstance(thresholds, (list, np.ndarray)):
        thresholds = torch.tensor(thresholds).repeat(preds.shape[0], 1)
    
    binarized_preds = (preds > thresholds).float()
    
    # Masked predictions and targets
    masked_preds = binarized_preds[mask]
    masked_targets = targets[mask]
    masked_probs = preds[mask]



    # Metrics
    accuracy = accuracy_score(masked_targets, masked_preds)
    f1 = f1_score(masked_targets, masked_preds, average="macro")
    hamming = hamming_loss(masked_targets, masked_preds)
    
    try:
        auc_roc = roc_auc_score(masked_targets, masked_probs)
    except:
        auc_roc = float("nan")  # Might fail if only one class is present
    
    return {
        "accuracy": accuracy,
        "f1_macro": f1,
        "hamming_loss": hamming,
        "auc_roc": auc_roc
    }


In [35]:
metrics = evaluate_metrics_with_mask(probs, labels, thresholds=best_thresholds)
print("Metrics with optimal thresholds:", metrics)

Metrics with optimal thresholds: {'accuracy': 0.6753623188405797, 'f1_macro': 0.6748767091270305, 'hamming_loss': 0.32463768115942027, 'auc_roc': 0.8047450868043412}
