### LIBRARIES

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss

### LOSS

In [2]:
def aggregated_log_loss(
    y_true,
    y_pred
):
    """
    Calculate the aggregated logistic loss.

    The log loss is computed separately for every label column of `y_pred`
    and the per-column losses are averaged with equal weight.

    Parameters
    ----------
    y_true
        pandas.DataFrame with one column per label, or array-like with the
        same shape as `y_pred`.
        Ground truth (correct) labels for n_samples samples.

    y_pred
        pandas.DataFrame with one column per label, or array-like of float,
        shape = (n_samples, n_labels) or (n_samples,)
        Predicted probabilities, as returned by a classifier’s predict_proba method.
        Its columns decide which labels are scored.

    Returns
    -------
    mean_log_loss
        The unweighted mean of the per-label log losses.

    Raises
    ------
    KeyError
        If `y_true` has no column matching one of the columns of `y_pred`.

    Notes
    -----
    This function performs no index alignment between `y_true` and `y_pred`;
    callers should pass both in the same sample order.

    When `y_pred` has no `columns` attribute (list, ndarray, Series, ...),
    both inputs are converted to positional-column frames, so columns are
    then matched by position rather than by name.

    https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html
    """
    if not hasattr(y_pred, "columns"):
        # Plain array-likes carry no column labels. Drop any labels y_true may
        # have as well, so both sides use the same positional column keys.
        y_pred = pd.DataFrame(np.asarray(y_pred))
        y_true = pd.DataFrame(np.asarray(y_true))

    log_losses = [
        log_loss(y_true[label], y_pred[label])
        for label in y_pred.columns
    ]
    return np.array(log_losses).mean()

### DATA

In [3]:
# Per-sample metadata table, indexed by sample_id.
metadata = pd.read_csv(
    "data/metadata.csv",
    index_col="sample_id",
)

In [4]:
# For each split, map the metadata index to that sample's features_path.
train_files, val_files, test_files = (
    metadata[metadata["split"] == split_name]["features_path"].to_dict()
    for split_name in ("train", "val", "test")
)

In [5]:
# Ground-truth validation labels, indexed by sample_id.
y_true = pd.read_csv("data/val_labels.csv", index_col="sample_id")

# Load the benchmark submission and keep only the validation rows,
# selected by the labels' index.
y_pred = pd.read_csv(
    "submissions/benchmark_logreg_c2_submission.csv",
    index_col="sample_id",
).loc[y_true.index]

In [6]:
# Score the submission on the validation labels: mean of the per-label log losses.
# Kept as the bare last expression so the notebook displays the value.
aggregated_log_loss(y_true=y_true, y_pred=y_pred)

0.2278142295636966