In [None]:
# Import necessary libraries
import os
import platform
import sys

import pandas as pd

device_name = platform.node()

previous_folder = os.getcwd()
print("This is the working folder: " + previous_folder)

if device_name == 'mmd-MS-7D98':
    # On this remote server a new session does not start in the repository
    # folder, so move there before the project settings are imported.
    os.chdir("/media/mmd/Samsung_T5/GitHub/UMD")

if previous_folder != os.getcwd():  # The working folder was changed above
    print("The current working folder has been changed, now the working folder is: " + os.getcwd())

# Make the project packages importable. Each entry is appended only once, so
# re-running this cell does not keep growing sys.path with duplicate entries.
for _extra_path in ("..", "my_library"):
    if _extra_path not in sys.path:
        sys.path.append(_extra_path)

# NOTE(review): the star imports hide which names come from the project
# modules; later cells use `root_dir`, which is presumably provided here —
# TODO confirm and switch to explicit imports.
from my_library.config import *
from my_library.metrics.cmp_metrics import *

# Defined after the star imports, so this value wins over any imported
# `n_classes`.
n_classes = 2

**Finding Prediction Files**:
   - Walks through directories to find all Parquet files in folders containing "attention" in their name
   - Collects and sorts paths to these prediction files

In [None]:
# Walk the whole tree below root_dir and keep every file ending in ".parquet"
# that lies directly inside a folder whose own name contains "attention".
# Both the folder-name check and the extension check are case-insensitive.
test_parquet_paths = [
    os.path.join(folder, filename)
    for folder, _subfolders, folder_files in os.walk(root_dir)
    if "attention" in os.path.basename(folder).lower()
    for filename in folder_files
    if filename.lower().endswith(".parquet")
]

# Put the collected paths in alphabetical order
test_parquet_paths.sort()


In [None]:
import json  # Import the JSON module to work with JSON files

# Stop with an explicit message instead of a bare IndexError when the search
# for prediction files came back empty.
if not test_parquet_paths:
    raise FileNotFoundError(
        "No prediction .parquet file was found, so mil_params.json cannot be located."
    )

# Build the parameters path by swapping "predictions.parquet" for
# "mil_params.json" in the first prediction path
json_p = test_parquet_paths[0].replace("predictions.parquet", "mil_params.json")

# Open the JSON file and load its contents
with open(json_p, 'r', encoding='utf-8') as file:
    data = json.load(file)

# Keep the outcome label names as a real list (not a dict view), so they can
# be indexed, measured with len() and handed to other libraries as-is.
# NOTE(review): the order is the order of the entries in the JSON file —
# confirm it matches the class indices used by the predictions.
outcome_labels = list(data['mil_params']['outcome_labels'].values())

# Extract the 'outcomes' entry of the parameters
outcomes = data['mil_params']['outcomes']

# Print the outcome labels
print(f'These are the labels: {outcome_labels}')


In [None]:
test_preds = []  # One processed prediction DataFrame per parquet file

# Loop over each path in the list of test parquet files
for path in test_parquet_paths:
    df = pd.read_parquet(path)  # Read the parquet file into a DataFrame

    # Columns whose name starts with "y_pred" hold the per-class predictions
    y_pred_columns = [col for col in df.columns if col.startswith("y_pred")]

    # For each row take the name of the column with the largest value and pull
    # the first run of digits out of it (e.g., "y_pred_2" → 2).
    # The pattern is a raw string so "\d" is not an invalid escape sequence,
    # and expand=False returns a Series rather than a one-column DataFrame.
    df["y_pred_max_idx"] = (
        df[y_pred_columns]
        .idxmax(axis=1)
        .str.extract(r"(\d+)", expand=False)
        .astype(int)
    )

    # Drop the original y_pred columns to keep only the prediction index
    df = df.drop(columns=y_pred_columns)

    # Append the processed DataFrame to the list
    test_preds.append(df)

# Concatenate all individual DataFrames into one big DataFrame
test_preds_df = pd.concat(test_preds, ignore_index=True)


In [None]:
# One row per (slide, y_true) pair; the predicted class indices that belong to
# the pair are gathered into a single list
df_final = (
    test_preds_df
    .groupby(["slide", "y_true"])["y_pred_max_idx"]
    .agg(list)
    .reset_index()
)

# Spread every list over its own columns and name each column after its
# 1-based position (y_pred_max_idx_1, y_pred_max_idx_2, ...)
df_final_expanded = (
    df_final["y_pred_max_idx"]
    .apply(pd.Series)
    .rename(columns=lambda position: f"y_pred_max_idx_{position+1}")
)

# Put the key columns (without the list column) next to the expanded columns
df_result = pd.concat(
    [df_final.drop(columns=["y_pred_max_idx"]), df_final_expanded],
    axis=1,
)


In [None]:
# Select all columns that contain 'y_pred_max_idx_' (i.e., the expanded prediction columns)
colonne_pred = [col for col in df_result.columns if 'y_pred_max_idx_' in col]

# Majority vote across the prediction columns of each row.
# - mode() ignores missing values by default, so rows with fewer predictions
#   than the widest row are still voted on correctly.
# - Column 0 of the result is the first mode; when several classes tie, pandas
#   returns the modes sorted, so the smallest tied class index is chosen.
# - Missing values turn the prediction columns into floats, so the result is
#   cast back to int to stay a class index.
df_result['y_pred'] = df_result[colonne_pred].mode(axis=1)[0].astype(int)

# Display the first few rows of the updated DataFrame
print(df_result.head())


In [None]:
# Keep only the slide identifier, the true label and the voted prediction
df_test = df_result.loc[:, ["slide", "y_true", "y_pred"]]

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report, precision_score, recall_score, f1_score, balanced_accuracy_score

def extract_metrics(train_true_y, train_pred_y, class_names, model, name_set, fold=1):
    """Compute classification metrics and return them as a one-row DataFrame.

    Parameters
    ----------
    train_true_y : array-like
        True labels.
    train_pred_y : array-like
        Predicted labels.
    class_names : iterable of str or None
        Display names of the classes, passed to ``classification_report`` as
        ``target_names``. Any iterable (e.g. a dict view) is accepted.
    model : str
        Model identifier. Currently unused: it is only referenced by the
        disabled Excel export at the end of the function.
    name_set : str
        Name of the evaluated dataset split; stored in the "Dataset" column.
    fold : int, default 1
        Fold number stored in the "Fold" column.

    Returns
    -------
    pandas.DataFrame
        A single row with the overall metrics (accuracy, balanced accuracy,
        weighted precision / recall / F1) followed by one
        ``"<label>_<metric>"`` column for every dictionary entry of the
        classification report.
    """
    # Materialise the names so views such as dict_values are handed over as a
    # plain list.
    target_names = None if class_names is None else list(class_names)

    # Shared settings of the weighted scores; zero_division=0 reports 0
    # instead of warning when a class is never predicted.
    weighted = {"average": "weighted", "zero_division": 0}

    row = {
        "Dataset": name_set,
        "Fold": fold,
        "Accuracy": accuracy_score(train_true_y, train_pred_y),
        "Balanced Accuracy": balanced_accuracy_score(train_true_y, train_pred_y),
        "Precision": precision_score(train_true_y, train_pred_y, **weighted),
        "Recall": recall_score(train_true_y, train_pred_y, **weighted),
        "F1_score": f1_score(train_true_y, train_pred_y, **weighted),
    }

    # Get the full classification report as a dictionary
    report = classification_report(
        train_true_y,
        train_pred_y,
        digits=2,
        target_names=target_names,
        output_dict=True,
        zero_division=0,
    )

    # Flatten the nested report entries into "<label>_<metric>" columns
    for label, metrics in report.items():
        if isinstance(metrics, dict):  # Skip 'accuracy', which is just a float
            for metric_name, value in metrics.items():
                row[f"{label}_{metric_name}"] = value

    df_results = pd.DataFrame([row])

    # Rich display when IPython is available, plain print otherwise, so the
    # function also works outside a notebook.
    try:
        from IPython.display import display
    except ImportError:
        display = print
    display(df_results)

    # Optional: save to Excel file
    # df_results.to_excel(model + f"_{name_set}_5_folds.xlsx")

    return df_results

In [None]:
# Pull the true and the predicted labels out of df_test as NumPy arrays
y_true, y_pred = (
    df_test[column].to_numpy() for column in ("y_true", "y_pred")
)

In [None]:
# Compute the metrics of the test split for model "br3"
df = extract_metrics(
    train_true_y=y_true,
    train_pred_y=y_pred,
    class_names=outcome_labels,
    model="br3",
    name_set="test",
)