In [20]:
import pandas as pd
import os
from collections import defaultdict
from glob import glob
from sklearn.metrics import roc_auc_score


# find all the files with "predictions" in the path from the parent path "results"
pred_files = glob("results/**/**/*predictions*.csv", recursive=True)
# Check for potential duplicates by examining file paths
unique_files = set(pred_files)
if len(unique_files) < len(pred_files):
    print(f"Warning: Found {len(pred_files) - len(unique_files)} duplicate file paths")
    # Count occurrences of each file path
    path_counts = {}
    for path in pred_files:
        path_counts[path] = path_counts.get(path, 0) + 1
    # Show duplicated paths
    duplicates = {path: count for path, count in path_counts.items() if count > 1}
    print(f"Duplicated paths: {duplicates}")
    # Use only unique paths
    pred_files = list(unique_files)
print(f"Found {len(pred_files)} prediction files")

# Calculate ROC-AUC for each file and save results
model_counts = defaultdict(int)
results = []
for file_path in pred_files:
    # Extract model and data type from path
    path_parts = file_path.split(os.sep)
    model_name = path_parts[1]  # e.g., "BERT-base"
    
    # Handle special case for trained models with complex structure
    if 'trained_models' in file_path:
        filename = os.path.basename(file_path)
        
        if 'se_domainHF' in file_path:
            # Extract model type from filename (e.g., SE-COPD, SE-Autoimmune)
            # Extract the disease from the model name
            model_disease = filename.split('_')[2].replace('HF', '')
            # Extract the target dataset from the filename
            target_dataset = filename.split('_')[-2]
            model_name = f"SE-{model_disease.lower()}"
            data_type = target_dataset
        elif 'moe_tokensHF' in file_path:
            # Handle MOE models
            data_type = filename.split('_')[-2]
            model_name = "MOE-ALL"
        elif 'se_all_tokensHF' in file_path:
            # Handle SE-ALL models
            data_type = filename.split('_')[-2]
            model_name = "SE-ALL"
        else:
            # For other trained models
            data_type = filename.split('_')[-2]
    else:
        # For standard models, extract data type from filename
        filename = os.path.basename(file_path)
        data_type = filename.split('_')[1] if '_' in filename else 'unknown'

    model_counts[model_name] += 1
    
    # Load predictions and calculate ROC-AUC
    df = pd.read_csv(file_path)
    if 'prediction' in df.columns and 'label' in df.columns:
        try:
            auc = roc_auc_score(df['label'], df['prediction'])
            results.append({
                'model': model_name,
                'data_type': data_type,
                'roc_auc': round(auc, 4),
                'file_path': file_path
            })
        except Exception as e:
            print(f"Error calculating ROC-AUC for {file_path}: {e}")
    else:
        print(f"Missing required columns in {file_path}")

# Create and save results dataframe
results_df = pd.DataFrame(results)
# Sort by data_type and then by roc_auc
results_df = results_df.sort_values(by=['data_type', 'roc_auc'], ascending=[False, False])
results_df.to_csv("results/model_performance_roc_auc.csv", index=False)
print(f"Saved results for {len(results)} files")

# Add ROC-AUC values to the base table
base_table = pd.read_excel("results/base_table_moe.xlsx")

for i, row in results_df.iterrows():
    for j, row_base in base_table.iterrows():
        if row['model'].lower() == row_base['Model'].lower() and row['data_type'].lower() == row_base['Dataset'].lower():
         base_table.at[j, 'roc_auc'] = row['roc_auc']

base_table.to_excel("results/base_table_final.xlsx", index=False)
# Display the updated table
print(base_table.head())
results_df.head() if not results_df.empty else "No results calculated"

print(model_counts)

Duplicated paths: {'results\\BERT-base\\BERT-base_all_False_predictions.csv': 2, 'results\\BERT-base\\BERT-base_AUTOIMMUNE_False_predictions.csv': 2, 'results\\BERT-base\\BERT-base_CANCER_False_predictions.csv': 2, 'results\\BERT-base\\BERT-base_COPD_False_predictions.csv': 2, 'results\\BERT-base\\BERT-base_CVD_False_predictions.csv': 2, 'results\\BERT-base\\BERT-base_PARASITIC_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_all_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_AUTOIMMUNE_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_CANCER_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_COPD_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_CVD_False_predictions.csv': 2, 'results\\BERT-large\\BERT-large_PARASITIC_False_predictions.csv': 2, 'results\\BioBERT\\BioBERT_all_False_predictions.csv': 2, 'results\\BioBERT\\BioBERT_AUTOIMMUNE_False_predictions.csv': 2, 'results\\BioBERT\\BioBERT_CANCER_False_predictions.csv': 2, 'result