In [1]:
# notebooks/phase2_ablation_analysis.ipynb

import pandas as pd
import wandb
from pathlib import Path

# Notebook-wide pandas display settings: floats are shown with 4 decimal
# places and up to 100 rows are rendered before pandas truncates the output.
DISPLAY_OPTIONS = {
    'display.precision': 4,
    'display.max_rows': 100,
}
for option_name, option_value in DISPLAY_OPTIONS.items():
    pd.set_option(option_name, option_value)



In [None]:
# --- 1. Fetch Data from W&B ---
print("Fetching ablation suite runs from Weights & Biases...")

# Server-side selection criteria for the ablation suite:
#   * job_type      -> only runs tagged as ablation jobs (per the original
#                      author's note, this is set by run_benchmark.py)
#   * config.method -> only the 'jormungandr' method, i.e. baselines excluded
#   * state         -> only runs that completed
# NOTE(review): confirm that the W&B filter language accepts the key
# "job_type" exactly as spelled here; this cannot be verified from the notebook.
run_filters = {
    "job_type": "ablation",
    "config.method": "jormungandr",
    "state": "finished",
}
project_path = "mohan-rangan-research-nosfera/Jormungandr-Semantica"

api = wandb.Api()
runs = api.runs(project_path, filters=run_filters)

run_count = len(runs)
print(f"Found {run_count} runs to analyze.")



In [None]:
# --- 2. Process Runs into a DataFrame ---
# Build one record per run:
#   * run.config  supplies the hyperparameters ("representation", "seed")
#   * run.summary supplies the final logged metrics ("ARI", "runtime_seconds")
# A key that is absent from a run yields None (via .get), which becomes a
# missing value in the DataFrame.
RESULT_COLUMNS = ["representation", "seed", "ARI", "runtime_seconds"]

summary_list = [
    {
        "representation": run.config.get("representation"),
        "seed": run.config.get("seed"),
        "ARI": run.summary.get("ARI"),
        "runtime_seconds": run.summary.get("runtime_seconds"),
    }
    for run in runs
]

# Passing `columns=` explicitly guarantees the expected schema even when the
# query matched zero runs; without it an empty list produces a column-less
# frame and any later column-based operation (e.g. sort_values) raises KeyError.
results_df = pd.DataFrame(summary_list, columns=RESULT_COLUMNS)



In [None]:
# --- 3. Data Cleaning and Initial Inspection ---
# Drop every run that has a missing value in ANY column, then order the rows
# by representation and seed so repeated seeds of one variant sit together.
n_fetched = len(results_df)
results_df = (
    results_df
    .dropna()
    .sort_values(by=["representation", "seed"])
    .reset_index(drop=True)
)

# Surface the number of discarded runs: dropping them silently would shrink
# the per-representation sample size behind the reported mean/std unnoticed.
n_dropped = n_fetched - len(results_df)
if n_dropped:
    print(f"WARNING: dropped {n_dropped} of {n_fetched} runs with missing config or summary values.")

print("\nRaw Results DataFrame:")
display(results_df)



In [None]:
# --- 4. Generate Publication-Ready Summary Table ---
print("\n--- Ablation Results: Mean ARI ± Std. Dev. ---")

# Group by 'representation' and aggregate the ARI scores across seeds.
# Note: pandas' 'std' is the sample standard deviation, so a representation
# with a single run has an undefined (NaN) standard deviation.
summary_table = results_df.groupby('representation')['ARI'].agg(['mean', 'std'])

# Preferred presentation order for the known representations.
rep_order = ['direct', 'wavelet', 'acmw']

# Reindexing blindly by rep_order would (a) silently discard any
# representation not listed there and (b) fabricate all-NaN rows for listed
# representations that have no runs. Report both situations and build the
# ordering only from representations that are actually present.
missing_reps = [rep for rep in rep_order if rep not in summary_table.index]
extra_reps = [rep for rep in summary_table.index if rep not in rep_order]
if missing_reps:
    print(f"WARNING: no runs available for expected representation(s): {missing_reps}")
if extra_reps:
    print(f"WARNING: unexpected representation(s) appended after the known ones: {extra_reps}")

ordered_reps = [rep for rep in rep_order if rep in summary_table.index] + extra_reps
summary_table = summary_table.reindex(ordered_reps)

# Format the table for the paper
formatted_table = pd.DataFrame(index=summary_table.index)
formatted_table['Mean ARI'] = summary_table['mean']
formatted_table['Std. Dev.'] = summary_table['std']
# Build the "mean ± std" strings positionally; unlike a row-wise apply, this
# also works when the summary is empty.
formatted_table['Formatted ARI'] = [
    f"{mean_ari:.4f} ± {std_ari:.4f}"
    for mean_ari, std_ari in zip(summary_table['mean'], summary_table['std'])
]

display(formatted_table)



In [None]:
# --- 5. Save the Markdown table for easy inclusion in the paper ---
# Only the pre-formatted "mean ± std" column is rendered for the paper table.
markdown_output = formatted_table[['Formatted ARI']].to_markdown()
print("\nMarkdown for Paper (Table 8.1):")
print(markdown_output)

# Save the raw data for future reference
output_path = Path("../data/phase2_ablation_results.csv")
# to_csv does not create missing parent directories, so make sure the target
# folder exists first (a no-op when it is already there).
output_path.parent.mkdir(parents=True, exist_ok=True)
results_df.to_csv(output_path, index=False)
print(f"\nRaw data for this analysis saved to {output_path}")