In [1]:
import pandas as pd
from scipy.stats import wilcoxon
import pingouin as pg
from pathlib import Path

# Show floats in rendered tables with three decimal places
pd.options.display.precision = 3

In [2]:
# Read the exported benchmark results and reduce them, in one pass, to the
# rows and columns this analysis uses.
columns_to_keep = ["method", "dataset", "seed", "ARI"]
results_df = (
    pd.read_csv(Path("../data/phase1_results.csv"))
    # Keep only the columns needed below
    .loc[:, columns_to_keep]
    # Drop runs that have no method recorded (e.g., old verification runs)
    .dropna(subset=['method'])
    # Drop runs whose method is an empty string
    .loc[lambda runs: runs['method'] != '']
)

print(f"Successfully loaded and cleaned {len(results_df)} benchmark runs.")

print("\nDataFrame Head:")
display(results_df.head())

# Non-null counts per (dataset, method); as the last expression this is the cell's rich output
print("\nData summary:")
results_df.groupby(['dataset', 'method']).count()

Successfully loaded and cleaned 63 benchmark runs.

DataFrame Head:


Unnamed: 0,method,dataset,seed,ARI
0,hdbscan,agnews,51.0,0.727
1,hdbscan,agnews,50.0,0.675
2,hdbscan,agnews,49.0,0.688
3,hdbscan,agnews,48.0,0.67
4,hdbscan,agnews,47.0,0.703



Data summary:


Unnamed: 0_level_0,Unnamed: 1_level_0,seed,ARI
dataset,method,Unnamed: 2_level_1,Unnamed: 3_level_1
20newsgroups,bertopic,11,11
20newsgroups,hdbscan,11,11
20newsgroups,jormungandr,11,11
agnews,bertopic,10,10
agnews,hdbscan,10,10
agnews,jormungandr,10,10


In [5]:
# Paired significance tests: Jörmungandr against each baseline, per dataset.
#
# The pivot yields one row per (dataset, seed) with one ARI column per method,
# so values on the same row form a matched pair (same dataset, same seed).
# Note: pivot_table aggregates with the mean by default, so duplicate
# (dataset, seed, method) rows would be averaged silently.
pivot_df = results_df.pivot_table(index=['dataset', 'seed'], columns='method', values='ARI').reset_index()

# Method under evaluation and the baselines it is compared to: (column name, display label)
proposed_method = 'jormungandr'
baseline_methods = [('bertopic', 'BERTopic'), ('hdbscan', 'HDBSCAN')]

print("--- Statistical Significance Report ---")
for dataset in pivot_df['dataset'].unique():
    print(f"\n===== DATASET: {dataset.upper()} =====")
    dataset_df = pivot_df[pivot_df['dataset'] == dataset]

    for baseline_column, baseline_label in baseline_methods:
        # A seed that is missing for either method shows up as NaN in the pivot.
        # scipy's wilcoxon propagates NaN by default, which would make the
        # p-value NaN, so keep only complete pairs and say so when any are dropped.
        pairs = dataset_df[[proposed_method, baseline_column]].dropna()
        n_dropped = len(dataset_df) - len(pairs)
        if n_dropped:
            print(f"(note: ignoring {n_dropped} seed(s) without a score for both methods)")
        if pairs.empty:
            print(f"Jörmungandr vs. {baseline_label}: no complete pairs, test skipped")
            continue

        # Wilcoxon signed-rank test on the matched pairs (statistic itself is not reported)
        _, p_value = wilcoxon(pairs[proposed_method], pairs[baseline_column])

        # The design is paired, so declare it: the effect size is then computed
        # under the same assumption as the significance test above.
        cohens_d = pg.compute_effsize(
            pairs[proposed_method], pairs[baseline_column], paired=True, eftype='cohen'
        )
        print(f"Jörmungandr vs. {baseline_label}: p-value = {p_value:.4f}, Cohen's d = {cohens_d:.3f}")

--- Statistical Significance Report ---

===== DATASET: 20NEWSGROUPS =====
Jörmungandr vs. BERTopic: p-value = 0.0020, Cohen's d = 1.975
Jörmungandr vs. HDBSCAN: p-value = 0.0020, Cohen's d = 4.011

===== DATASET: AGNEWS =====
Jörmungandr vs. BERTopic: p-value = 0.0020, Cohen's d = 1.975
Jörmungandr vs. HDBSCAN: p-value = 0.0020, Cohen's d = 4.011


In [9]:
# Core results table: mean ± standard deviation of ARI per dataset and method.

# Aggregate ARI per (dataset, method), then move 'method' into the columns.
# The resulting columns are a two-level MultiIndex keyed as (statistic, method).
summary_table = (
    results_df.groupby(['dataset', 'method'])['ARI']
    .agg(['mean', 'std'])
    .unstack(level='method')
)

# Row labels (datasets) and the column order used for the published table
datasets = summary_table.index.unique()
methods_in_order = ['jormungandr', 'bertopic', 'hdbscan']

# Build every "mean ± std" cell in one go: one list of strings per method column
formatted_table = pd.DataFrame(
    {
        method: [
            f"{summary_table.loc[dataset, ('mean', method)]:.3f} ± {summary_table.loc[dataset, ('std', method)]:.3f}"
            for dataset in datasets
        ]
        for method in methods_in_order
    },
    index=datasets,
    columns=methods_in_order,
    dtype=object,
)

print("\n--- Core Results Table (Mean ARI ± Std. Dev.) ---")
markdown_table = formatted_table.to_markdown()
print(markdown_table)


--- Core Results Table (Mean ARI ± Std. Dev.) ---
| dataset      | jormungandr   | bertopic      | hdbscan       |
|:-------------|:--------------|:--------------|:--------------|
| 20newsgroups | 0.796 ± 0.024 | 0.750 ± 0.023 | 0.696 ± 0.024 |
| agnews       | 0.798 ± 0.025 | 0.750 ± 0.024 | 0.698 ± 0.025 |
