# Research Comparison (Hypothesis Stats)


This notebook runs the supervisor-required analysis only:
- mean/median
- Shapiro-Wilk normality test
- paired t-test

If `results/analysis/per_item_metrics.csv` is missing, the script auto-builds it from available baseline/qlora run JSON files.


In [None]:
from pathlib import Path
import sys
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

def _find_project_root(start: Path) -> Path:
    """Return the directory that should be placed on ``sys.path``.

    Walks ``start`` and each of its ancestors and returns the first directory
    that holds ``scripts/generate_research_comparison`` (as a ``.py`` file or
    a package directory), because that is what the import below requires.
    If no ancestor matches, falls back to the ``results/``-based check:
    ``start`` when it contains ``results``, otherwise its parent.
    """
    for candidate in (start, *start.parents):
        scripts_dir = candidate / "scripts"
        if (scripts_dir / "generate_research_comparison.py").is_file() or (
            scripts_dir / "generate_research_comparison"
        ).is_dir():
            return candidate
    return start if (start / "results").exists() else start.parent


# Resolve the project root for local or Colab execution, regardless of
# whether the kernel was started in the root or in a subdirectory.
PROJECT_ROOT = _find_project_root(Path.cwd())

# Guard the append so re-running this cell does not pile up duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
from scripts.generate_research_comparison import generate

OUT_DIR = PROJECT_ROOT / "results" / "analysis"
PER_ITEM = OUT_DIR / "per_item_metrics.csv"

# Run the analysis; the later cells read the CSV files from OUT_DIR.
summary = generate(per_item_csv=PER_ITEM, out_dir=OUT_DIR)
summary


## Mean/Median + Shapiro-Wilk


In [None]:
# Load the mean/median + Shapiro-Wilk table from the analysis output directory.
shapiro_csv = OUT_DIR / "stats_mean_median_shapiro.csv"
shapiro_df = pd.read_csv(shapiro_csv)

# Display a view ordered by run and metric with a fresh 0..n-1 index;
# shapiro_df itself is left in file order.
display(shapiro_df.sort_values(by=["run_id", "metric"], ignore_index=True))


## Paired t-tests


In [None]:
# Load the paired t-test table from the analysis output directory.
ttest_csv = OUT_DIR / "stats_paired_ttests.csv"
ttest_df = pd.read_csv(ttest_csv)

# Display a view ordered by comparison and metric with a fresh 0..n-1 index;
# ttest_df itself is left in file order.
display(ttest_df.sort_values(by=["comparison", "metric"], ignore_index=True))


## Visual Check (EX mean deltas)


In [None]:
# Keep only the rows for the "ex" metric; copy so the slice is independent
# of ttest_df.
is_ex_row = ttest_df["metric"].eq("ex")
plot_df = ttest_df.loc[is_ex_row].copy()

if plot_df.empty:
    print("No EX rows found in paired t-test output.")
else:
    # One bar per comparison, showing the mean difference column.
    ex_deltas = plot_df.set_index("comparison")["mean_diff_right_minus_left"]
    ax = ex_deltas.plot.bar(figsize=(10, 4), rot=20)

    # Zero reference line so the sign of each delta is easy to read.
    ax.axhline(0.0, color="black", linewidth=1)
    ax.set(
        ylabel="Delta (right - left)",
        xlabel="Comparison",
        title="EX Delta by Comparison",
    )
    ax.grid(axis="y", alpha=0.25)

    plt.tight_layout()
    plt.show()
