# research comparison (hypothesis stats)

links:
- code: `scripts/generate_research_comparison.py` (path relative to the project root)
- docs: [shapiro](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.shapiro.html), [paired t-test](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.ttest_rel.html)
- literature: [dror et al. 2018](https://aclanthology.org/P18-1128/), [spider](https://aclanthology.org/D18-1425/)


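This notebook runs the hypothesis statistics (mean/median with Shapiro–Wilk normality checks, and paired t-tests) on one selected per-item metrics file and shows the results as compact tables and plots.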

In [None]:
from pathlib import Path
import sys

import matplotlib.pyplot as plt
import pandas as pd
from IPython.display import display

# set project root for local or colab: use the current working directory when
# it contains "results", otherwise fall back to its parent directory
PROJECT_ROOT = Path.cwd()
if not (PROJECT_ROOT / "results").exists():
    PROJECT_ROOT = PROJECT_ROOT.parent

# Make the project's `scripts` package importable. The membership check keeps
# this cell idempotent: re-running it no longer appends duplicate entries.
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))
from scripts.generate_research_comparison import generate

OUT_DIR = PROJECT_ROOT / "results" / "analysis"

# choose input file for this demo: prefer the extension-constrained per-item
# metrics and fall back to the default per-item file when it is absent
PER_ITEM = OUT_DIR / "per_item_metrics_extension_constrained.csv"
if not PER_ITEM.exists():
    PER_ITEM = OUT_DIR / "per_item_metrics.csv"

# generate() is pointed at a dedicated demo folder; the cells below read their
# CSV inputs from this same directory
DEMO_OUT_DIR = OUT_DIR / "demo_stats"
summary = generate(per_item_csv=PER_ITEM, out_dir=DEMO_OUT_DIR)
print(summary)


## summary tables

In [None]:
# Load the two CSV summaries expected under DEMO_OUT_DIR.
shapiro_path = DEMO_OUT_DIR / "stats_mean_median_shapiro.csv"
ttest_path = DEMO_OUT_DIR / "stats_paired_ttests.csv"
shapiro_df = pd.read_csv(shapiro_path)
ttest_df = pd.read_csv(ttest_path)

# Show each table sorted by its natural keys, with a clean 0..n-1 index.
# Sorting is applied to the displayed copy only; the loaded frames stay as read.
for frame, sort_keys in (
    (shapiro_df, ["run_id", "metric"]),
    (ttest_df, ["comparison", "metric"]),
):
    ordered = frame.sort_values(sort_keys)
    display(ordered.reset_index(drop=True))


## how to read these stats

- mean: average metric value (for va/em/ex this is a rate).
- median: middle value across examples.
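- shapiro_p: p-value of the Shapiro–Wilk normality test; a small value (e.g. below 0.05) suggests the values are not normally distributed.
- paired t-test p_value: p-value for the null hypothesis that the mean paired difference (right minus left) is zero.
- reject_H0: value of `decision_alpha_0_05` when the difference is statistically significant at alpha=0.05.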


In [None]:
# compact interpretation from current outputs
# Split the paired t-test table into one frame per metric of interest.
ex_rows = ttest_df[ttest_df["metric"] == "ex"].copy()
va_rows = ttest_df[ttest_df["metric"] == "va"].copy()
ts_rows = ttest_df[ttest_df["metric"] == "ts"].copy()

print("ex interpretation:")
if ex_rows.empty:
    print("- no ex paired rows found")
else:
    for _, r in ex_rows.iterrows():
        diff = r["mean_diff_right_minus_left"]
        pval = r["p_value"]
        dec = r["decision_alpha_0_05"]
        if pd.isna(diff) or pd.isna(pval):
            # A missing delta or p-value means there is no test result to
            # interpret; reporting it as "not significant" would be misleading.
            print(f"- {r['comparison']}: no usable paired result (delta or p-value missing)")
        elif dec == "reject_H0":
            # Direction is only reported for significant rows.
            sign = "improved" if diff > 0 else "decreased"
            print(f"- {r['comparison']}: {sign} by {abs(diff):.3f} (p={pval:.4g}, significant)")
        else:
            print(f"- {r['comparison']}: change {diff:.3f} (p={pval:.4g}, not significant)")

print()
print("va interpretation:")
if va_rows.empty:
    print("- no va paired rows found")
else:
    # Only significant va comparisons are listed individually.
    sig = va_rows[va_rows["decision_alpha_0_05"] == "reject_H0"]
    if sig.empty:
        print("- no significant va differences in current paired comparisons")
    else:
        for _, r in sig.iterrows():
            print(f"- {r['comparison']}: va delta {r['mean_diff_right_minus_left']:.3f} (p={r['p_value']:.4g})")

print()
print("ts interpretation:")
if ts_rows.empty:
    print("- no ts paired rows found")
else:
    # ts rows can be present with zero pairs; only rows with n_pairs > 0 count
    # as having paired coverage.
    usable = ts_rows[ts_rows["n_pairs"] > 0]
    if usable.empty:
        print("- ts exists in artifacts but has no paired coverage for these comparisons")
    else:
        print("- ts has paired coverage and can be interpreted for listed rows")

print()
print("note: this notebook uses the selected per-item file shown in the summary above.")


## run-level means (va and ex)

In [None]:
# Keep only the va/ex rows, then reshape to one row per run_id with one
# column per metric holding that run's mean.
is_va_or_ex = shapiro_df["metric"].isin(["va", "ex"])
mean_df = shapiro_df.loc[is_va_or_ex].copy()
mean_pivot = mean_df.pivot(index="run_id", columns="metric", values="mean")
mean_pivot = mean_pivot.sort_index()
display(mean_pivot)

# Skip the chart entirely when there is nothing to draw.
if not mean_pivot.empty:
    ax = mean_pivot.plot.bar(figsize=(9, 4), rot=30)
    # y-axis is fixed to [0, 1]
    ax.set(ylim=(0, 1.0), ylabel="mean score", title="run-level means")
    ax.grid(alpha=0.25, axis="y")
    plt.tight_layout()
    plt.show()


## paired ex deltas

In [None]:
# Horizontal bar chart of the paired ex deltas, one bar per comparison,
# annotated with each comparison's p-value.
plot_df = ttest_df[ttest_df["metric"] == "ex"].copy()
if not plot_df.empty:
    plot_df = plot_df.sort_values("mean_diff_right_minus_left")
    # Missing p-values are treated as 1.0 so they get the non-highlight color.
    colors = ["#2a9d8f" if (p < 0.05) else "#457b9d" for p in plot_df["p_value"].fillna(1.0)]

    fig, ax = plt.subplots(figsize=(9, 4))
    ax.barh(plot_df["comparison"], plot_df["mean_diff_right_minus_left"], color=colors)
    ax.axvline(0.0, color="black", linewidth=1)
    ax.set_xlabel("delta (right - left)")
    ax.set_title("ex deltas with p<0.05 highlight")
    ax.grid(axis="x", alpha=0.25)
    # Leave horizontal room so labels beyond the outermost bars are not clipped.
    ax.margins(x=0.15)

    deltas = plot_df["mean_diff_right_minus_left"]
    pvalues = plot_df["p_value"]
    for y, (delta, pv) in enumerate(zip(deltas, pvalues)):
        label = f"p={pv:.3g}" if pd.notna(pv) else "p=na"
        if pd.isna(delta):
            # No bar is drawn for a missing delta; anchor the label at the zero line.
            ax.text(0.0, y, f"  {label}", va="center", ha="left", fontsize=9)
        elif delta < 0:
            # Negative bars extend left of zero: right-align the label so it sits
            # outside the bar tip instead of being drawn across the bar.
            ax.text(delta, y, f"{label}  ", va="center", ha="right", fontsize=9)
        else:
            ax.text(delta, y, f"  {label}", va="center", ha="left", fontsize=9)

    plt.tight_layout()
    plt.show()
else:
    print("no ex rows found in paired test output")
