In [None]:
import sys, platform
# Show which interpreter this kernel is running, so outputs can be tied to an environment.
for _caption, _value in (("Python:", platform.python_version()), ("Path:", sys.executable)):
    print(_caption, _value)


In [None]:
# --- Setup & data load ---
import sys, platform, itertools
import numpy as np
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from scipy import stats

# Record which interpreter produced the results below.
for _caption, _value in (("Python:", platform.python_version()), ("Path:", sys.executable)):
    print(_caption, _value)

DATA_PATH = "../data/sim_rater_dataset.csv"  # adjust if needed
df = pd.read_csv(DATA_PATH)

# Stop right away if the CSV lacks any column the rest of the notebook reads.
required = {"item_id","annotator_id","label","category"}
missing = required.difference(df.columns)
assert not missing, f"Missing columns: {missing}"

# Labels are handled as integers from here on.
df["label"] = df["label"].astype(int)
print(
    "Rows:", len(df),
    "| Items:", df["item_id"].nunique(),
    "| Raters:", df["annotator_id"].nunique(),
)
df.head()


In [None]:
# --- Agreement metrics: pairwise Cohen's κ, Fleiss' κ, Krippendorff's α (nominal) ---
# Pivot to an items x raters matrix (rows: item_id, columns: annotator_id).
# aggfunc="first" keeps the first label when the same rater appears more than once for an item;
# item/rater combinations with no rating become NaN.
M = df.pivot_table(index="item_id", columns="annotator_id", values="label", aggfunc="first")
raters = list(M.columns)

# 1) Pairwise Cohen's κ, computed for each rater pair over the items both of them labelled
pair_rows = []
for a, b in itertools.combinations(raters, 2):
    sub = M[[a,b]].dropna()  # keep only items rated by both a and b
    # κ is left as NaN when the two raters share no items
    k = cohen_kappa_score(sub[a], sub[b]) if len(sub) > 0 else np.nan
    pair_rows.append({"rater_a": a, "rater_b": b, "cohen_kappa": k, "n_items": len(sub)})

# Pin the columns (and the κ dtype) explicitly: with fewer than two raters pair_rows is empty,
# and a bare pd.DataFrame([]) would have no "cohen_kappa" column, so the mean below and any
# later sort on that column would raise KeyError instead of yielding NaN / an empty table.
pairwise = pd.DataFrame(
    pair_rows, columns=["rater_a", "rater_b", "cohen_kappa", "n_items"]
).astype({"cohen_kappa": float})
mean_pairwise_kappa = pairwise["cohen_kappa"].mean()

# 2) Fleiss' κ, generalized to items with differing numbers of ratings.
#    Each item's agreement P_i uses that item's own rating count n_i, so counts are never
#    rescaled to a common n; when every item has the same n this is the textbook formula.
#    Every row of df is counted, so repeated (item, annotator) rows count as separate ratings.
counts = df.groupby(["item_id","label"]).size().unstack(fill_value=0).sort_index()
N = counts.shape[0]
n_per_item = counts.sum(axis=1).values
has_pair = n_per_item >= 2  # within-item agreement needs at least two ratings
if N == 0 or not has_pair.any():
    fleiss_kappa = np.nan
else:
    # Category prevalence: mean over all items of each item's label proportions
    p_j = counts.div(n_per_item, axis=0).mean(axis=0).values
    # Observed agreement: share of agreeing rating pairs within each multiply-rated item;
    # singly-rated items are skipped here instead of voiding the whole statistic
    paired_counts = counts.values[has_pair]
    n_i = n_per_item[has_pair]
    P_i = ((paired_counts**2).sum(axis=1) - n_i) / (n_i * (n_i - 1))
    P_bar = P_i.mean()
    P_e = (p_j**2).sum()
    # κ is undefined when chance agreement is already 1 (only one label occurs);
    # report NaN rather than masking the 0/0 with an epsilon in the denominator
    fleiss_kappa = (P_bar - P_e) / (1 - P_e) if P_e < 1 else np.nan

# 3) Krippendorff's α (nominal)
def kripp_alpha_nominal(matrix_items_by_raters: pd.DataFrame) -> float:
    """Compute Krippendorff's α for nominal labels, tolerating missing ratings.

    Parameters
    ----------
    matrix_items_by_raters : pd.DataFrame
        One row per item, one column per rater; a missing rating is NaN/None.

    Returns
    -------
    float
        α = 1 - D_o / D_e. Returns NaN when the matrix holds no labels at all,
        or when expected disagreement is zero (no item has two or more ratings,
        or only one category occurs among the pairable ratings).
    """
    A = matrix_items_by_raters.values  # shape: items x raters
    observed = ~pd.isna(A)
    # Distinct categories, taken from the non-missing cells only
    cats = np.sort(pd.unique(A[observed]))
    if len(cats) == 0:
        return np.nan
    idx = {c: i for i, c in enumerate(cats)}

    # Coincidence matrix: each item with m >= 2 ratings contributes its ordered
    # label pairs weighted by 1/(m - 1), so the item adds exactly m to the total
    # regardless of how many raters labelled it.
    C = np.zeros((len(cats), len(cats)), dtype=float)
    for row, keep in zip(A, observed):
        vals = row[keep]
        m = len(vals)
        if m < 2:
            continue  # a lone rating cannot be paired with anything
        v = np.zeros(len(cats), dtype=float)
        for val in vals:
            v[idx[val]] += 1
        # outer(v, v) counts all ordered pairs; removing diag(v) drops the
        # pairing of each rating with itself
        C += (np.outer(v, v) - np.diag(v)) / (m - 1)

    n = C.sum()                    # total number of pairable ratings
    Do = n - np.trace(C)           # observed disagreeing coincidences
    marg = C.sum(axis=0)           # pairable ratings per category
    De = n**2 - (marg**2).sum()    # sum over c != k of n_c * n_k
    if De <= 0:
        return np.nan
    # D_o = Do / n and D_e = De / (n * (n - 1)), hence the (n - 1) factor
    return 1 - (n - 1) * Do / De

alpha = kripp_alpha_nominal(M)

# --- Bias slices: label rate by category + chi-square ---
label_of_interest = 1
# Share of rows per category whose label equals label_of_interest, highest rate first
slice_rates = (
    df["label"]
    .eq(label_of_interest)
    .groupby(df["category"])
    .mean()
    .sort_values(ascending=False)
)

# Chi-square test on the category x label contingency table;
# skipped (NaN results, dof 0) unless the table has at least 2 rows and 2 columns
ctab = pd.crosstab(df["category"], df["label"])
if min(ctab.shape) < 2:
    chi2, p, dof = (np.nan, np.nan, 0)
else:
    chi2, p, dof, exp = stats.chi2_contingency(ctab)

# --- Print a clean summary ---
print("\n=== AGREEMENT ===")
for _row in (
    f"Mean pairwise Cohen's κ: {mean_pairwise_kappa:.3f}",
    f"Fleiss' κ:                {fleiss_kappa:.3f}",
    f"Krippendorff's α:         {alpha:.3f}",
):
    print(_row)

print("\n=== TOP PAIRWISE DISAGREEMENTS (lowest κ) ===")
_weakest_pairs = pairwise.sort_values(by="cohen_kappa").iloc[:5]
display(_weakest_pairs)

print("\n=== BIAS SLICES (label=1 rate by category) ===")
display(slice_rates.to_frame("label_rate"))

print("\nChi-square across categories")
print(f"χ²={chi2:.2f}, dof={dof}, p-value={p:.4g}")

# Optional: quick “so-what” hints you can read aloud
print("\n=== TAKEAWAYS (you can say this out loud) ===")
_takeaways = []

# Pairwise κ falls into exactly one band; the first upper bound it sits below wins.
_kappa_bands = (
    (0.4, "- Pairwise κ is low; raters may interpret guidelines inconsistently."),
    (0.6, "- Pairwise κ is moderate; check slices and recent guideline changes."),
    (float("inf"), "- Pairwise κ is healthy overall; still review weak pairs/slices."),
)
if not np.isnan(mean_pairwise_kappa):
    _takeaways.append(
        next(_msg for _upper, _msg in _kappa_bands if mean_pairwise_kappa < _upper)
    )

# Each remaining hint fires only when its statistic is defined and crosses its threshold.
if not (np.isnan(fleiss_kappa) or fleiss_kappa >= 0.6):
    _takeaways.append("- Fleiss' κ below ~0.6 suggests multi-rater inconsistency worth triaging.")
if not (np.isnan(alpha) or alpha >= 0.6):
    _takeaways.append("- Krippendorff's α < 0.6: reliability guardrail breach in messy, missing-label settings.")
if not (np.isnan(p) or p >= 0.05):
    _takeaways.append("- Category label rates differ significantly (chi-square); investigate guideline clarity and rater mix for low-rate slices.")

for _hint in _takeaways:
    print(_hint)
