<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/2025-insights/Recall25_stats.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recall 2nd analysis (0818~)

Data source: Drive > Research > Recall25 > recalldata.csv

In [None]:
# --- Colab: Wilcoxon pre–post within Level=UG/GRAD ---

import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, norm

# ---------- Load ----------
csv_path = "/content/recalldata.csv"   # <-- change to your file
df = pd.read_csv(csv_path)
# Trim surrounding whitespace from every column name before any lookup.
df.columns = [name.strip() for name in df.columns]

# ---------- Cohort from Level ----------
# Level is expected to hold UG / GRAD already; the alias table folds the
# common spelled-out variants into those two labels.
level_aliases = {
    "UNDERGRAD": "UG",
    "UNDERGRADUATE": "UG",
    "U": "UG",
    "GRADUATE": "GRAD",
    "G": "GRAD",
}
level_clean = df["Level"].astype(str).str.strip().str.upper()
df["Cohort"] = level_clean.replace(level_aliases)
print("Cohort counts:\n", df["Cohort"].value_counts(dropna=False))

# ---------- Likert to numeric ----------
# Q1..Q6; anything that cannot be parsed as a number becomes NaN.
likert = [f"Q{i}" for i in range(1, 7)]
for c in likert:
    df[c] = pd.to_numeric(df[c], errors="coerce")

# ---------- Helpers ----------
def iqr(s):
    """Return the first and third quartiles of ``s`` as a ``(q1, q3)`` tuple.

    ``s`` must provide a pandas-style ``quantile`` method.
    """
    lower = s.quantile(0.25)
    upper = s.quantile(0.75)
    return lower, upper

def rb_from_W(W, n_eff):
    """Convert a signed-rank sum into a rank-biserial correlation.

    Computes ``2 * W / T - 1`` with ``T = n_eff * (n_eff + 1) / 2``, the total
    of the ranks ``1..n_eff``. When ``W`` is the sum of the positive ranks
    (W+), this equals ``(W+ - W-) / T`` because ``W+ + W- = T``.

    NOTE(review): the two-sided statistic reported by ``scipy.stats.wilcoxon``
    is documented as the smaller of the two rank sums, so passing it here can
    never give a positive value - confirm which ``W`` callers intend to pass.

    Args:
        W: Rank sum to convert.
        n_eff: Number of non-zero paired differences that were ranked.

    Returns:
        The rank-biserial correlation, or NaN when ``n_eff`` is not positive
        (no ranked pairs, so the ratio is undefined).
    """
    # Without ranked pairs T is 0 and the division would raise; return NaN
    # like the other helpers do when there is nothing to analyse.
    if n_eff <= 0:
        return np.nan
    T = n_eff * (n_eff + 1) / 2.0
    return (2 * W / T) - 1.0

def bootstrap_ci_rb(pre, post, n_boot=5000, seed=123):
    """Percentile-bootstrap 95% interval for the paired rank-biserial correlation.

    Pairs whose difference is zero are dropped first. Each replicate resamples
    the remaining pairs with replacement, re-ranks the absolute differences
    (average ranks for ties) and computes ``(W+ - W-) / T`` with
    ``T = n * (n + 1) / 2``. The sign is positive when ``post > pre``.

    Args:
        pre: Paired "before" scores (array-like, same length as ``post``).
        post: Paired "after" scores.
        n_boot: Number of bootstrap replicates.
        seed: Seed for ``numpy.random.default_rng``; a fixed seed makes the
            interval reproducible.

    Returns:
        ``(lo, hi)``: the 2.5th and 97.5th percentiles of the replicates, or
        ``(nan, nan)`` when no pair has a non-zero difference.
    """
    # Coerce up front so lists and pandas Series (whatever their index) are
    # handled positionally; a Series would otherwise be indexed by label in
    # the resampling step below.
    pre = np.asarray(pre, dtype=float)
    post = np.asarray(post, dtype=float)

    diffs = post - pre
    diffs = diffs[diffs != 0]
    n = len(diffs)
    if n == 0:
        return np.nan, np.nan

    rng = np.random.default_rng(seed)
    # No zero differences remain, so every replicate ranks exactly n values
    # and T is the same for all of them.
    T = n * (n + 1) / 2.0
    vals = np.empty(n_boot)
    for b in range(n_boot):
        idx = rng.integers(0, n, n)
        d = diffs[idx]
        ranks = pd.Series(np.abs(d)).rank(method="average").to_numpy()
        W_plus = ranks[d > 0].sum()
        W_minus = ranks[d < 0].sum()
        vals[b] = (W_plus - W_minus) / T
    lo, hi = np.percentile(vals, [2.5, 97.5])
    return float(lo), float(hi)

def describe(x):
    """Summarise ``x`` after dropping missing values.

    Returns a dict with keys ``n``, ``median``, ``q1``, ``q3``, ``mean`` and
    ``sd`` (sample standard deviation, ``ddof=1``). When nothing is left after
    dropping missing values, ``n`` is 0 and every statistic is NaN.
    """
    values = pd.Series(x).dropna()
    count = len(values)
    if count == 0:
        blanks = dict.fromkeys(("median", "q1", "q3", "mean", "sd"), np.nan)
        return {"n": 0, **blanks}
    return {
        "n": count,
        "median": float(values.median()),
        "q1": float(values.quantile(0.25)),
        "q3": float(values.quantile(0.75)),
        "mean": float(values.mean()),
        "sd": float(values.std(ddof=1)),
    }

def analyze_pair(df_cohort, pre_col, post_col, construct, cohort):
    """Run a paired Wilcoxon signed-rank test on one pre/post item pair.

    Rows with a missing value in either column are dropped, so the pre and
    post summaries describe the same respondents.

    Args:
        df_cohort: DataFrame holding the rows of one cohort.
        pre_col: Name of the "before" column.
        post_col: Name of the "after" column.
        construct: Label copied into the result row.
        cohort: Label copied into the result row.

    Returns:
        A dict (one summary row) with ``cohort``, ``construct``, the
        pre/post ``n``, median, Q1 and Q3, the Wilcoxon statistic ``W`` and
        two-sided ``p_value``, the rank-biserial correlation ``r_rb`` with its
        bootstrap interval (``r_rb_CI_lo``, ``r_rb_CI_hi``), and ``r_from_Z``
        (``sign(r_rb) * z / sqrt(n_eff)`` with ``z`` recovered from the
        two-sided p-value). Effect sizes are positive when post > pre.
        Every numeric field is NaN when no complete pair exists; the test
        fields are NaN when all complete pairs are tied.
    """
    sub = df_cohort[[pre_col, post_col]].dropna().astype(float)
    pre = sub[pre_col].to_numpy()
    post = sub[post_col].to_numpy()

    pre_d, post_d = describe(pre), describe(post)
    if len(sub) == 0:
        nan_fields = [
            "pre_n", "pre_median", "pre_Q1", "pre_Q3",
            "post_n", "post_median", "post_Q1", "post_Q3",
            "W", "p_value", "r_rb", "r_rb_CI_lo", "r_rb_CI_hi", "r_from_Z",
        ]
        return {"cohort": cohort, "construct": construct,
                **{k: np.nan for k in nan_fields}}

    # Non-zero differences are the only pairs the signed-rank test can rank.
    d = post - pre
    d = d[d != 0]
    n_eff = int(len(d))

    # Defaults cover the "every pair tied" case: with zero_method="wilcox"
    # there is nothing left to rank (SciPy releases that reject all-zero
    # differences raise here), so the test is skipped rather than attempted.
    W = p = r_rb = r_lo = r_hi = r_from_Z = np.nan
    if n_eff > 0:
        # p-value (two-sided). Order as post, pre so "improvement" is post > pre
        res = wilcoxon(post, pre, zero_method="wilcox",
                       alternative="two-sided", method="auto")
        W = float(res.statistic)
        p = float(res.pvalue)

        # Effect size from signed ranks
        ranks = pd.Series(np.abs(d)).rank(method="average").to_numpy()
        W_plus = ranks[d > 0].sum()
        W_minus = ranks[d < 0].sum()
        T = n_eff * (n_eff + 1) / 2.0
        r_rb = (W_plus - W_minus) / T                 # positive if post > pre
        r_lo, r_hi = bootstrap_ci_rb(pre, post)       # same orientation

        # Recover |z| from the two-sided p-value. p == 1 maps to z == 0 and is
        # a valid (null) effect; p == 0 would give an infinite z and is left NaN.
        z_abs = norm.isf(p / 2.0) if 0 < p <= 1 else np.nan
        if np.isfinite(z_abs):
            r_from_Z = float(np.sign(r_rb) * z_abs / np.sqrt(n_eff))

    return {
        "cohort": cohort, "construct": construct,
        "pre_n": pre_d["n"], "pre_median": pre_d["median"],
        "pre_Q1": pre_d["q1"], "pre_Q3": pre_d["q3"],
        "post_n": post_d["n"], "post_median": post_d["median"],
        "post_Q1": post_d["q1"], "post_Q3": post_d["q3"],
        "W": W, "p_value": p, "r_rb": r_rb,
        "r_rb_CI_lo": r_lo, "r_rb_CI_hi": r_hi, "r_from_Z": r_from_Z,
    }

# ---------- Run analyses ----------
# Each construct maps to its (pre, post) item columns.
constructs = {
    "Digital confidence (DC)": ("Q1", "Q4"),
    "Perceived teaching benefit (PTB)": ("Q2", "Q5"),
    "Engagement & motivation (E&M)": ("Q3", "Q6"),
}

# One result row per cohort x construct.
rows = []
for cohort in ["UG", "GRAD"]:
    g = df[df["Cohort"] == cohort]
    for name, (pre_col, post_col) in constructs.items():
        rows.append(analyze_pair(g, pre_col, post_col, name, cohort))

summary = pd.DataFrame(rows)
summary.to_csv("/content/survey_wilcoxon_summary.csv", index=False)
print("Saved to /content/survey_wilcoxon_summary.csv")

# Keep this bare expression last: a notebook cell only renders the value of
# its final expression, so placed mid-cell it displayed nothing.
summary
