<a href="https://colab.research.google.com/github/MK316/Workingpapers/blob/main/2025-insights/recall25_stats_figures0819.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recall 2nd analysis (0818~)

Data source: Drive > Research > Recall25 > recalldata.csv

# [0] Descriptive stats

In [None]:
# --- Descriptives for Q1~Q6 (overall and by Level) ---

import pandas as pd
import numpy as np

# Location of the survey export; edit this when the file lives elsewhere.
csv_path = "/content/recalldata.csv"
df = pd.read_csv(csv_path)
# Trim whitespace around every header name before any column lookup.
df.columns = list(map(str.strip, df.columns))

# The six Likert items summarised in this cell: Q1 ... Q6.
items = ["Q" + str(number) for number in range(1, 7)]

# Turn each item into numbers; values that cannot be parsed become NaN.
for item in items:
    if item in df.columns:
        df[item] = pd.to_numeric(df[item], errors="coerce")
    else:
        raise ValueError(f"Column '{item}' not found in the CSV.")

# ---------- helpers ----------
def summarize(series: pd.Series) -> dict:
    """Return descriptive statistics for one column, ignoring missing values.

    Args:
        series: Numeric responses; NaN entries are dropped before computing.

    Returns:
        A dict with keys ``n``, ``mean``, ``sd`` (sample SD, ``ddof=1``),
        ``median``, ``q1``, ``q3``, ``iqr``, ``min`` and ``max``. When no
        non-missing value remains, ``n`` is 0 and every other entry is NaN.
    """
    valid = series.dropna()

    if valid.empty:
        blank = {"n": 0}
        for key in ("mean", "sd", "median", "q1", "q3", "iqr", "min", "max"):
            blank[key] = np.nan
        return blank

    lower, upper = (valid.quantile(p) for p in (0.25, 0.75))
    return {
        "n": int(valid.size),
        "mean": float(valid.mean()),
        "sd": float(valid.std(ddof=1)),
        "median": float(valid.median()),
        "q1": float(lower),
        "q3": float(upper),
        "iqr": float(upper - lower),
        "min": float(valid.min()),
        "max": float(valid.max()),
    }

def build_summary(frame: pd.DataFrame, label: str | None = None) -> pd.DataFrame:
    """Build a one-row-per-item table of descriptive statistics.

    Args:
        frame: Data holding one column per entry of the module-level ``items``.
        label: Optional group name; when given it is stored in a ``Level``
            column on every row.

    Returns:
        A DataFrame whose columns follow the order item, Level (only when
        ``label`` is given), n, mean, sd, median, q1, q3, iqr, min, max.
    """
    # Only tag rows with a Level when the caller supplied one.
    tag = {} if label is None else {"Level": label}
    records = [{**summarize(frame[name]), "item": name, **tag} for name in items]
    table = pd.DataFrame(records)

    # Put the identifying columns first, then the statistics.
    preferred = ("item", "Level", "n", "mean", "sd", "median",
                 "q1", "q3", "iqr", "min", "max")
    return table[[name for name in preferred if name in table.columns]]

# ---------- overall ----------
# Whole-sample descriptives, rounded to two decimals for reporting.
overall_summary = build_summary(df)
overall_summary = overall_summary.round(2)
print("Overall descriptives (Q1–Q6):")
display(overall_summary)

# ---------- by Level ----------
# The grouped table needs a Level column; stop early when it is absent.
if "Level" not in df.columns:
    raise ValueError("Column 'Level' not found. Add/rename it to use by-Level summaries.")

# Group on the string form of Level and stack one summary table per group.
level_key = df["Level"].astype(str)
per_level_tables = [build_summary(rows, name) for name, rows in df.groupby(level_key)]
by_level = pd.concat(per_level_tables).round(2).reset_index(drop=True)

print("\nDescriptives by Level (Q1–Q6):")
display(by_level)

# ---------- optional: Likert distributions ----------
def likert_counts(frame: pd.DataFrame, group_col: str | None = None):
    """Tabulate how often each response option 1..6 was chosen per item.

    Args:
        frame: Data holding one column per entry of the module-level ``items``.
        group_col: Optional column to split by. When given, one row is
            produced per (group, item) pair and a ``Level`` column holds the
            group value.

    Returns:
        A DataFrame with ``item``, ``count_1``..``count_6`` and
        ``pct_1``..``pct_6`` (proportions of the counted responses), plus a
        leading ``Level`` column in the grouped form.
    """
    # NOTE(review): proportions are rounded to 4 decimals in the ungrouped
    # table but left unrounded in the grouped one - confirm this is intended.
    levels = [1, 2, 3, 4, 5, 6]
    count_cols = [f"count_{i}" for i in levels]
    pct_cols = [f"pct_{i}" for i in levels]

    if group_col is not None:
        records = []
        for lvl, chunk in frame.groupby(group_col):
            for col in items:
                tally = chunk[col].value_counts().reindex(levels, fill_value=0)
                answered = tally.sum()
                record = {"Level": lvl, "item": col}
                for i in levels:
                    record[f"count_{i}"] = int(tally.get(i, 0))
                for i in levels:
                    # No responses at all -> proportion is undefined.
                    if answered:
                        record[f"pct_{i}"] = tally.get(i, 0) / answered
                    else:
                        record[f"pct_{i}"] = np.nan
                records.append(record)
        return pd.DataFrame(records)

    # Ungrouped: one tally per item, stacked so that items become rows.
    per_item = {
        col: frame[col].value_counts().reindex(levels, fill_value=0)
        for col in items
    }
    wide = pd.concat(per_item, axis=1).T
    wide.index.name = "item"
    wide = wide.reset_index().rename(columns=dict(zip(levels, count_cols)))

    # Row-wise proportions of the counted responses.
    counts_only = wide[count_cols]
    wide[pct_cols] = counts_only.div(counts_only.sum(axis=1), axis=0).round(4)
    return wide

# Response-option tallies: first for the whole sample, then split by Level.
likert_overall = likert_counts(df)
likert_byLevel = likert_counts(df, group_col="Level")

# ---------- save outputs ----------
# Pair every table with its destination so writing and reporting share one list.
outputs = [
    ("/content/descriptives_overall_Q1_Q6.csv", overall_summary),
    ("/content/descriptives_byLevel_Q1_Q6.csv", by_level),
    ("/content/likert_counts_overall_Q1_Q6.csv", likert_overall),
    ("/content/likert_counts_byLevel_Q1_Q6.csv", likert_byLevel),
]
for destination, table in outputs:
    table.to_csv(destination, index=False)

# Report the paths only after every file has been written.
print("\nSaved files:")
for destination, _ in outputs:
    print(destination)


The user wants to combine two descriptive statistics tables (overall and by level) into a single, cohesive table.
The user also specifies the need for the table to be in APA style.
The tables provided show the following metrics:

n: number of participants

mean: mean score

sd: standard deviation

median: median score

q1: first quartile

q3: third quartile

iqr: interquartile range

min: minimum score

max: maximum score

Here's how to structure the combined table in APA style:

Table Number and Title: An APA table needs a table number (e.g., Table 1) and a descriptive title (in italics).

Layout: The table should be clear and easy to read. Use horizontal lines to separate the table title from the header and the main body from the note, but avoid vertical lines.

Grouping: The most logical way to combine these is to have a main column for the item (Q1-Q6), and then sub-columns for each group: Overall, Undergraduate (UG), and Graduate (GRAD).

Metrics: Within each group's column, present the key statistics. The most common and useful for a quick summary are n, Mean, and SD. The other metrics (Median, Q1, Q3, etc.) are also important but can be simplified or placed in a note if space is a concern. For a comprehensive table, it's best to include Median and IQR as well, as they are less sensitive to outliers, which is important for Likert scale data.

Notation: Use standard APA notation. M for Mean, SD for Standard Deviation, n for sample size.

Note: An APA table often includes a note at the bottom to provide additional context or clarify abbreviations. This is a good place to define the scale and any other relevant details.

Here is the combined table following these principles.

Table for Combined Data (APA Style)
Table 1
Descriptive Statistics for Pre- and Post-Survey Likert Items, Overall and by Level

| Item | Overall | Undergraduate (UG) | Graduate (GRAD) |
| :--- | :---: | :---: | :---: |
| | n = 34 | n = 18 | n = 16 |
| | M (SD) | M (SD) | M (SD) |
| Q1 | 2.94 (1.30) | 2.33 (1.08) | 3.62 (1.20) |
| Q2 | 4.29 (1.09) | 4.61 (1.04) | 3.94 (1.06) |
| Q3 | 4.74 (1.02) | 4.56 (1.10) | 4.94 (0.93) |
| Q4 | 4.00 (1.28) | 4.33 (1.03) | 3.62 (1.45) |
| Q5 | 5.29 (0.80) | 5.33 (0.69) | 5.25 (0.93) |
| Q6 | 5.00 (0.95) | 5.11 (0.83) | 4.88 (1.09) |

## Basic plots

In [None]:
#@markdown Single plot --- Overall boxplots for Q1–Q6 ---

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Path to your CSV
csv_path = "/content/recalldata.csv"  # <- keep or change

# Load and tidy
df = pd.read_csv(csv_path)
# Strip stray whitespace from the header names, as the other cells in this
# notebook do, so a column exported as "Q1 " is still found as "Q1".
df.columns = [c.strip() for c in df.columns]
items = [f"Q{i}" for i in range(1, 7)]
for c in items:
    # Fail with a readable message instead of a bare KeyError.
    if c not in df.columns:
        raise ValueError(f"Column '{c}' not found in the CSV.")
    # Unparsable entries (e.g. blanks) become NaN and are dropped below.
    df[c] = pd.to_numeric(df[c], errors="coerce")

# Long form for convenience (optional)
long = df[items].melt(var_name="Item", value_name="Score").dropna()

# Labels to show under each box
label_map = {
    "Q1": "Q1: DC (pre)",
    "Q2": "Q2: PTB (pre)",
    "Q3": "Q3: E&M (pre)",
    "Q4": "Q4: DC (post)",
    "Q5": "Q5: PTB (post)",
    "Q6": "Q6: E&M (post)",
}
order = items
# One array of non-missing scores per item, in plotting order.
data = [long.loc[long["Item"] == k, "Score"].to_numpy() for k in order]

# --- Plot
plt.figure(figsize=(12, 5), dpi=150)
bp = plt.boxplot(
    data,
    showmeans=True,         # show mean markers
    meanline=False,
    showfliers=False,       # hide outliers for cleaner look on small N
    notch=True,
    widths=0.6,
)
# Label the boxes through the tick API rather than boxplot's `labels=`
# keyword, which Matplotlib 3.9 deprecated in favour of `tick_labels`; this
# form works on both older and newer releases. Boxes sit at x = 1..len(order).
plt.xticks(range(1, len(order) + 1), [label_map[k] for k in order])

# Axes formatting
plt.ylim(1, 6.5)                          # Likert range
plt.yticks([1, 2, 3, 4, 5, 6])
plt.ylabel("Likert score (1–6)", fontsize=14)
plt.title("Overall distributions for Q1–Q6")
plt.grid(axis="y", alpha=0.3)

# Optional: overlay jittered points for visibility
rng = np.random.default_rng(42)  # fixed seed keeps the jitter reproducible
for i, y in enumerate(data, start=1):
    x = np.full_like(y, i, dtype=float) + rng.uniform(-0.12, 0.12, size=y.size)
    plt.scatter(x, y, s=24, alpha=0.5, edgecolor="none")

plt.tight_layout()
plt.savefig("/content/boxplots_overall_Q1_Q6.png", dpi=300, bbox_inches="tight")
plt.show()

print("Saved figure to /content/boxplots_overall_Q1_Q6.png")


## Two plots in one area

## Order by DC, PTB, E&M

In [None]:
#@markdown Final Two-panel boxplots (left = overall, right = by Level)
# with item order Q1, Q4, Q2, Q5, Q3, Q6 and two-line x-axis labels.

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.patches import Patch

# --------- Config ---------
csv_path = "/content/recalldata.csv"  # survey export to read
# Plotting order: each pre item is followed directly by its post item.
items_order = ["Q1", "Q4", "Q2", "Q5", "Q3", "Q6"]

# Font sizes for the title, axis label, tick labels and legend.
TITLE_FS = 16
LABEL_FS = 14
TICK_FS = 14
LEGEND_FS = 14

# Two-line tick labels: item code on top, construct underneath.
label_map = {
    "Q1": "Q1\n(pre-DC)",
    "Q4": "Q4\n(post-DC)",
    "Q2": "Q2\n(pre-PTB)",
    "Q5": "Q5\n(post-PTB)",
    "Q3": "Q3\n(pre-E&M)",
    "Q6": "Q6\n(post-E&M)",
}

# Colorblind-friendly palette (Okabe–Ito)
palette = ["#0072B2", "#D55E00", "#009E73", "#CC79A7", "#E69F00", "#56B4E9"]

# --------- Load & prepare ---------
df = pd.read_csv(csv_path)
df.columns = [name.strip() for name in df.columns]

# Every plotted item must exist; convert it to numbers (unparsable -> NaN).
for item in items_order:
    if item in df.columns:
        df[item] = pd.to_numeric(df[item], errors="coerce")
    else:
        raise ValueError(f"Column '{item}' not found in the CSV.")

# The right-hand panel groups by Level, so the column is mandatory.
if "Level" not in df.columns:
    raise ValueError("Column 'Level' not found.")
df["Level"] = df["Level"].astype(str).str.strip()

# Level ordering: UG and GRAD lead when present; any other level follows in
# order of first appearance.
observed = list(df["Level"].unique())
levels_order = [lab for lab in ["UG", "GRAD"] if lab in observed]
levels_order += [lab for lab in observed if lab not in levels_order]

# Non-missing scores per item, for the whole sample and for each level.
overall_data = [df[q].dropna().to_numpy() for q in items_order]
group_data = {}
for lv in levels_order:
    in_level = df["Level"] == lv
    group_data[lv] = [df.loc[in_level, q].dropna().to_numpy() for q in items_order]

# One palette colour per level, cycling if there are more levels than colours.
level_colors = {lv: palette[i % len(palette)] for i, lv in enumerate(levels_order)}

# --------- Plot ---------
# Turn off tight_layout to avoid auto-squash with multi-line labels; adjust margins manually.
plt.rcParams["figure.autolayout"] = False
fig, (ax_left, ax_right) = plt.subplots(1, 2, figsize=(15, 6), dpi=300, sharey=True)

# === Left: Overall ===
bp_left = ax_left.boxplot(
    overall_data,
    showmeans=True, notch=True, widths=0.6, showfliers=False, patch_artist=True
)
# Label the boxes through the tick API (the same way the right panel does)
# rather than boxplot's `labels=` keyword, which Matplotlib 3.9 deprecated in
# favour of `tick_labels`. Boxes sit at x = 1..len(items_order).
ax_left.set_xticks(np.arange(1, len(items_order) + 1))
ax_left.set_xticklabels([label_map.get(k, k) for k in items_order])

# Style (neutral)
for box in bp_left['boxes']:
    box.set(facecolor="#FFFFFF", edgecolor="#555555", linewidth=1.2)
for element in ['whiskers', 'caps', 'medians']:
    for line in bp_left[element]:
        line.set(color="#555555", linewidth=1.2)
for mean in bp_left['means']:
    mean.set(marker='D', markersize=4, markerfacecolor="#555555", markeredgecolor="white")

# The y-axis is shared between both panels (sharey=True), and the right panel
# sets 0.5-6.5 later on, which overrode the (1, 6) previously requested here.
# Ask for the effective range directly so the code matches the rendered figure.
ax_left.set_ylim(0.5, 6.5)
ax_left.set_yticks([1, 2, 3, 4, 5, 6])
ax_left.set_ylabel("Likert score (1–6)", fontsize=LABEL_FS)
ax_left.set_title("a. Overall distributions", fontsize=TITLE_FS)
ax_left.tick_params(axis="both", labelsize=TICK_FS)
ax_left.tick_params(axis="x", pad=10)  # pad for 2-line labels
ax_left.grid(axis="y", alpha=0.3)

# Optional jitter on left
rng = np.random.default_rng(42)  # fixed seed keeps the jitter reproducible
for i, y in enumerate(overall_data, start=1):
    # Skip items without any response; there is nothing to scatter.
    if len(y):
        x = np.full_like(y, i, dtype=float) + rng.uniform(-0.12, 0.12, size=y.size)
        ax_left.scatter(x, y, s=18, alpha=0.45, color="#555555", edgecolor="none")

# === Right: Grouped by Level ===
# One cluster of boxes per item, centred on x = 1..len(items_order); each
# level's box is shifted sideways inside its cluster.
base_positions = np.arange(1, len(items_order) + 1, dtype=float)
group_count = max(1, len(levels_order))
# Half-spread of a cluster and width of each box; two levels get wider boxes
# placed closer together than three or more.
offset = 0.18 if group_count == 2 else 0.25
width  = 0.32 if group_count == 2 else 0.22

legend_handles = []
for k, lv in enumerate(levels_order):
    if group_count == 1:
        # A single level needs no shift: draw on the item positions.
        pos = base_positions
    else:
        # Spread the levels evenly from -offset (k = 0) to +offset (last k).
        shift = (k - (group_count - 1) / 2) * (2 * offset / (group_count - 1))
        pos = base_positions + shift

    bp = ax_right.boxplot(
        group_data[lv],
        positions=pos,
        widths=width,
        notch=True,
        showmeans=True,
        showfliers=False,
        patch_artist=True
    )
    col = level_colors[lv]

    # Colour every part of this level's boxes with the level colour; medians
    # stay dark and means are drawn as white-filled diamonds.
    for box in bp['boxes']:
        box.set(facecolor=col, edgecolor=col, linewidth=1.6, alpha=0.70)
    for line in bp['whiskers'] + bp['caps']:
        line.set(color=col, linewidth=1.4)
    for med in bp['medians']:
        med.set(color="#222222", linewidth=1.6)
    for mean in bp['means']:
        mean.set(marker='D', markersize=4, markerfacecolor="white", markeredgecolor=col)

    # Jitter per level (colored); `rng` is the generator created for the left
    # panel, so the draws here continue that same random stream.
    for j, y in enumerate(group_data[lv], start=0):
        # Items without responses in this level have nothing to scatter.
        if len(y):
            x = np.full_like(y, pos[j], dtype=float) + rng.uniform(-width/3, width/3, size=y.size)
            ax_right.scatter(x, y, s=16, alpha=0.55, color=col, edgecolor="white", linewidths=0.4)

    # Legend entry that matches the box fill of this level.
    legend_handles.append(Patch(facecolor=col, edgecolor=col, alpha=0.70, label=str(lv)))

# Put one tick per item at the un-shifted cluster centre and label it.
ax_right.set_xticks(base_positions)
ax_right.set_xticklabels([label_map.get(k, k) for k in items_order], fontsize=TICK_FS)
ax_right.tick_params(axis="x", pad=10)  # pad for 2-line labels
# The axes were created with sharey=True, so this range applies to the left
# panel as well.
ax_right.set_ylim(0.5, 6.5)
ax_right.set_title("b. Distributions by Level (Class)", fontsize=TITLE_FS)
ax_right.grid(axis="y", alpha=0.3)
ax_right.tick_params(axis="both", labelsize=TICK_FS)
ax_right.legend(handles=legend_handles, title="Level", loc="lower right",
                frameon=False, prop={"size": LEGEND_FS}, title_fontsize=LEGEND_FS)

# Margins so the two-line labels don't clip and both panels are visible
plt.subplots_adjust(left=0.08, right=0.98, top=0.90, bottom=0.22, wspace=0.1)

# Write the figure to disk before showing it, then report the path.
out_path = "/content/boxplots_two_panel_Q1_Q4_Q2_Q5_Q3_Q6.png"
plt.savefig(out_path, dpi=300, bbox_inches="tight")
plt.show()
print(f"Saved figure to {out_path}")


# [1] Wilcoxon ordinal comparison

In [None]:
#@markdown --- Colab: Wilcoxon pre–post within Level=UG/GRAD ---

import pandas as pd
import numpy as np
from scipy.stats import wilcoxon, norm

# ---------- Load ----------
csv_path = "/content/recalldata.csv"   # <-- change to your file
df = pd.read_csv(csv_path)
df.columns = [c.strip() for c in df.columns]

# ---------- Cohort from Level ----------
# Fail with the same readable message the other cells use instead of a bare
# KeyError when the grouping column is missing.
if "Level" not in df.columns:
    raise ValueError("Column 'Level' not found.")
# Expecting Level already as UG / GRAD; also normalize common variants just in case.
# Values are trimmed and upper-cased first, so the mapping keys are upper-case.
df["Cohort"] = (
    df["Level"].astype(str).str.strip().str.upper()
      .replace({"UNDERGRAD":"UG","UNDERGRADUATE":"UG","U":"UG",
                "GRADUATE":"GRAD","G":"GRAD"})
)
print("Cohort counts:\n", df["Cohort"].value_counts(dropna=False))

# ---------- Likert to numeric ----------
likert = ["Q1","Q2","Q3","Q4","Q5","Q6"]
for c in likert:
    # Same validation as the descriptive and plotting cells.
    if c not in df.columns:
        raise ValueError(f"Column '{c}' not found in the CSV.")
    # Unparsable entries (e.g. blanks) become NaN.
    df[c] = pd.to_numeric(df[c], errors="coerce")

# ---------- Helpers ----------
def iqr(s):
    """Return the first and third quartiles of *s* as a ``(q1, q3)`` tuple.

    Despite the name, the two bounds are returned, not their difference.
    """
    lower_quartile = s.quantile(0.25)
    upper_quartile = s.quantile(0.75)
    return lower_quartile, upper_quartile

def rb_from_W(W, n_eff):
    """Convert a rank sum into a rank-biserial correlation.

    Computes ``2 * W / T - 1`` where ``T = n_eff * (n_eff + 1) / 2`` is the
    total of the ranks 1..n_eff.

    Args:
        W: Rank sum to convert. NOTE(review): the formula yields +1 when
            ``W == T``, so W is presumably the sum of the positive ranks -
            confirm at the call site.
        n_eff: Number of non-zero paired differences.

    Returns:
        The correlation as a float, or NaN when ``n_eff`` is not positive
        (no ranks exist, so the ratio is undefined).
    """
    # Without any non-zero difference T is 0 and the ratio cannot be formed.
    if n_eff <= 0:
        return np.nan
    T = n_eff * (n_eff + 1) / 2.0
    return (2 * W / T) - 1.0

def bootstrap_ci_rb(pre, post, n_boot=5000, seed=123):
    """Percentile bootstrap interval for the matched-pairs rank-biserial r.

    Pairs with a zero difference are discarded first. The remaining pairs are
    resampled with replacement ``n_boot`` times; for every resample the
    statistic ``(W_plus - W_minus) / T`` is computed from the average ranks
    of the absolute differences, with ``T = n * (n + 1) / 2``.

    Args:
        pre: Paired baseline scores (any 1-D array-like: list, NumPy array,
            pandas Series).
        post: Paired follow-up scores, same length as ``pre``.
        n_boot: Number of bootstrap resamples.
        seed: Seed for the NumPy random generator, for reproducibility.

    Returns:
        ``(lo, hi)``: the 2.5th and 97.5th percentiles of the bootstrap
        distribution, or ``(nan, nan)`` when no non-zero difference exists.

    Raises:
        ValueError: If ``pre`` and ``post`` differ in shape.
    """
    # Work on plain arrays so the positional indexing below is never
    # interpreted as a label lookup (pandas Series) and lists are accepted.
    pre = np.asarray(pre, dtype=float)
    post = np.asarray(post, dtype=float)
    if pre.shape != post.shape:
        raise ValueError("pre and post must have the same shape.")

    rng = np.random.default_rng(seed)
    diff = post - pre
    diff = diff[diff != 0]  # zero differences carry no sign information
    n = len(diff)
    if n == 0:
        return np.nan, np.nan

    T = n * (n + 1) / 2.0
    vals = np.empty(n_boot)
    for b in range(n_boot):
        # Resample pairs; indexing the precomputed differences is the same
        # as differencing the resampled pairs.
        idx = rng.integers(0, n, n)
        d = diff[idx]
        ranks = pd.Series(np.abs(d)).rank(method="average").to_numpy()
        W_plus = ranks[d > 0].sum()
        W_minus = ranks[d < 0].sum()
        vals[b] = (W_plus - W_minus) / T
    lo, hi = np.percentile(vals, [2.5, 97.5])
    return float(lo), float(hi)

def describe(x):
    """Summarise *x* after dropping missing values.

    Returns a dict with ``n``, ``median``, ``q1``, ``q3``, ``mean`` and
    ``sd`` (sample SD, ``ddof=1``). With no usable value, ``n`` is 0 and the
    remaining entries are NaN.
    """
    values = pd.Series(x).dropna()
    count = len(values)

    if not count:
        return {"n": 0, "median": np.nan, "q1": np.nan, "q3": np.nan,
                "mean": np.nan, "sd": np.nan}

    lower = values.quantile(0.25)
    upper = values.quantile(0.75)
    return {
        "n": count,
        "median": float(values.median()),
        "q1": float(lower),
        "q3": float(upper),
        "mean": float(values.mean()),
        "sd": float(values.std(ddof=1)),
    }

def analyze_pair(df_cohort, pre_col, post_col, construct, cohort):
    """Run a paired Wilcoxon signed-rank test for one pre/post item pair.

    Rows with a missing value in either column are dropped first, so every
    statistic is based on complete pairs only.

    Args:
        df_cohort: Rows of one cohort.
        pre_col: Name of the pre-survey column.
        post_col: Name of the post-survey column.
        construct: Label stored in the ``construct`` field of the result.
        cohort: Label stored in the ``cohort`` field of the result.

    Returns:
        A dict with ``cohort``, ``construct``, pre/post ``n``, median, Q1 and
        Q3, the test statistic ``W`` and two-sided ``p_value`` reported by
        ``scipy.stats.wilcoxon``, the matched-pairs rank-biserial ``r_rb``
        (positive when post > pre) with its bootstrap interval, and
        ``r_from_Z`` (``sign(r_rb) * z / sqrt(n_eff)`` with z recovered from
        the p-value). All statistics are NaN when there are no complete
        pairs; the test and effect-size fields are NaN when every difference
        is zero.
    """
    sub = df_cohort[[pre_col, post_col]].dropna().astype(float)
    pre = sub[pre_col].to_numpy()
    post = sub[post_col].to_numpy()

    if len(sub) == 0:
        return {**{"cohort":cohort,"construct":construct},
                **{k:np.nan for k in [
                    "pre_n","pre_median","pre_Q1","pre_Q3",
                    "post_n","post_median","post_Q1","post_Q3",
                    "W","p_value","r_rb","r_rb_CI_lo","r_rb_CI_hi","r_from_Z"
                ]}}

    pre_d, post_d = describe(pre), describe(post)

    # Non-zero differences drive both the test and the effect size.
    d = post - pre
    d = d[d != 0]
    n_eff = int(len(d))

    if n_eff == 0:
        # Every pair is tied. Check this BEFORE calling wilcoxon: with
        # zero_method="wilcox" nothing would be left to rank, and SciPy
        # releases are known to raise ValueError in that situation.
        W = p = r_rb = r_lo = r_hi = r_from_Z = np.nan
    else:
        # p-value (two-sided). Order as post, pre so "improvement" is post > pre
        res = wilcoxon(post, pre, zero_method="wilcox", alternative="two-sided", method="auto")
        W = float(res.statistic)
        p = float(res.pvalue)

        # Effect size from signed ranks
        ranks = pd.Series(np.abs(d)).rank(method="average").to_numpy()
        W_plus  = ranks[d > 0].sum()
        W_minus = ranks[d < 0].sum()
        T = n_eff * (n_eff + 1) / 2.0
        r_rb = (W_plus - W_minus) / T                 # positive if post > pre
        r_lo, r_hi = bootstrap_ci_rb(pre, post)       # same orientation
        # Recover |z| from the two-sided p-value; undefined for p of 0 or 1.
        z_abs = norm.isf(p/2.0) if 0 < p < 1 else np.nan
        r_from_Z = float(np.sign(r_rb) * z_abs / np.sqrt(n_eff)) if np.isfinite(z_abs) else np.nan

    return {
        "cohort": cohort, "construct": construct,
        "pre_n": pre_d["n"], "pre_median": pre_d["median"], "pre_Q1": pre_d["q1"], "pre_Q3": pre_d["q3"],
        "post_n": post_d["n"], "post_median": post_d["median"], "post_Q1": post_d["q1"], "post_Q3": post_d["q3"],
        "W": W, "p_value": p, "r_rb": r_rb, "r_rb_CI_lo": r_lo, "r_rb_CI_hi": r_hi, "r_from_Z": r_from_Z
    }

# ---------- Run analyses ----------
# Each construct maps to its (pre, post) item pair.
constructs = {
    "Digital confidence (DC)": ("Q1", "Q4"),
    "Perceived teaching benefit (PTB)": ("Q2", "Q5"),
    "Engagement & motivation (E&M)": ("Q3", "Q6"),
}

# One result row per cohort x construct, cohorts in the order UG then GRAD.
rows = []
for cohort in ["UG","GRAD"]:
    g = df[df["Cohort"] == cohort]
    for name, (pre_col, post_col) in constructs.items():
        rows.append(analyze_pair(g, pre_col, post_col, name, cohort))

summary = pd.DataFrame(rows)
# A bare `summary` expression only renders when it is the last statement of a
# cell; call display() explicitly so the table is shown before saving.
display(summary)
summary.to_csv("/content/survey_wilcoxon_summary.csv", index=False)
print("Saved to /content/survey_wilcoxon_summary.csv")
