In [2]:
import pandas as pd

In [3]:
import pandas as pd
from typing import Tuple

# ----------------------------
# Core tables (MEAN-based)
# ----------------------------
def preference_counts(
    df: pd.DataFrame,
    pref_col: str = "preference_mean",
    by: Tuple[str, ...] = ("model", "language"),
) -> pd.DataFrame:
    """
    Count "inverse" vs "surface" labels found in `pref_col`, per group.

    Parameters
    ----------
    df : rows to summarise; must contain every column in `by` plus `pref_col`.
    pref_col : column holding the preference label (default: preference_mean).
    by : grouping columns.

    Returns
    -------
    Wide table with columns: by..., inverse, surface, total, p_inverse.
    `total` is inverse + surface only (rows carrying any other label are not
    counted). `p_inverse` is always a float column and is NaN where total is 0.

    Raises
    ------
    ValueError if any column in `by` or `pref_col` is missing from `df`.
    """
    keys = list(by)
    missing = [c for c in (*keys, pref_col) if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns for preference_counts: {missing}")

    counts_long = (
        df.groupby(keys + [pref_col], dropna=False)
          .size()
          .rename("count")
          .reset_index()
          .rename(columns={pref_col: "preference"})
    )

    wide = (
        counts_long.pivot_table(
            index=keys,
            columns="preference",
            values="count",
            aggfunc="sum",
            fill_value=0,
        )
        .reset_index()
    )
    # The pivot leaves "preference" as the name of the columns axis; drop it so
    # the returned frame looks like an ordinary flat table.
    wide.columns.name = None

    # A label that never occurs has no pivot column; add it as all zeros.
    for label in ("inverse", "surface"):
        if label not in wide.columns:
            wide[label] = 0

    wide["total"] = wide["inverse"] + wide["surface"]
    # Mask zero totals with NaN (instead of pd.NA) so the ratio stays float64
    # rather than silently becoming an object column when a group is empty.
    nonzero_total = wide["total"].where(wide["total"] != 0)
    wide["p_inverse"] = (wide["inverse"] / nonzero_total).astype(float)

    return wide[keys + ["inverse", "surface", "total", "p_inverse"]]


def delta_summary(
    df: pd.DataFrame,
    by: Tuple[str, ...] = ("model", "language"),
    cols: Tuple[str, ...] = ("delta_mean", "ratio_mean"),
) -> pd.DataFrame:
    """
    Per-group count / mean / median / std for the requested value columns.

    Columns listed in `cols` that are absent from `df` are skipped; if none of
    them is present an empty DataFrame is returned. Output columns are named
    "<column>__<statistic>" and follow the grouping columns.

    Raises ValueError when a grouping column in `by` is missing from `df`.
    """
    group_keys = list(by)
    absent = [key for key in group_keys if key not in df.columns]
    if absent:
        raise ValueError(f"Missing columns for delta_summary groupby: {absent}")

    available = [name for name in cols if name in df.columns]
    if len(available) == 0:
        return pd.DataFrame()

    grouped = df.groupby(group_keys, dropna=False)
    stats = grouped[available].agg(["count", "mean", "median", "std"])

    # Flatten the (column, statistic) MultiIndex into "column__statistic".
    flat_names = []
    for parts in stats.columns.to_flat_index():
        flat_names.append("__".join(str(part) for part in parts).strip())
    stats.columns = flat_names

    return stats.reset_index()


def paired_preferences(
    df: pd.DataFrame,
    lang_a: str = "en",
    lang_b: str = "zh",
    value_col: str = "preference_mean",
) -> pd.DataFrame:
    """
    Line up the `value_col` of two languages for every (model, pair_id).

    Only rows whose language is `lang_a` or `lang_b` are used. When a
    (model, pair_id, language) combination occurs more than once the first
    value is kept, and pairs lacking a value for either language are dropped.

    Returns columns:
      model, pair_id, one column per language, agree, pattern
    where `agree` is True when both languages carry the same value and
    `pattern` is "<value_a>_<LANG_A>__<value_b>_<LANG_B>".
    An empty DataFrame is returned if either language has no column after
    pivoting.

    Raises ValueError when a required column is missing from `df`.
    """
    required = ["model", "language", "pair_id", value_col]
    absent = [name for name in required if name not in df.columns]
    if absent:
        raise ValueError(f"Missing columns for paired_preferences: {absent}")

    wanted_langs = df["language"].isin([lang_a, lang_b])
    subset = df[wanted_langs][["model", "language", "pair_id", value_col]].copy()

    table = subset.pivot_table(
        index=["model", "pair_id"],
        columns="language",
        values=value_col,
        aggfunc="first",
    )
    # Keep complete pairs only, then turn the index back into columns.
    table = table.dropna().reset_index()

    for lang in (lang_a, lang_b):
        if lang not in table.columns:
            return pd.DataFrame()

    table["agree"] = table[lang_a] == table[lang_b]

    left_part = table[lang_a].astype(str) + f"_{lang_a.upper()}__"
    right_part = table[lang_b].astype(str) + f"_{lang_b.upper()}"
    table["pattern"] = left_part + right_part
    return table


def pair_agreement_by_model(pairs: pd.DataFrame) -> pd.DataFrame:
    """
    Mean of the `agree` column per model, reported as `agreement_rate`.

    An empty input is handed back unchanged. Raises ValueError when a
    non-empty input lacks the `model` or `agree` column.
    """
    if pairs.empty:
        return pairs

    has_columns = all(name in pairs.columns for name in ("model", "agree"))
    if not has_columns:
        raise ValueError("pairs must have columns: model, agree")

    rates = pairs.groupby("model")["agree"].mean()
    labelled = rates.rename("agreement_rate")
    return labelled.reset_index()


def pair_pattern_counts(pairs: pd.DataFrame) -> pd.DataFrame:
    """
    Number of rows per (model, pattern), most frequent pattern first within
    each model.

    An empty input is handed back unchanged. Raises ValueError when a
    non-empty input lacks the `model` or `pattern` column.
    """
    if pairs.empty:
        return pairs

    has_columns = all(name in pairs.columns for name in ("model", "pattern"))
    if not has_columns:
        raise ValueError("pairs must have columns: model, pattern")

    tally = pairs.groupby(["model", "pattern"]).size()
    table = tally.rename("count").reset_index()
    # Models ascending, counts descending inside each model.
    return table.sort_values(["model", "count"], ascending=[True, False])


In [4]:
import json
from pathlib import Path
from typing import Any, Dict, Iterable, List
import pandas as pd

def read_jsonl(path: str | Path) -> pd.DataFrame:
    """Read JSONL file (one JSON object per line) into a DataFrame."""
    path = Path(path)
    rows: List[Dict[str, Any]] = []
    with path.open("r", encoding="utf-8") as f:
        for i, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                rows.append(json.loads(line))
            except json.JSONDecodeError as e:
                raise ValueError(f"Bad JSON at {path}:{i}") from e

    df = pd.DataFrame(rows)
    if df.empty:
        return df

    # Normalize common string cols
    for col in ["model", "language", "pair_id", "stimulus_id", "template_id"]:
        if col in df.columns:
            df[col] = df[col].astype(str)

    # Normalize preference labels (mean-based + sum-based if present)
    for col in ["preference_mean", "preference_sum"]:
        if col in df.columns:
            df[col] = df[col].astype(str).str.lower()

    # Normalize numeric cols (mean-based preferred, but keep both if present)
    numeric_cols = [
        "delta_mean", "ratio_mean",
        "delta_sum", "ratio_sum",
        "cont_log_probs_sum_inverse", "cont_log_probs_sum_surface",
        "cont_log_probs_mean_inverse", "cont_log_probs_mean_surface",
        "n_cont_tokens_inverse", "n_cont_tokens_surface",
    ]
    for c in numeric_cols:
        if c in df.columns:
            df[c] = pd.to_numeric(df[c], errors="coerce")

    return df


def read_jsonl_many(paths: Iterable[str | Path]) -> pd.DataFrame:
    """
    Read multiple JSONL files and concatenate.
    Adds a `_source_file` column with the basename of each input file.
    """
    dfs: List[pd.DataFrame] = []
    for p in paths:
        p = Path(p)
        d = read_jsonl(p)
        if not d.empty:
            d = d.copy()
            d["_source_file"] = p.name
            dfs.append(d)

    return pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()


In [5]:
# ----------------------------
# Pretty display helpers (Markdown)
# ----------------------------
def _fmt_percent(x: Any, digits: int = 1) -> str:
    if pd.isna(x):
        return ""
    return f"{float(x) * 100:.{digits}f}%"

def _fmt_float(x: Any, digits: int = 3) -> str:
    if pd.isna(x):
        return ""
    return f"{float(x):.{digits}f}"

def _fmt_int(x: Any) -> str:
    if pd.isna(x):
        return ""
    return f"{int(x):,d}"

def to_markdown_table(
    df: pd.DataFrame,
    index: bool = False,
    tablefmt: str = "github",
) -> str:
    """
    Render `df` as a text table via `DataFrame.to_markdown`.

    An empty frame yields the placeholder "_(empty table)_". `index` and
    `tablefmt` are forwarded to `to_markdown`, which relies on the optional
    'tabulate' package being installed.
    """
    if not df.empty:
        return df.to_markdown(index=index, tablefmt=tablefmt)
    return "_(empty table)_"

def pretty_counts_markdown(
    t: pd.DataFrame,
    pct_col: str = "p_inverse",
    pct_digits: int = 1,
    index: bool = False,
) -> str:
    """
    Render a counts table as markdown text.

    Works on a copy of `t`:
    - inverse / surface / total (when present) are shown as comma-grouped ints
    - `pct_col` (when present) is shown as a percentage with `pct_digits`
      decimals
    An empty table yields "_(empty table)_".
    """
    if t.empty:
        return "_(empty table)_"

    def _as_percent(value: Any) -> str:
        return _fmt_percent(value, digits=pct_digits)

    formatted = t.copy()

    # Integer columns first, percentage column afterwards.
    int_columns = [name for name in ("inverse", "surface", "total") if name in formatted.columns]
    for name in int_columns:
        formatted[name] = formatted[name].map(_fmt_int)

    if pct_col in formatted.columns:
        formatted[pct_col] = formatted[pct_col].map(_as_percent)

    return to_markdown_table(formatted, index=index)

def pretty_delta_markdown(
    t: pd.DataFrame,
    float_digits: int = 3,
    index: bool = False,
) -> str:
    """
    Render a summary-statistics table as markdown text.

    Works on a copy of `t`; only numeric columns are reformatted:
    - columns whose name ends in "__count" are shown as comma-grouped ints
    - every other numeric column is shown with `float_digits` decimals
    An empty table yields "_(empty table)_".
    """
    if t.empty:
        return "_(empty table)_"

    def _as_float(value: Any) -> str:
        return _fmt_float(value, digits=float_digits)

    formatted = t.copy()
    for name in formatted.columns:
        if not pd.api.types.is_numeric_dtype(formatted[name]):
            continue
        formatter = _fmt_int if str(name).endswith("__count") else _as_float
        formatted[name] = formatted[name].map(formatter)

    return to_markdown_table(formatted, index=index)

def pretty_pair_agreement_markdown(
    t: pd.DataFrame,
    rate_col: str = "agreement_rate",
    pct_digits: int = 1,
    index: bool = False,
) -> str:
    """
    Render an agreement-rate table as markdown text.

    Works on a copy of `t`; `rate_col` (when present) is shown as a percentage
    with `pct_digits` decimals. An empty table yields "_(empty table)_".
    """
    if t.empty:
        return "_(empty table)_"

    def _as_percent(value: Any) -> str:
        return _fmt_percent(value, digits=pct_digits)

    formatted = t.copy()
    if rate_col in formatted.columns:
        formatted[rate_col] = formatted[rate_col].map(_as_percent)
    return to_markdown_table(formatted, index=index)


In [13]:
# Load the scored results for a single model run; read_jsonl_many adds a
# `_source_file` column with the file's basename.
df = read_jsonl_many(["results/scored_wide_uer_gpt2-distil-chinese-cluecorpussmall.jsonl"])

# Build the summary tables (defaults: preference_mean, delta_mean, ratio_mean).
counts = preference_counts(df)                 # inverse/surface counts and p_inverse per (model, language)
delta  = delta_summary(df)                     # count/mean/median/std of delta_mean and ratio_mean per (model, language)
pairs  = paired_preferences(df, "en", "zh")     # en vs zh preference side by side per (model, pair_id)
agree  = pair_agreement_by_model(pairs)        # share of pairs where en and zh agree, per model
patts  = pair_pattern_counts(pairs)            # how often each en/zh pattern occurs, per model

# Print each table as markdown text, separated by a blank line.
print(pretty_counts_markdown(counts))
print()

print(pretty_delta_markdown(delta))
print()

print(pretty_pair_agreement_markdown(agree))
print()

# The pattern table needs no number formatting, so it is rendered directly.
print(to_markdown_table(patts))


| model                                   | language   |   inverse |   surface |   total | p_inverse   |
|-----------------------------------------|------------|-----------|-----------|---------|-------------|
| uer_gpt2-distil-chinese-cluecorpussmall | en         |        64 |         0 |      64 | 100.0%      |
| uer_gpt2-distil-chinese-cluecorpussmall | zh         |         0 |        64 |      64 | 0.0%        |

| model                                   | language   |   delta_mean__count |   delta_mean__mean |   delta_mean__median |   delta_mean__std |   ratio_mean__count |   ratio_mean__mean |   ratio_mean__median |   ratio_mean__std |
|-----------------------------------------|------------|---------------------|--------------------|----------------------|-------------------|---------------------|--------------------|----------------------|-------------------|
| uer_gpt2-distil-chinese-cluecorpussmall | en         |                  64 |              1.15  |                1.047 