# Multilingual Semantics Probe

## Step 1: Corpus Generation

In [1]:
from __future__ import annotations

import itertools
import json
from dataclasses import dataclass
from typing import Dict, List

import pandas as pd
import os

In [2]:
# Directory that receives every generated stimulus file.
STIMULI_DIR = "./stimuli"

# exist_ok=True makes re-running this cell a no-op and avoids the
# check-then-create race of a separate os.path.exists() test; makedirs
# also creates missing parent directories if the path is ever nested.
os.makedirs(STIMULI_DIR, exist_ok=True)

In [3]:
# --- English lexicon ---
# Nouns used for the subject and object slots, respectively.
EN_SUBJECTS = ["shark", "robot", "chef", "dog"]
EN_OBJECTS = ["pirate", "student", "doctor", "tourist"]

# Verbs are stored already inflected in the simple past.
EN_VERBS_PAST = ["ate", "helped", "pushed", "chased"]

# --- Mandarin lexicon ---
# Bare nouns only: no quantifier is part of an entry.
ZH_SUBJECTS = ["鲨鱼", "机器人", "厨师", "狗"]
ZH_OBJECTS = ["海盗", "学生", "医生", "游客"]

# Verb stems compatible with 了.
ZH_VERBS = ["吃", "帮助", "推", "追"]

# Noun -> classifier. Lookups for nouns missing from this map default to 个.
ZH_CLASSIFIER: Dict[str, str] = {
    **dict.fromkeys(("鲨鱼", "狗"), "只"),
    **dict.fromkeys(("机器人", "厨师", "海盗", "学生", "医生", "游客"), "个"),
}

In [4]:
# Sentence templates, one per list entry. Placeholders are filled with
# str.format by the generator functions.

# Classic ambiguous English form.
EN_TEMPLATES = ["A {subj} {verb_past} every {obj}."]

# Canonical Mandarin surface-scope reading.
ZH_TEMPLATES = ["有一{cl}{subj}{verb}了每个{obj}。"]

In [5]:
@dataclass(frozen=True)
class Stimulus:
    """One generated sentence plus the lexical items used to build it.

    Instances are immutable (``frozen=True``).
    """
    # Language code; the generators in this file emit "en" or "zh".
    language: str
    # Template identifier: language prefix + template index, e.g. "en_0".
    template_id: str
    # Subject noun substituted into the template.
    subj: str
    # Object noun substituted into the template.
    obj: str
    # Verb exactly as substituted (the English generator passes past-tense forms).
    verb: str
    # The fully rendered sentence.
    sentence: str


def get_classifier(noun: str, cl_map: Dict[str, str]) -> str:
    """Return the classifier registered for *noun*, or "个" if there is none.

    Args:
        noun: The noun to look up.
        cl_map: Mapping from noun to classifier.

    Returns:
        The mapped classifier, falling back to the general classifier "个".
    """
    if noun in cl_map:
        return cl_map[noun]
    return "个"


def generate_english(
    subjects: List[str],
    objects: List[str],
    verbs_past: List[str],
) -> List[Stimulus]:
    """Render every English template for every subject/object/verb combination.

    Args:
        subjects: Subject nouns.
        objects: Object nouns.
        verbs_past: Verbs, already in the form the template expects.

    Returns:
        Stimuli ordered by template, then by the Cartesian product of
        ``subjects`` x ``objects`` x ``verbs_past`` (last factor varies fastest).
    """
    return [
        Stimulus(
            language="en",
            template_id=f"en_{index}",
            subj=subj,
            obj=obj,
            verb=verb,
            sentence=template.format(subj=subj, obj=obj, verb_past=verb),
        )
        for index, template in enumerate(EN_TEMPLATES)
        for subj, obj, verb in itertools.product(subjects, objects, verbs_past)
    ]


def generate_mandarin(
    subjects: List[str],
    objects: List[str],
    verbs: List[str],
    cl_map: Dict[str, str],
) -> List[Stimulus]:
    """Render every Mandarin template for every subject/object/verb combination.

    Args:
        subjects: Subject nouns.
        objects: Object nouns.
        verbs: Verb stems.
        cl_map: Noun -> classifier mapping used for the subject's classifier.

    Returns:
        Stimuli ordered by template, then by the Cartesian product of
        ``subjects`` x ``objects`` x ``verbs`` (last factor varies fastest).
    """
    stimuli: List[Stimulus] = []
    for index, template in enumerate(ZH_TEMPLATES):
        template_id = f"zh_{index}"
        for subj, obj, verb in itertools.product(subjects, objects, verbs):
            slots = {"subj": subj, "obj": obj, "verb": verb}
            # The classifier is chosen by the subject noun only.
            sentence = template.format(cl=get_classifier(subj, cl_map), **slots)
            stimuli.append(
                Stimulus(
                    language="zh",
                    template_id=template_id,
                    sentence=sentence,
                    **slots,
                )
            )
    return stimuli

In [6]:
# Combined corpus: English stimuli first, then Mandarin.
stimuli = [
    *generate_english(EN_SUBJECTS, EN_OBJECTS, EN_VERBS_PAST),
    *generate_mandarin(ZH_SUBJECTS, ZH_OBJECTS, ZH_VERBS, ZH_CLASSIFIER),
]

# One row per stimulus, one column per dataclass field.
continuation_df = pd.DataFrame([vars(item) for item in stimuli])

# IDs for downstream scoring: "<language>-<template_id>-<zero-padded row index>".
# The row index runs over the whole frame, so Mandarin IDs continue
# numbering after the English ones.
continuation_df.insert(
    loc=0,
    column="stimulus_id",
    value=[
        f"{record.language}-{record.template_id}-{record.Index:06d}"
        for record in continuation_df.itertuples()
    ],
)

In [7]:
print("Total stimuli:", len(continuation_df))
print(continuation_df["language"].value_counts())

# Show up to five randomly sampled sentences per language; the fixed seed
# keeps the preview reproducible across runs.
for _lang in ("en", "zh"):
    _preview = continuation_df.loc[
        continuation_df["language"] == _lang,
        ["stimulus_id", "sentence"],
    ]
    display(_preview.sample(min(5, len(_preview)), random_state=0))

Total stimuli: 128
language
en    64
zh    64
Name: count, dtype: int64


Unnamed: 0,stimulus_id,sentence
45,en-en_0-000045,A chef helped every tourist.
29,en-en_0-000029,A robot helped every tourist.
43,en-en_0-000043,A chef chased every doctor.
61,en-en_0-000061,A dog helped every tourist.
34,en-en_0-000034,A chef pushed every pirate.


Unnamed: 0,stimulus_id,sentence
109,zh-zh_0-000109,有一个厨师帮助了每个游客。
93,zh-zh_0-000093,有一个机器人帮助了每个游客。
107,zh-zh_0-000107,有一个厨师追了每个医生。
125,zh-zh_0-000125,有一只狗帮助了每个游客。
98,zh-zh_0-000098,有一个厨师推了每个海盗。


In [8]:
# Persist the stimuli in two formats: a CSV table and a JSON Lines file
# (one JSON record per line).
continuation_df.to_csv(os.path.join(STIMULI_DIR, "stimuli.csv"), index=False)

# ensure_ascii=False writes the Mandarin text as-is instead of \u escapes.
with open(os.path.join(STIMULI_DIR, "stimuli.jsonl"), "w", encoding="utf-8") as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n"
        for record in continuation_df.to_dict(orient="records")
    )

print("Wrote stimuli.csv and stimuli.jsonl")

Wrote stimuli.csv and stimuli.jsonl


### Add Natural Language Continuations

In [9]:
# Continuation templates keyed by reading type. Each starts with a space
# so it can be appended directly to a sentence.
EN_CONTINUATIONS = dict(
    surface=" There was only one {subj}.",
    inverse=" There were many {subj}.",
)

# Mandarin: kept equally short.
# Note: plural is usually implicit, so "很多" serves as the lexical cue.
ZH_CONTINUATIONS = dict(
    surface=" 只有一{cl}{subj}。",
    inverse=" 有很多{cl}{subj}。",
)

In [10]:
def add_continuations(df: pd.DataFrame) -> pd.DataFrame:
    """Expand every stimulus row into one row per continuation type.

    Each input row yields two output rows ("surface" then "inverse") that
    carry all original columns plus:

    * ``continuation_type`` -- "surface" or "inverse"
    * ``continuation_text`` -- the continuation to be scored
    * ``full_text`` -- ``sentence`` + ``continuation_text`` (for debugging)

    Args:
        df: Stimulus table with at least ``language``, ``subj`` and
            ``sentence`` columns; ``language`` must be "en" or "zh".

    Returns:
        A new DataFrame with two rows per input row, in input order. An
        empty input yields an empty frame that still has the input columns
        and the three added columns.

    Raises:
        ValueError: If a row's ``language`` is neither "en" nor "zh".
    """
    added_columns = ["continuation_type", "continuation_text", "full_text"]
    rows = []

    # to_dict(orient="records") keeps column labels verbatim. itertuples()
    # would rename labels that are not valid identifiers or that start with
    # an underscore, silently corrupting any such extra column.
    for base in df.to_dict(orient="records"):
        if base["language"] == "en":
            # Naive pluralization: append "s". Irregular plurals would need
            # a lookup map.
            subj_plural = base["subj"] + "s"
            cont_map = {
                "surface": EN_CONTINUATIONS["surface"].format(subj=base["subj"]),
                "inverse": EN_CONTINUATIONS["inverse"].format(subj=subj_plural),
            }
        elif base["language"] == "zh":
            # Same classifier lookup (and "个" fallback) as sentence generation.
            cl = get_classifier(base["subj"], ZH_CLASSIFIER)
            cont_map = {
                "surface": ZH_CONTINUATIONS["surface"].format(cl=cl, subj=base["subj"]),
                "inverse": ZH_CONTINUATIONS["inverse"].format(cl=cl, subj=base["subj"]),
            }
        else:
            raise ValueError(f"Unknown language: {base['language']}")

        for cont_type, cont_text in cont_map.items():
            rows.append(
                {
                    **base,
                    "continuation_type": cont_type,
                    "continuation_text": cont_text,
                    "full_text": base["sentence"] + cont_text,
                }
            )

    # Spell out the column order so an empty input still produces the full
    # schema; for non-empty input this matches the key order of each row.
    columns = list(df.columns) + [c for c in added_columns if c not in df.columns]
    return pd.DataFrame(rows, columns=columns)



In [11]:
df_cont = add_continuations(continuation_df)

# Tag each row with its "subj|obj|verb" triple, unless the column is
# already present.
if "concept_id" not in df_cont:
    concept_series = df_cont["subj"].str.cat(
        [df_cont["obj"], df_cont["verb"]],
        sep="|",
    )
    df_cont.insert(loc=1, column="concept_id", value=concept_series)

In [12]:
# Persist the expanded table as CSV and as JSON Lines (one record per line).
df_cont.to_csv(
    os.path.join(STIMULI_DIR, "stimuli_with_continuations.csv"),
    index=False,
)

# ensure_ascii=False writes the Mandarin text as-is instead of \u escapes.
with open(
    os.path.join(STIMULI_DIR, "stimuli_with_continuations.jsonl"),
    "w",
    encoding="utf-8",
) as out_file:
    out_file.writelines(
        json.dumps(record, ensure_ascii=False) + "\n"
        for record in df_cont.to_dict(orient="records")
    )

print("Wrote stimuli_with_continuations.csv and stimuli_with_continuations.jsonl")

Wrote stimuli_with_continuations.csv and stimuli_with_continuations.jsonl
