**Outputs (overwritten on each run):**
- `experiment/resources/lists/con_run1.csv`
- `experiment/resources/lists/abs_run1.csv`
- `experiment/resources/lists/con_run2.csv`
- `experiment/resources/lists/abs_run2.csv`

**Placeholder outputs (also written by this notebook, then overwritten by `04_generate_baseline_files.ipynb`):**
- `experiment/resources/lists/base_run1.csv`
- `experiment/resources/lists/base_run2.csv`

In [1]:
# imports

from __future__ import annotations
import re
from pathlib import Path
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp, levene

In [None]:
# config

# Encoding used for every CSV written at the end of this notebook
# ("utf-8-sig" = UTF-8 with a leading byte-order mark).
CSV_ENCODING = "utf-8-sig"

# Single shared generator: the matching search and the run split both draw
# from it, so results depend on the order in which cells are executed.
RNG_SEED = 42
rng = np.random.default_rng(RNG_SEED)

# All paths are resolved relative to the kernel's working directory.
THIS_DIR = Path.cwd()
ROOT = THIS_DIR.parents[1]          # two levels above the working directory; presumably the project root — TODO confirm the kernel is started from the notebook's folder
STIMSEL = ROOT / "stimuli_selection"
LEX = STIMSEL / "lexicons"

# Input lexicons.
SUBTLEX_PATH = LEX / "subtlex-pl.csv"
ANPW_PATH    = LEX / "anpw_r.csv"

# Output folder for the generated word lists; created here if missing.
OUT_LISTS = ROOT / "experiment" / "resources" / "lists"
OUT_LISTS.mkdir(parents=True, exist_ok=True)

# global blacklist

# Words excluded from stimulus selection, kept as whitespace-separated tokens.
# Case is folded below, so the filter that uses BLACKLIST can compare against
# lower-cased spellings.
_BLACKLIST_RAW = """
    północ czar moment słowo śmierć wrogi LOL
    demon masakra typ rok ciasteczko turbo szał
    tyłek korek kolor brak data głupiec frajer
    słodycze układność faza pakt budynek brawo
    sezon rodzaj mistrz wartość smoking Budda
    orgazm Graal flaki szaleniec istota drań Wigilia
    szok racja wiedza gniew sprzeciw tchórz chwała
    guru podbródek skóra stodoła szczur pies żaba
    władza wstyd ból wola macho geniusz wóz szyja
    pióro ryba radio ser styl suma trud standard
    równie zupa bank sposób głupota śmiech plus norma
    pech szczerość stan ego solo los stół blok strata
    spór sprawa wpadka czapka dziób kask jeep miód
    gamma hańba kwiat Frisbee spojrzenie plan skala
    dzbanek drzewo ziemniaki sok nuda plugawstwo malarstwo
    lód chleb
"""

# str.split() already yields stripped, non-empty tokens; only case folding is left.
BLACKLIST = {token.lower() for token in _BLACKLIST_RAW.split()}




In [None]:
# design

# Block structure of the experiment; the per-run and total word counts below
# are derived from these numbers.
N_TRIALS_PER_BLOCK = 8
N_CON_BLOCKS_PER_RUN = 3
N_ABS_BLOCKS_PER_RUN = 3
# NOTE(review): N_BASE_BLOCKS_PER_RUN is not used by any formula in this cell;
# N_BASE_PER_RUN below is tied to N_CON_PER_RUN instead — confirm this is intended.
N_BASE_BLOCKS_PER_RUN = 6
N_RUNS = 2

# Words (rows) needed per run for each condition.
N_CON_PER_RUN = N_TRIALS_PER_BLOCK * N_CON_BLOCKS_PER_RUN
N_ABS_PER_RUN = N_TRIALS_PER_BLOCK * N_ABS_BLOCKS_PER_RUN
N_BASE_PER_RUN = N_CON_PER_RUN

# Words needed across all runs.
N_CON_TOTAL = N_CON_PER_RUN * N_RUNS
N_ABS_TOTAL = N_ABS_PER_RUN * N_RUNS

# Displayed as the cell output for a quick sanity check.
N_CON_PER_RUN, N_ABS_PER_RUN, N_CON_TOTAL, N_ABS_TOTAL, N_BASE_PER_RUN


(24, 24, 48, 48, 24)

In [4]:
def read_subtlex_auto(path: Path) -> pd.DataFrame:
    """Load the SUBTLEX table and return one row per word.

    The delimiter is sniffed by pandas (``sep=None``). The ``spelling`` and
    ``zipf.freq`` columns are renamed to ``word`` and ``zipf``.

    Parameters
    ----------
    path : Path
        Location of the SUBTLEX CSV file (read as UTF-8).

    Returns
    -------
    pd.DataFrame
        Columns ``word`` (stripped string), ``zipf`` (numeric) and ``nchar``
        (numeric; taken from the file when present, otherwise the length of
        the stripped spelling). Only the first row of each word is kept.

    Raises
    ------
    ValueError
        If the ``spelling`` or ``zipf.freq`` column is missing.
    """
    df = pd.read_csv(path, sep=None, engine="python", encoding="utf-8")
    df = df.rename(columns={"spelling": "word", "zipf.freq": "zipf"})
    if "word" not in df.columns:
        raise ValueError("SUBTLEX: missing 'spelling' column.")
    if "zipf" not in df.columns:
        raise ValueError("SUBTLEX: missing 'zipf.freq' column.")

    # Rows without a spelling are dropped first: astype(str) would otherwise
    # turn the missing value into the literal word "nan".
    df = df[df["word"].notna()].copy()

    # Strip BEFORE deriving the length so padding never inflates nchar.
    df["word"] = df["word"].astype(str).str.strip()
    if "nchar" not in df.columns:
        df["nchar"] = df["word"].str.len()

    df["zipf"] = pd.to_numeric(df["zipf"], errors="coerce")
    df["nchar"] = pd.to_numeric(df["nchar"], errors="coerce")
    return df[["word", "zipf", "nchar"]].drop_duplicates("word")

def read_anpw(path: Path) -> pd.DataFrame:
    """Load the semicolon-separated ANPW table and normalise its key columns.

    ``polish word`` and ``part of speach`` are renamed to ``word`` and ``pos``
    when present, ``word`` is stripped, and a fixed set of rating columns is
    converted from decimal-comma text to numbers (unparseable cells become
    NaN). When ``Number of Letters`` exists it is copied into ``nchar``.
    All other columns are returned untouched.
    """
    table = pd.read_csv(path, sep=";", encoding="utf-8", engine="python")

    # rename() silently ignores keys that are not present in the header.
    table = table.rename(columns={"polish word": "word", "part of speach": "pos"})
    table["word"] = table["word"].astype(str).str.strip()

    decimal_comma_columns = (
        "Number of Letters",
        "concretness_M",
        "imegability_M",
        "Valence_M",
        "arousal_M",
        "subtlex_pl frequency",
    )
    for name in decimal_comma_columns:
        if name not in table.columns:
            continue
        # "3,14" -> "3.14"; textual "nan" and empty cells become real NaN.
        as_text = table[name].astype(str).str.replace(",", ".", regex=False)
        as_text = as_text.replace({"nan": np.nan, "": np.nan})
        table[name] = pd.to_numeric(as_text, errors="coerce")

    if "Number of Letters" in table.columns:
        table["nchar"] = table["Number of Letters"]
    return table

In [5]:
# Load both lexicons.
subtlex = read_subtlex_auto(SUBTLEX_PATH)
anpw = read_anpw(ANPW_PATH)

# Left join: every ANPW row is kept; words absent from SUBTLEX get NaN zipf.
df = anpw.merge(subtlex, on="word", how="left", suffixes=("", "_subtlex"))

# Word length: prefer the value already in "nchar", then the SUBTLEX copy,
# then the spelling itself. "nchar_subtlex" only exists when both tables
# carried an "nchar" column, so it is checked explicitly — df.get() would
# return None in that case and fillna(None) raises.
nchar = df["nchar"]
if "nchar_subtlex" in df.columns:
    nchar = nchar.fillna(df["nchar_subtlex"])
df["nchar"] = pd.to_numeric(nchar.fillna(df["word"].str.len()), errors="coerce")
df.head()

Unnamed: 0.1,Unnamed: 0,word,english word,subtlex_pl frequency,Kazojć (2011) frequency,pos,Number of Letters,Valence_N,Valence_MIN,Valence_MAX,...,imegability_M_Male,imegability_SD_Male,ageOfAquisition_N_Male,ageOfAquisition_MIN_Male,ageOfAquisition_MAX_Male,ageOfAquisition_M_Male,ageOfAquisition_SD_Male,nchar,zipf,nchar_subtlex
0,131,belka,beam,72,332,N,5.0,50,2,8,...,748,1782320585,25,3,14,788,3059411708,5.0,2.696093,5.0
1,173,bęben,drum,228,522,N,5.0,50,3,8,...,784,1885912688,25,4,12,764,2360790828,5.0,3.192606,5.0
2,220,boa,boa,170,139,ign,3.0,49,1,8,...,764,18,25,5,18,1024,3950527391,3.0,3.065766,3.0
3,1177,ketchup,ketchup,281,18,N,7.0,50,3,9,...,76,194,25,4,12,72,242,7.0,3.283019,7.0
4,1394,królik,bunny / rabbit,1084,528,N,6.0,50,3,8,...,816,162,25,3,12,6,233,6.0,3.8682,6.0


In [6]:
VOWELS = set("aąeęioóuy")

def count_syllables_pl(word: str) -> int:
    """Approximate the syllable count as the number of vowel-letter runs.

    Adjacent vowel letters count as a single run; the result is never below 1,
    even for strings without any vowel letter.
    """
    vowel_flags = [letter in VOWELS for letter in word.lower()]
    # A run starts wherever a vowel letter is not preceded by another vowel letter.
    run_starts = sum(
        1
        for before, current in zip([False] + vowel_flags, vowel_flags)
        if current and not before
    )
    return max(run_starts, 1)

# Heuristic syllable count per word, stored on the merged table.
df["syllables"] = df["word"].apply(count_syllables_pl)


In [7]:
def is_simple_word(w: str) -> bool:
    """Return True when `w` is non-empty and consists only of ASCII or Polish letters."""
    letters_only = re.compile(r"[A-Za-zĄĆĘŁŃÓŚŹŻąćęłńóśźż]+")
    return letters_only.fullmatch(w) is not None

# Work on a copy that carries a case-folded spelling for blacklist lookups.
df_f = df.assign(word_norm=df["word"].astype(str).str.strip().str.lower())

# Blacklist is applied here, before any quantiles/sampling are computed.
keep = ~df_f["word_norm"].isin(BLACKLIST)

# Nouns only (skipped when the table has no part-of-speech column).
if "pos" in df_f.columns:
    keep = keep & df_f["pos"].astype(str).str.upper().eq("N")

keep = keep & df_f["word"].map(is_simple_word)      # letters only
keep = keep & df_f["concretness_M"].notna()         # must have a concreteness rating
keep = keep & df_f["nchar"].between(3, 10)          # 3-10 letters, inclusive

df_f = df_f[keep].drop_duplicates(subset=["word"]).reset_index(drop=True)

df_f.shape

(2500, 132)

In [None]:
q_low, q_high = 0.25, 0.75
concreteness = df_f["concretness_M"]
low_thr = concreteness.quantile(q_low)
high_thr = concreteness.quantile(q_high)

# ANPW_R: lower concreteness_M = more concrete; higher = more abstract

# Bottom quartile -> concrete pool, top quartile -> abstract pool (bounds inclusive).
con_pool = df_f.loc[concreteness.le(low_thr)].reset_index(drop=True)
abs_pool = df_f.loc[concreteness.ge(high_thr)].reset_index(drop=True)

low_thr, high_thr, abs_pool.shape, con_pool.shape

(np.float64(2.04), np.float64(4.24), (630, 132), (637, 132))

In [None]:
# Ranking weights used by rank_pool(): a stronger preference for common words,
# but only within the extreme-concreteness pools selected above.
W_CONC = 2.0    # weight of the standardised concreteness extremeness (primary term)
W_ZIPF = 1.2    # weight of the standardised Zipf frequency (preference for common words)
ZIPF_MIN = 3.0  # hard floor on zipf_filled; set to None to disable the floor

def z(x: pd.Series) -> pd.Series:
    """Standardise `x` to zero mean and unit population SD (ddof=0).

    A small epsilon is added to the SD so a constant series yields zeros
    instead of a division by zero.
    """
    centred = x - x.mean()
    spread = x.std(ddof=0) + 1e-8
    return centred / spread

def rank_pool(pool: pd.DataFrame, cond: str) -> pd.DataFrame:
    """Score and sort a candidate pool, best candidates first.

    Adds two columns to a copy of `pool`:
    - ``zipf_filled``: ``zipf`` with missing values replaced by the pool median
      (so rows without a SUBTLEX frequency are compared against ZIPF_MIN using
      that imputed value);
    - ``score``: W_CONC * z(extremeness) + W_ZIPF * z(zipf_filled), where
      extremeness is the negated ``concretness_M`` for ``cond == "CON"`` and the
      raw ``concretness_M`` for any other `cond`.

    Rows below ZIPF_MIN are removed before scoring (unless ZIPF_MIN is None).
    The result is sorted by descending score with a fresh 0..n-1 index.
    """
    ranked = pool.copy()
    ranked["zipf_filled"] = ranked["zipf"].fillna(ranked["zipf"].median())
    if ZIPF_MIN is not None:
        ranked = ranked.loc[ranked["zipf_filled"].ge(ZIPF_MIN)].copy()

    # CON wants low concretness_M, ABS wants high concretness_M.
    if cond == "CON":
        extremeness = -ranked["concretness_M"]
    else:
        extremeness = ranked["concretness_M"]

    concreteness_term = W_CONC * z(extremeness)
    frequency_term = W_ZIPF * z(ranked["zipf_filled"])
    ranked["score"] = concreteness_term + frequency_term
    return ranked.sort_values("score", ascending=False).reset_index(drop=True)

abs_ranked, con_ranked = rank_pool(abs_pool, "ABS"), rank_pool(con_pool, "CON")

# Shortlist size: K_MULT times the number of words needed, capped by pool size.
K_MULT = 8
abs_target = max(N_ABS_TOTAL * K_MULT, N_ABS_TOTAL)
con_target = max(N_CON_TOTAL * K_MULT, N_CON_TOTAL)
K_abs = min(len(abs_ranked), abs_target)
K_con = min(len(con_ranked), con_target)

# Keep only the top-ranked candidates of each pool.
abs_pool2 = abs_ranked.head(K_abs).reset_index(drop=True)
con_pool2 = con_ranked.head(K_con).reset_index(drop=True)

abs_pool2.shape, con_pool2.shape


((380, 134), (381, 134))

In [10]:
FEATURES = ["nchar", "syllables", "zipf_filled"]

def prep_features(df_in: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of `df_in` whose ``zipf_filled`` column is (re)computed.

    ``zipf_filled`` is ``zipf`` with missing values replaced by the median of
    ``zipf`` in `df_in`; an existing ``zipf_filled`` column is overwritten.
    The input frame is not modified.
    """
    zipf = df_in["zipf"]
    return df_in.assign(zipf_filled=zipf.fillna(zipf.median()))

# Recompute zipf_filled on both shortlisted pools before matching.
abs_pool2, con_pool2 = prep_features(abs_pool2), prep_features(con_pool2)

def greedy_match(
    abs_df: pd.DataFrame,
    con_df: pd.DataFrame,
    n_pairs: int,
    seed: int,
    features: list[str] | None = None,
):
    """Greedily pair ABS rows with their nearest unused CON row.

    ABS rows are visited in a random order determined by `seed`. Each one is
    paired with the still-available CON row at the smallest Euclidean distance
    in feature space, where every feature is standardised with the mean/SD of
    the ABS and CON rows combined. A CON row is used at most once.

    Parameters
    ----------
    abs_df, con_df : pd.DataFrame
        Candidate rows; both must contain every matching feature column.
    n_pairs : int
        Maximum number of pairs to build.
    seed : int
        Seed of the local generator that shuffles the ABS visiting order.
    features : list[str] | None
        Columns to match on. Defaults to the module-level ``FEATURES``.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        Selected ABS rows and their CON partners, aligned by position. Fewer
        than `n_pairs` rows are returned when either side runs out.
    """
    columns = list(FEATURES if features is None else features)
    rng_local = np.random.default_rng(seed)

    abs_feat = abs_df[columns].to_numpy(dtype=float)
    con_feat = con_df[columns].to_numpy(dtype=float)

    # Standardise on the pooled rows so both sides share one scale.
    all_feat = np.vstack([abs_feat, con_feat])
    mu = all_feat.mean(axis=0)
    sd = all_feat.std(axis=0) + 1e-8

    abs_z = (abs_feat - mu) / sd
    con_z = (con_feat - mu) / sd

    order = rng_local.permutation(len(abs_df))

    chosen_abs = []
    chosen_con = []
    # Boolean availability mask: avoids rebuilding and re-sorting a set of
    # indices on every iteration while keeping candidates in ascending order.
    available = np.ones(len(con_df), dtype=bool)

    for i in order:
        if len(chosen_abs) >= n_pairs or not available.any():
            break
        candidates = np.flatnonzero(available)
        dists = np.linalg.norm(con_z[candidates] - abs_z[i], axis=1)
        j = candidates[int(np.argmin(dists))]   # ties resolve to the lowest index
        chosen_abs.append(i)
        chosen_con.append(j)
        available[j] = False

    return abs_df.iloc[chosen_abs].copy(), con_df.iloc[chosen_con].copy()

def mismatch_penalty(abs_sel: pd.DataFrame, con_sel: pd.DataFrame) -> float:
    """Penalty that grows as the ABS and CON feature distributions diverge.

    For every column in FEATURES a two-sample KS test and a median-centred
    Levene test are run. The penalty is the negative sum of the log p-values
    (each clipped to [1e-12, 1]), so 0 is the best possible value.
    """
    eps = 1e-12
    ks_pvalues = [ks_2samp(abs_sel[name], con_sel[name]).pvalue for name in FEATURES]
    levene_pvalues = [
        levene(abs_sel[name], con_sel[name], center="median").pvalue
        for name in FEATURES
    ]
    ks_log_sum = np.sum(np.log(np.clip(ks_pvalues, eps, 1.0)))
    levene_log_sum = np.sum(np.log(np.clip(levene_pvalues, eps, 1.0)))
    return float(-ks_log_sum - levene_log_sum)

def score_total(abs_sel: pd.DataFrame, con_sel: pd.DataFrame) -> float:
    """Objective minimised by the search: distribution mismatch minus a frequency reward.

    The reward is the sum of the two groups' mean ``zipf_filled`` values, so
    selections made of more frequent words score lower (better).
    """
    FREQ_REWARD_WEIGHT = 1.0
    zipf_reward = float(abs_sel["zipf_filled"].mean() + con_sel["zipf_filled"].mean())
    return mismatch_penalty(abs_sel, con_sel) - FREQ_REWARD_WEIGHT * zipf_reward

def search_best(abs_df: pd.DataFrame, con_df: pd.DataFrame, n_pairs: int, n_tries: int = 800):
    """Random-restart search for the best-scoring matched ABS/CON selection.

    Every try draws a seed from the module-level ``rng``, builds a candidate
    with ``greedy_match`` and scores it with ``score_total`` (lower is better).
    Because the seeds come from the shared generator, the outcome depends on
    how many values were drawn from ``rng`` before this call.

    Parameters
    ----------
    abs_df, con_df : pd.DataFrame
        Candidate pools handed to ``greedy_match``.
    n_pairs : int
        Number of pairs a candidate must contain to be considered.
    n_tries : int
        Number of random restarts.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame, dict)
        The best ABS rows, their CON partners, and a dict with ``score``,
        ``mismatch_penalty`` and ``mean_zipf_total`` of that selection.

    Raises
    ------
    RuntimeError
        If no try produced `n_pairs` pairs with a finite score.
    """
    best_score = None
    best_abs = None
    best_con = None
    meta = None
    for _ in range(n_tries):
        seed = int(rng.integers(0, 1_000_000))
        a_sel, c_sel = greedy_match(abs_df, con_df, n_pairs=n_pairs, seed=seed)
        if len(a_sel) < n_pairs or len(c_sel) < n_pairs:
            continue
        s = score_total(a_sel, c_sel)
        # A NaN score (e.g. from an undefined test p-value) must never become
        # the incumbent: `s < nan` is always False, so a NaN accepted on the
        # first try would lock that candidate in for the rest of the search.
        if not np.isfinite(s):
            continue
        if best_score is None or s < best_score:
            best_score = s
            best_abs, best_con = a_sel, c_sel
            meta = {
                "score": float(s),
                "mismatch_penalty": mismatch_penalty(a_sel, c_sel),
                "mean_zipf_total": float(a_sel["zipf_filled"].mean() + c_sel["zipf_filled"].mean()),
            }
    if best_abs is None:
        raise RuntimeError("No solution found — relax ZIPF_MIN/K_MULT/filters.")
    return best_abs, best_con, meta

# Ask for the full design size, capped by what each shortlist can supply.
pool_sizes = (len(abs_pool2), len(con_pool2))
n_pairs = min(N_ABS_TOTAL, *pool_sizes)
abs_sel_all, con_sel_all, meta = search_best(abs_pool2, con_pool2, n_pairs=n_pairs, n_tries=800)
meta


  res = hypotest_fun_out(*samples, **kwds)


{'score': -7.791430701211294,
 'mismatch_penalty': 0.13604527263502922,
 'mean_zipf_total': 7.927475973846324}

In [None]:
# split to runs

n_needed = min(N_ABS_TOTAL, len(abs_sel_all), len(con_sel_all))

# One shared permutation is applied to both tables, so rows that were paired
# by the matching step stay at the same position after shuffling.
perm = rng.permutation(len(abs_sel_all))
abs_sel_all, con_sel_all = (
    frame.iloc[perm].reset_index(drop=True).head(n_needed)
    for frame in (abs_sel_all, con_sel_all)
)

# Consecutive slices: first chunk -> run 1, second chunk -> run 2.
abs_run1, abs_run2 = (
    abs_sel_all.iloc[start:start + N_ABS_PER_RUN].copy()
    for start in (0, N_ABS_PER_RUN)
)
con_run1, con_run2 = (
    con_sel_all.iloc[start:start + N_CON_PER_RUN].copy()
    for start in (0, N_CON_PER_RUN)
)

abs_run1.shape, abs_run2.shape, con_run1.shape, con_run2.shape


((24, 134), (24, 134), (24, 134), (24, 134))

In [None]:
def make_list(df_in: pd.DataFrame, cond: str, run: int) -> pd.DataFrame:
    """Build the four-column stimulus list for one condition and run.

    Columns: ``word`` (from `df_in`, as string), ``condition`` (`cond`),
    ``run`` (`run`) and ``stimFile`` — an audio path numbered 001, 002, … by
    row position, using the lower-cased condition name. The index of `df_in`
    is kept.
    """
    stim_files = [f"resources/audio/{cond.lower()}_run{run}_{i+1:03d}.wav" for i in range(len(df_in))]
    words = pd.DataFrame({"word": df_in["word"].astype(str)})
    return words.assign(condition=cond, run=run, stimFile=stim_files)

con1 = make_list(con_run1, "CON", 1)
abs1 = make_list(abs_run1, "ABS", 1)
con2 = make_list(con_run2, "CON", 2)
abs2 = make_list(abs_run2, "ABS", 2)


def _baseline_rows(run: int) -> pd.DataFrame:
    """Baseline list for `run`: N_BASE_PER_RUN identical placeholder rows."""
    return pd.DataFrame({
        "word": ["<BASELINE>"] * N_BASE_PER_RUN,
        "condition": ["BASE"] * N_BASE_PER_RUN,
        "run": [run] * N_BASE_PER_RUN,
        "stimFile": [f"resources/audio/base_run{run}.wav"] * N_BASE_PER_RUN,
    })


base1 = _baseline_rows(1)
base2 = _baseline_rows(2)

# Write every list (run 1 first, then run 2); existing files are overwritten.
for filename, table in [
    ("con_run1.csv", con1),
    ("abs_run1.csv", abs1),
    ("base_run1.csv", base1),
    ("con_run2.csv", con2),
    ("abs_run2.csv", abs2),
    ("base_run2.csv", base2),
]:
    table.to_csv(OUT_LISTS / filename, index=False, encoding=CSV_ENCODING)

OUT_LISTS


WindowsPath('c:/Users/kinga/Documents/Blindbrain/4. Courses/fMRI - design of the experiment and data analysis/cognes-auditory-1back-pilot/experiment/resources/lists')