# Script Description

This script takes two input dataframes and produces a JSON file that serves as the item table for constructing trials in the listener-side experiment.

## Output Format

The resulting JSON contains the following fields:

-	Q1 – the first quantifier used in the utterance (e.g., "some", "most", "none").
-	Q2 – the second quantifier (e.g., "some", "all").
-	A – the adverbial phrase describing the correctness of answers ("right" or "wrong").

These three fields (Q1, Q2, A) are used to construct the linguistic stimulus, e.g.:
“some of the students got some of the answers wrong.”

-	observation – an array of 0s and 1s representing the correctness pattern in the exam.
This is used to generate the state table shown to participants as the “true” results of the exam. They are of the shape e.g. [0, 0, 0, 0, 0].
-	condition – the ground-truth speaker type for that utterance–state pair.
This specifies which argumentative strategy is correct according to the model and has three levels:
	-	"high" – high-argumentative speaker
	-	"low" – low-argumentative speaker
	-	"info" – information-oriented (neutral) speaker

The JSON output is directly usable as the item specification for trial generation in the listener experiment.

In [45]:
import json
import pandas as pd
import numpy as np
from pathlib import Path

# -----------------------------
# File paths
# -----------------------------
file_high = Path("./analysis_values/most_discriminative_stimuli_for_listener_exp_high_fulldata.json")
file_low  = Path("./analysis_values/all_stimuli_discriminative_values_for_listener_exp_low_fulldata.json")
file_combined = Path("./analysis_values/most_discriminative_stimuli_for_listener_exp_fulldata.json")
# -----------------------------

In [51]:
df= pd.DataFrame(json.load(open(file_combined)))

df['delta_log_score_high'] = df['log_score_high'] - df['log_score_info']
df["delta_log_score_low"]  = df['log_score_low'] - df['log_score_info']
df['delta_log_score'] = np.max(df[["delta_log_score_low",'delta_log_score_high']], axis = 1)

# Show smallest 10 items by delta_log_score_high
df[df['model'] == 'lr'].sort_values(by="delta_log_score", ascending=True).head(10)


Unnamed: 0,model,Q,A1,A2,observation,log_factor_high_vs_info,log_factor_low_vs_info,factor_ci_high_vs_info_low,factor_ci_high_vs_info_high,factor_ci_low_vs_info_low,factor_ci_low_vs_info_high,extremeness_high_vs_info,extremness_low_vs_info,log_score_high,log_score_info,log_score_low,delta_log_score_high,delta_log_score_low,delta_log_score
220,lr,most,all,right,"[12, 12, 12, 12, 12]",-1.243407,-0.909367,-1.387901,-1.11376,-1.035499,-0.786316,1.243407,0.909367,-3.520581,-2.276721,-3.186586,-1.24386,-0.909864,-0.909864
227,lr,most,all,wrong,"[0, 0, 0, 0, 0]",-0.909367,-1.243407,-1.035499,-0.786316,-1.387901,-1.11376,0.909367,1.243407,-3.186586,-2.276721,-3.520581,-0.909864,-1.24386,-0.909864
123,lr,some,most,right,"[12, 12, 12, 0, 0]",-0.907814,-1.132813,-1.037945,-0.789261,-1.246167,-1.026767,0.907814,1.132813,-3.634773,-2.726411,-3.859788,-0.908362,-1.133377,-0.908362
126,lr,some,most,wrong,"[12, 12, 0, 0, 0]",-1.132813,-0.907814,-1.246167,-1.026767,-1.037945,-0.789261,1.132813,0.907814,-3.859788,-2.726411,-3.634773,-1.133377,-0.908362,-0.908362
163,lr,most,none,right,"[0, 0, 0, 0, 0]",-0.886818,-1.218598,-1.119578,-0.665993,-1.487261,-0.96941,0.886818,1.218598,-5.370795,-4.475174,-5.702453,-0.895621,-1.227279,-0.895621
164,lr,most,none,wrong,"[12, 12, 12, 12, 12]",-1.218598,-0.886818,-1.487261,-0.96941,-1.119578,-0.665993,1.218598,0.886818,-5.702453,-4.475174,-5.370795,-1.227279,-0.895621,-0.895621
193,lr,most,some,wrong,"[12, 12, 3, 0, 0]",-0.562403,-0.807705,-0.626282,-0.501636,-0.876219,-0.742458,0.562403,0.807705,-2.968186,-2.405729,-3.213487,-0.562456,-0.807758,-0.562456
172,lr,most,some,right,"[12, 12, 9, 0, 0]",-0.807705,-0.562403,-0.876219,-0.742458,-0.626282,-0.501636,0.807705,0.562403,-3.213487,-2.405729,-2.968186,-0.807758,-0.562456,-0.562456
75,lr,some,some,right,"[12, 12, 9, 0, 0]",-0.797884,-0.526273,-0.879167,-0.723652,-0.601495,-0.457274,0.797884,0.526273,-3.394879,-2.59679,-3.12327,-0.79809,-0.52648,-0.52648
99,lr,some,some,wrong,"[12, 12, 3, 0, 0]",-0.526273,-0.797884,-0.601495,-0.457274,-0.879167,-0.723652,0.526273,0.797884,-3.12327,-2.59679,-3.394879,-0.52648,-0.79809,-0.52648


In [44]:
df_high = pd.DataFrame(json.load(open(file_high)))
df_low  = pd.DataFrame(json.load(open(file_low)))

df_high = df_high.rename(columns={"log_score_high": "log_score"})
df_low  = df_low.rename(columns={"log_score_low": "log_score"})

df_high["observation"] = df_high["observation"].apply(tuple)
df_low["observation"]  = df_low["observation"].apply(tuple)

# columns that identify the same item in both dfs
keys = ["model", "Q", "A1", "A2", "observation"]

df = (
    df_high
      .merge(
          df_low,
          on=keys,
          how="inner",
          suffixes=("_high", "_low")
      )
)

df['delta_log_score_high'] = df['log_score_high'] - df['log_score_info_high']
df["delta_log_score_low"]  = df['log_score_low'] - df['log_score_info_low']
df['delta_log_score'] = np.max(df[["delta_log_score_low",'delta_log_score_high']], axis = 1)

# Show smallest 10 items by delta_log_score_high
df[df['model'] == 'lr'].sort_values(by="delta_log_score", ascending=True).head(10)


Unnamed: 0,model,Q,A1,A2,observation,factor_high,factor_ci_low_high,factor_ci_high_high,extremeness_high,log_score_high,log_score_info_high,factor_low,factor_ci_low_low,factor_ci_high_low,extremeness_low,log_score_low,log_score_info_low,delta_log_score_high,delta_log_score_low,delta_log_score
123,lr,some,most,right,"(12, 12, 12, 0, 0)",-0.911694,-1.042829,-0.788414,0.911694,-3.638494,-2.72624,-1.133233,-1.25004,-1.023701,1.133233,-3.861203,-2.727436,-0.912254,-1.133766,-0.912254
220,lr,most,all,right,"(12, 12, 12, 12, 12)",-1.247776,-1.391088,-1.113944,1.247776,-3.524814,-2.276577,-0.908977,-1.039622,-0.790458,0.908977,-3.187149,-2.277714,-1.248238,-0.909435,-0.909435
126,lr,some,most,wrong,"(12, 12, 0, 0, 0)",-1.133863,-1.251797,-1.022958,1.133863,-3.860646,-2.72624,-0.908353,-1.044248,-0.78201,0.908353,-3.636317,-2.727436,-1.134406,-0.90888,-0.90888
227,lr,most,all,wrong,"(0, 0, 0, 0, 0)",-0.906197,-1.032211,-0.789415,0.906197,-3.183218,-2.276577,-1.244115,-1.392804,-1.10414,1.244115,-3.522266,-2.277714,-0.906642,-1.244552,-0.906642
164,lr,most,none,wrong,"(12, 12, 12, 12, 12)",-1.223402,-1.500677,-0.977393,1.223402,-5.706483,-4.474725,-0.887207,-1.128657,-0.675058,0.887207,-5.371834,-4.476194,-1.231757,-0.89564,-0.89564
163,lr,most,none,right,"(0, 0, 0, 0, 0)",-0.884409,-1.119239,-0.675166,0.884409,-5.367349,-4.474725,-1.219897,-1.498295,-0.970868,1.219897,-5.7044,-4.476194,-0.892624,-1.228206,-0.892624
193,lr,most,some,wrong,"(12, 12, 3, 0, 0)",-0.563343,-0.628792,-0.499526,0.563343,-2.969072,-2.405677,-0.808346,-0.880365,-0.739355,0.808346,-3.214462,-2.406066,-0.563394,-0.808395,-0.563394
172,lr,most,some,right,"(12, 12, 9, 0, 0)",-0.808783,-0.878959,-0.739699,0.808783,-3.214511,-2.405677,-0.56299,-0.629995,-0.498201,0.56299,-2.969106,-2.406066,-0.808834,-0.56304,-0.56304
99,lr,some,some,wrong,"(12, 12, 3, 0, 0)",-0.527852,-0.604326,-0.454257,0.527852,-3.124743,-2.596686,-0.798355,-0.881973,-0.716583,0.798355,-3.395965,-2.597414,-0.528057,-0.79855,-0.528057
75,lr,some,some,right,"(12, 12, 9, 0, 0)",-0.799597,-0.882575,-0.719971,0.799597,-3.396487,-2.596686,-0.526679,-0.602967,-0.452043,0.526679,-3.12429,-2.597414,-0.799801,-0.526876,-0.526876


In [38]:
print(df_low.columns)
print(df_high.columns)


Index(['model', 'Q', 'A1', 'A2', 'observation', 'factor', 'factor_ci_low',
       'factor_ci_high', 'extremeness', 'log_score', 'log_score_info'],
      dtype='object')
Index(['model', 'Q', 'A1', 'A2', 'observation', 'factor', 'factor_ci_low',
       'factor_ci_high', 'extremeness', 'log_score', 'log_score_info'],
      dtype='object')


In [41]:
np.allclose(df['log_score_info_high'], df['log_score_info_low'])
np.max(np.exp(df['log_score_info_high']) - np.exp(df['log_score_info_low']))

0.00011670730867459345

In [28]:
df.sort_values(by="delta_log_score_low").head(10)

Unnamed: 0,model,Q,A1,A2,observation,factor_high,factor_ci_low_high,factor_ci_high_high,extremeness_high,log_score_high,log_score_info_high,factor_low,factor_ci_low_low,factor_ci_high_low,extremeness_low,log_score_low,log_score_info_low,delta_log_score_high,delta_log_score_low
494,maximin,all,some,right,"(3, 3, 3, 3, 3)",1.176568,1.131644,1.220193,1.176568,-1.051559,-2.22809,-4.017799,-4.261307,-3.799202,4.017799,-6.246164,-2.228343,1.176531,-4.017821
247,lr,all,some,wrong,"(0, 0, 0, 0, 0)",2.007818,1.903498,2.112927,2.007818,-1.289632,-3.293937,-3.907016,-4.238042,-3.610377,3.907016,-7.207103,-3.296904,2.004306,-3.910199
17,lr,none,none,wrong,"(0, 0, 0, 0, 0)",2.030937,1.792733,2.260496,2.030937,-3.474332,-5.490286,-3.877657,-4.331428,-3.499041,3.877657,-9.385821,-5.493494,2.015954,-3.892327
37,lr,none,all,right,"(0, 0, 0, 0, 0)",2.030937,1.792733,2.260496,2.030937,-3.474332,-5.490286,-3.877657,-4.331428,-3.499041,3.877657,-9.385821,-5.493494,2.015954,-3.892327
754,nonparametric,all,some,right,"(3, 3, 3, 3, 3)",1.524061,1.478782,1.568301,1.524061,-0.704068,-2.22809,-3.891836,-4.116391,-3.684835,3.891836,-6.120224,-2.228343,1.524022,-3.891881
197,lr,most,some,wrong,"(0, 0, 0, 0, 0)",2.093342,1.953385,2.228985,2.093342,-1.728109,-3.8152,-3.863495,-4.237373,-3.533253,3.863495,-7.688345,-3.819112,2.087091,-3.869233
1027,prag,all,some,wrong,"(0, 0, 0, 0, 0)",2.007143,1.89934,2.109752,2.007143,-1.290289,-3.293937,-3.864987,-4.183957,-3.579915,3.864987,-7.165271,-3.296904,2.003648,-3.868367
304,maximin,none,all,wrong,"(3, 3, 3, 3, 3)",1.335662,1.156001,1.505607,1.335662,-3.097889,-4.426694,-3.853444,-4.204236,-3.55639,3.853444,-8.287883,-4.42731,1.328805,-3.860573
264,maximin,none,none,right,"(3, 3, 3, 3, 3)",1.335662,1.156001,1.505607,1.335662,-3.097889,-4.426694,-3.853444,-4.204236,-3.55639,3.853444,-8.287883,-4.42731,1.328805,-3.860573
103,lr,some,some,wrong,"(0, 0, 0, 0, 0)",2.129357,1.973233,2.278919,2.129357,-1.883789,-4.005693,-3.851836,-4.242854,-3.510383,3.851836,-7.868652,-4.009953,2.121905,-3.858699


In [24]:

def build_listener_items(
    file_high: str,
    file_low: str,
    n_per_condition: int = 12,
) -> pd.DataFrame:
    """
    From high/low full model outputs, construct 10 high-arg, 10 low-arg,
    and 10 info items.

    Steps:
    0. Load and tag with 'condition' (high / low).
    1. Average factors across models (prag, lr, maximin, nonparametric).
    2. For 'high': take n_per_condition highest averaged factors.
    3. For 'low' : take n_per_condition highest averaged factors.
    4. Merge high+low, compute average across conditions, and take
       n_per_condition lowest items as 'info'.
    5. Return a DataFrame with columns:
       Q1, Q2, A, observation, condition.
    """

    # ---------- Load ----------
    df_high = pd.DataFrame(json.load(open(file_high)))
    df_low  = pd.DataFrame(json.load(open(file_low)))

    df_high["condition"] = "high"
    df_low["condition"]  = "low"

    # ---------- Make observation hashable ----------
    df_high["obs_tuple"] = df_high["observation"].apply(lambda x: tuple(x))
    df_low["obs_tuple"]  = df_low["observation"].apply(lambda x: tuple(x))

    group_cols = ["Q", "A1", "A2", "obs_tuple"]

    # ---------- 1. Average across models for each condition ----------
    high_avg = (
        df_high.groupby(group_cols)
               .agg(factor_high=("factor", "mean"))
               .reset_index()
    )

    low_avg = (
        df_low.groupby(group_cols)
              .agg(factor_low=("factor", "mean"))
              .reset_index()
    )

    # ---------- 2. Select high-arg items ----------
    top_high = (
        high_avg.sort_values("factor_high", ascending=False)
                .head(n_per_condition)
                .copy()
    )
    top_high["condition"] = "high"

    # ---------- 3. Select low-arg items ----------
    top_low = (
        low_avg.sort_values("factor_low", ascending=False)
               .head(n_per_condition)
               .copy()
    )
    top_low["condition"] = "low"

    # ---------- 4. Select info items (average across high+low) ----------
    merged = pd.merge(high_avg, low_avg, on=group_cols, how="inner")

    merged["avg_factor"] = (merged["factor_high"] + merged["factor_low"]) / 2.0

    info_items = (
        merged.sort_values("avg_factor", ascending=True)
              .head(n_per_condition)
              .copy()
    )
    info_items["condition"] = "info"

    # ---------- 5. Harmonize & rename columns ----------
    def tidy(df, factor_col_name):
        # reconstruct observation as list
        df["observation"] = df["obs_tuple"].apply(list)
        # rename Q, A1, A2 -> Q1, Q2, A
        df = df.rename(columns={"Q": "Q1", "A1": "Q2", "A2": "A"})
        # keep only what we need (+ factor if you want to inspect)
        keep_cols = ["Q1", "Q2", "A", "observation", "condition"]
        if factor_col_name in df:
            keep_cols.append(factor_col_name)
        return df[keep_cols]

    top_high_tidy = tidy(top_high, "factor_high")
    top_low_tidy  = tidy(top_low, "factor_low")
    info_tidy     = tidy(info_items, "avg_factor")

    # ---------- Combine all three sets ----------
    items = pd.concat([top_high_tidy, top_low_tidy, info_tidy],
                      ignore_index=True)

    return items


# Example usage:
# items_df = build_listener_items(
#     "most_discriminative_stimuli_for_listener_exp_high_fulldata.json",
#     "most_discriminative_stimuli_for_listener_exp_low_fulldata.json",
#     n_per_condition=10,
# )
# print(items_df)
# items_df.to_json("listener_items.json", orient="records", indent=2)

In [34]:
info_items = build_listener_items(file_high, file_low, n_per_condition=12)
print(info_items.to_string(index=True))

# Rows to modify
# observation in row 26 change from [12, 12, 0, 0, 0] to [12, 9, 9, 9, 9] + most some wrong
info_items.at[26, 'observation'] = [12, 9, 9, 9, 9]

# observation in row 27 change from [12, 12, 12, 0, 0] to [3, 3, 3, 3, 0] + most some right
info_items.at[27, 'observation'] = [3, 3, 3, 3, 0]

# observation in row 34 change from [12, 12, 12, 0, 0] to [12, 12, 0, 0, 0] + some all right
info_items.at[34, 'observation'] = [12, 12, 0, 0, 0]
info_items.at[34, 'Q2'] = "all"

# observation in row 35 change from [12, 12, 12, 0, 0]  + some all wrong
info_items.at[35, 'Q2'] = "all"


# Rows to exclude
row_to_exclude = [6, # high
                  8, # high
                17, #low
                20, # low
                31, # info
                33, # info#
                ]
info_items = info_items.drop(row_to_exclude).reset_index(drop=True)
# Rearrange according to condition
info_items = info_items.sort_values(by=['condition']).reset_index(drop=True)
print("\nAfter manual edits:\n")
print(info_items.to_string(index=True))

# Save to csv file
info_items.to_csv("../experiments/listener_side/items/final_listener_items.csv", index=False)


      Q1    Q2      A           observation condition  factor_high  factor_low  avg_factor
0   some  some  wrong       [0, 0, 0, 0, 0]      high     2.090269         NaN         NaN
1   most  some  wrong       [0, 0, 0, 0, 0]      high     1.906597         NaN         NaN
2   some  most  right       [9, 9, 0, 0, 0]      high     1.739603         NaN         NaN
3   some  none  wrong     [12, 12, 0, 0, 0]      high     1.676246         NaN         NaN
4   some   all  right     [12, 12, 0, 0, 0]      high     1.596101         NaN         NaN
5   none   all  right       [0, 0, 0, 0, 0]      high     1.591761         NaN         NaN
6   none  none  wrong       [0, 0, 0, 0, 0]      high     1.591761         NaN         NaN
7    all  some  wrong       [0, 0, 0, 0, 0]      high     1.509167         NaN         NaN
8   some  some  right       [3, 3, 0, 0, 0]      high     1.435395         NaN         NaN
9   some  most  wrong       [0, 0, 0, 0, 0]      high     1.397498         NaN         NaN