In [1]:
import pandas as pd
import numpy as np
import json
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.contingency_tables import cochrans_q
from scipy.stats import combine_pvalues
from scipy.stats import norm

In [2]:
# The function `read_json_answers` is designed to extract and aggregate 
# structured responses generated by a Large Language Model (LLM) from a JSON file. 
# Specifically, it reads the file located at `file_path`, parses its contents, 
# and retrieves all values associated with the key `answer` from objects within a list structure.

def read_json_answers(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)

    answers = []
    if isinstance(data, list):
        for obj in data:
            if isinstance(obj, dict) and "answer" in obj:
                answers.append(obj["answer"])
    return answers

# The function `read_unstructured_parsed` serves a similar purpose to 
# `read_json_answers` but operates on unstructured outputs that have been 
# post-processed into a structured format. It reads a JSON file 
# containing a list of dictionaries and extracts values associated with the key `parsed`.

def read_unstructured_parsed(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)

    answers = []
    if isinstance(data, list):
        for obj in data:
            if isinstance(obj, dict) and "parsed" in obj:
                answers.append(obj["parsed"])
    return answers

In [3]:
# Extract the list recording answers from structured output of LLM
# These are read from files located in the `sot_results` directory, with each 
# file corresponding to a specific persona or role (standard, artist, chef, 
# detective, judge). The extracted lists enable comparative evaluation of instruction intervention 
# conditioning influences LLM behavior in structured contexts.
json_standard_answers = read_json_answers("sot_results/seven_objects_json.json")
json_artist_answers = read_json_answers("sot_results/seven_objects_json_artist.json")
json_chef_answers = read_json_answers("sot_results/seven_objects_json_chef.json")
json_detective_answers = read_json_answers("sot_results/seven_objects_json_detective.json")
json_judge_answers = read_json_answers("sot_results/seven_objects_json_judge.json")

# Extract the list recording answers from unstructured output of LLM
unstructured_standard_answers = read_unstructured_parsed("sot_results/seven_objects_unstructured_parsed.json")
unstructured_artist_answers = read_unstructured_parsed("sot_results/seven_objects_unstructured_artist_parsed.json")
unstructured_chef_answers = read_unstructured_parsed("sot_results/seven_objects_unstructured_chef_parsed.json")
unstructured_detective_answers = read_unstructured_parsed("sot_results/seven_objects_unstructured_detective_parsed.json")
unstructured_judge_answers = read_unstructured_parsed("sot_results/seven_objects_unstructured_judge_parsed.json")

In [4]:
# This block computes binary correctness indicators for each LLM-generated response 
# by comparing it against the ground truth labels provided in the dataset. The evaluation 
# metric is based on exact match between the predicted answer and the reference label.
# The ground truth labels are loaded from `data/seven_objects.json`. 
# Each comparison yields a binary outcome: `1` if the predicted answer contains the 
# ground truth label (indicating correctness), and `0` otherwise. These results are 
# stored in separate lists for each persona (standard, artist, chef, detective, judge) 
# and for each output type (structured vs. unstructured).

with open("data/seven_objects.json", "r") as f:
    label = json.load(f)
unstructured_standard_results = []
unstructured_artist_results = []
unstructured_chef_results = []
unstructured_detective_results = []
unstructured_judge_results = []

json_standard_results = []
json_artist_results = []
json_chef_results = []
json_detective_results = []
json_judge_results = []

for i in range(len(unstructured_standard_answers)):
    label_tmp = next((k for k, v in label["examples"][i]['target_scores'].items() if v == 1), None)
    label_tmp = label_tmp.replace(" ball.", "")
    # Unstructured answers
    if label_tmp.lower() in unstructured_standard_answers[i].lower():
        unstructured_standard_results.append(1)
    else:
        unstructured_standard_results.append(0)
        
    if label_tmp.lower() in unstructured_artist_answers[i].lower():
        unstructured_artist_results.append(1)
    else:
        unstructured_artist_results.append(0)
        
    if label_tmp.lower() in unstructured_chef_answers[i].lower():
        unstructured_chef_results.append(1)
    else:
        unstructured_chef_results.append(0)
        
    if label_tmp.lower() in unstructured_detective_answers[i].lower():
        unstructured_detective_results.append(1)
    else:
        unstructured_detective_results.append(0)
        
    if label_tmp.lower() in unstructured_judge_answers[i].lower():
        unstructured_judge_results.append(1)
    else:
        unstructured_judge_results.append(0)
    
    # Structured answers
    if label_tmp.lower() in json_standard_answers[i].lower():
        json_standard_results.append(1)
    else:
        json_standard_results.append(0)   
        
    if label_tmp.lower() in json_artist_answers[i].lower():
        json_artist_results.append(1)
    else:
        json_artist_results.append(0) 
        
    if label_tmp.lower() in json_chef_answers[i].lower():
        json_chef_results.append(1)
    else:
        json_chef_results.append(0) 
        
    if label_tmp.lower() in json_detective_answers[i].lower():
        json_detective_results.append(1)
    else:
        json_detective_results.append(0) 
        
    if label_tmp.lower() in json_judge_answers[i].lower():
        json_judge_results.append(1)
    else:
        json_judge_results.append(0) 

In [5]:
# This section evaluates whether the output format (structured JSON vs. unstructured text) 
# significantly influences the accuracy of LLM-generated responses under diverse instructional conditions.
# Compute and display the averaged accuracy for each output 
# format across all persona-based instructions (standard, artist, chef, detective, judge). 
print(f"Averaged accuracy of unstructured format with diverse instruction: {format((sum(unstructured_standard_results)+sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results))/(5*len(unstructured_standard_results)),'.3f')}")
print(f"Averaged accuracy of JSON format with diverse instruction: {format((sum(json_standard_results)+sum(json_artist_results)+sum(json_chef_results)+sum(json_detective_results)+sum(json_judge_results))/(5*len(json_standard_results)),'.3f')}")

# To formally test whether the observed differences are statistically significant, 
# McNemar's test is applied within each stratum (persona). McNemar's test is appropriate 
# for paired nominal data and evaluates whether the marginal proportions differ between 
# two correlated binary outcomes (here, correctness under JSON vs. unstructured format).
data_subsets = [
    # Stratum of control on standard instruction
    (np.array(json_standard_results), np.array(unstructured_standard_results)),
    # Stratum of control on artist instruction
    (np.array(json_artist_results), np.array(unstructured_artist_results)),
    # Stratum of control on chef instruction
    (np.array(json_chef_results), np.array(unstructured_chef_results)),
    # Stratum of control on detective instruction
    (np.array(json_detective_results), np.array(unstructured_detective_results)),
    # Stratum of control on judge instruction
    (np.array(json_judge_results), np.array(unstructured_judge_results)),
]

p_values = []

for i, (correct_D, correct_E) in enumerate(data_subsets, 1):
    # Build contingency table for this stratum
    table = np.zeros((2, 2), dtype=int)
    for d, e in zip(correct_D, correct_E):
        table[d, e] += 1

    # Run McNemar's test
    result = mcnemar(table, exact=True)
    p_values.append(result.pvalue)

# To aggregate evidence across multiple strata (personas), Stouffer’s method is employed. 
# This technique combines p-values by converting them into Z-scores and computing a weighted 
# sum, adjusted for correlation among strata (due to shared samples). The correlation-adjusted 
# Stouffer Z statistic accounts for potential dependence between tests, yielding a combined 
# p-value that reflects the overall significance of output format effects.
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# Report combined significance level
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured format with diverse instruction: 0.992
Averaged accuracy of JSON format with diverse instruction: 0.965
Combined p-value across strata (Stouffer’s method): 0.030


In [6]:
# This section investigates whether variations in instructional framing (standard vs. 
# persona-based interventions) significantly influence the accuracy of LLM-generated 
# responses across both output formats (structured JSON and unstructured text).
print(f"Averaged accuracy of unstructured & JSON format with standard instruction: {format((sum(unstructured_standard_results)+sum(json_standard_results))/(2*len(unstructured_standard_results)),'.3f')}")
print(f"Averaged accuracy of unstructured & JSON format with intervened instruction: {format((sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results)+sum(json_artist_results)+sum(json_chef_results)+sum(json_detective_results)+sum(json_judge_results))/(8*len(json_standard_results)),'.3f')}")

stratum_json = np.array([json_standard_results, json_artist_results, json_chef_results, json_detective_results, json_judge_results]).transpose()
stratum_unstructured = np.array([unstructured_standard_results, unstructured_artist_results, unstructured_chef_results, unstructured_detective_results, unstructured_judge_results]).transpose()
data_strata = [stratum_json, stratum_unstructured]

# Run Cochran's Q test to assess whether the differences in accuracy across 
# multiple instructional conditions are statistically significant within each output format
# Two strata are considered:
# Stratum 1: Structured JSON outputs across all instructions.
# Stratum 2: Unstructured outputs across all instructions.
# Apply Cochran's Q test to each stratum and collect p-values

p_values = []

for i, data in enumerate(data_strata):
    result = cochrans_q(data)
    p_values.append(result.pvalue)
    
# To aggregate evidence across multiple strata (personas), Stouffer’s method is employed. 
# This technique combines p-values by converting them into Z-scores and computing a weighted 
# sum, adjusted for correlation among strata (due to shared samples). The correlation-adjusted 
# Stouffer Z statistic accounts for potential dependence between tests, yielding a combined 
# p-value that reflects the overall significance of output format effects.
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# Report combined significance level
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured & JSON format with standard instruction: 0.990
Averaged accuracy of unstructured & JSON format with intervened instruction: 0.976
Combined p-value across strata (Stouffer’s method): 0.019


In [7]:
# This section constructs a dataset for modeling the conditional association between 
# output format (structured vs. unstructured) and instruction type (standard vs. persona-based) on LLM generation quality. 
# The analysis is framed within a mixed-effects multinomial logistic regression.

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM
# Constructs a pandas DataFrame with three key variables:
# `Z` (Instruction Type): Encoded as categorical labels ('a' = standard, 'b' = artist, 
# 'c' = chef, 'd' = detective, 'e' = judge). This variable captures the instructional framing applied to the LLM.
# `X` (Output Format): Binary indicator where `1` denotes structured JSON output and `0` denotes unstructured text output.
# `Y` (Generation Quality): Binary outcome where `1` indicates correctness (answer matches ground truth) and `0` indicates incorrectness.

df = pd.DataFrame({
    'Z': np.concatenate([np.repeat('a', len(json_standard_results)),
                         np.repeat('b', len(json_artist_results)),
                         np.repeat('c', len(json_chef_results)),
                         np.repeat('d', len(json_detective_results)),
                         np.repeat('e', len(json_judge_results)),
                         # Unstructured
                         np.repeat('a', len(unstructured_standard_results)),
                         np.repeat('b', len(unstructured_artist_results)),
                         np.repeat('c', len(unstructured_chef_results)),
                         np.repeat('d', len(unstructured_detective_results)),
                         np.repeat('e', len(unstructured_judge_results)),
                        ]),
    'X': np.concatenate([np.repeat(1, len(json_standard_results)),
                         np.repeat(1, len(json_artist_results)),
                         np.repeat(1, len(json_chef_results)),
                         np.repeat(1, len(json_detective_results)),
                         np.repeat(1, len(json_judge_results)),
                         # Unstructured
                         np.repeat(0, len(unstructured_standard_results)),
                         np.repeat(0, len(unstructured_artist_results)),
                         np.repeat(0, len(unstructured_chef_results)),
                         np.repeat(0, len(unstructured_detective_results)),
                         np.repeat(0, len(unstructured_judge_results)),
                        ]),
    'Y': np.concatenate([np.repeat(0, len(np.where(np.array(json_standard_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(json_standard_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(json_artist_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(json_artist_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(json_chef_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(json_chef_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(json_detective_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(json_detective_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(json_judge_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(json_judge_results)==1)[0])),
                         # Unstructured
                         np.repeat(0, len(np.where(np.array(unstructured_standard_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_standard_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_artist_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_artist_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_chef_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_chef_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_detective_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_detective_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_judge_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_judge_results)==1)[0])),
                        ])
})

In [8]:
# Prepare design matrices for mixed-effects multinomial logistic regression
df_dummy = pd.get_dummies(df, columns=['Z'], drop_first=True)
# Define model components
endog = df_dummy['X']
exog = sm.add_constant(df_dummy[[col for col in df_dummy.columns if col.startswith('Z_')]])  # fixed effects
groups = df_dummy['Y']
exog_re = pd.get_dummies(groups)  # random intercepts per stratum
ident = np.ones(exog_re.shape[1], dtype=int)
# Fit mixed-effects multinomial logistic regression
model = BinomialBayesMixedGLM(endog, exog, exog_re, ident=ident)
result = model.fit_vb()
# Display posterior summaries for fixed and random effects
print(result.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M     0.6237   0.0447                      
Z_b      M    -0.0159   0.1000                      
Z_c      M    -0.0396   0.1003                      
Z_d      M     0.0017   0.0997                      
Z_e      M    -0.0071   0.0999                      
VC_1     V     0.0000   1.0000 1.000   0.135   7.389
VC_2     V    -0.0967   0.4561 0.908   0.365   2.260
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [9]:
# Extract coefficient names, values, and std errors
# names of fixed effects (same order as fe_mean)
param_names = model.exog_names  
# posterior means of fixed effects
coefs = result.fe_mean          
# posterior standard deviations
ses = result.fe_sd              

# Compute z-scores and p-values
z_scores = coefs / ses
# two-tailed p-values
p_values = 2 * (1 - norm.cdf(np.abs(z_scores)))  

# Display formatted summary for each fixed effect
# We use the most significant p value as the representative value to suprious correlation
for name, coef, se, z, p in zip(param_names, coefs, ses, z_scores, p_values):
    print(f"{name:20} coef = {coef: .4f}, SE = {se:.4f}, z = {z:.2f}, p = {p:.4f}")

const                coef =  0.6237, SE = 0.0447, z = 13.95, p = 0.0000
Z_b                  coef = -0.0159, SE = 0.1000, z = -0.16, p = 0.8740
Z_c                  coef = -0.0396, SE = 0.1003, z = -0.39, p = 0.6933
Z_d                  coef =  0.0017, SE = 0.0997, z = 0.02, p = 0.9863
Z_e                  coef = -0.0071, SE = 0.0999, z = -0.07, p = 0.9437
