In [1]:
import pandas as pd
import numpy as np
import json
import re
import xml.etree.ElementTree as ET
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.contingency_tables import cochrans_q
from scipy.stats import combine_pvalues
from scipy.stats import norm

In [2]:
def read_xml_answers(file_path, positions):
    with open(file_path, "r") as f:
        data = json.load(f)
    data = [data[i] for i in positions]
    answers = []
    for obj in data:
        obj = obj.replace("```", "")
        obj = re.sub(r'[\x00-\x1f\x7f]', '', obj)
        obj = "<root>" + obj.split("<root>")[-1]
        obj = ET.fromstring(obj)
        answer = obj.find("answer")
        number = str(answer.text).replace(',', '')
        number = re.findall(r'\d+\.\d+|\d+', number)
        if len(number) != 0:
            answers.append(number[-1])
        else:
            answers.append('10086')
    return answers

def read_unstructured_parsed(file_path, positions):
    with open(file_path, "r") as f:
        data = json.load(f)
    data = [data[i] for i in positions]
    answers = []
    if isinstance(data, list):
        for obj in data:
            if isinstance(obj, dict) and "parsed" in obj:
                number = str(obj["parsed"]).replace(',', '')
                number = re.findall(r'\d+\.\d+|\d+', number)
                if len(number) != 0:
                    answers.append(number[0])
                else:
                    answers.append('10086')
    return answers

In [3]:
def validate_xml_structure(xml_object):
    try:
        # Parse the XML string
        root = ET.fromstring(xml_object)
        
        # Check if the root tag is 'root'
        if root.tag != 'root':
            return False

        # Check for <reasoning> and <answer> children
        reasoning = root.find('reasoning')
        answer = root.find('answer')

        if reasoning is None:
            return False
        if answer is None:
            return False

        return True

    except ET.ParseError as e:
        return False
        
def get_response_id(path):
    with open(path) as json_data:
        data_pred = json.load(json_data)
    response_id = []
    for i in range(200):
        output = data_pred[i]
        output = output.replace("```", "")
        output = re.sub(r'[\x00-\x1f\x7f]', '', output)
        output = "<root>" + output.split("<root>")[-1]
        if validate_xml_structure(output):
            response_id.append(1)
        else:
            response_id.append(0)
    return response_id

def common_one_positions(*lists):
    if not lists:
        return []
    
    # Use zip to group elements by position, then check if all are 1
    return [i for i, values in enumerate(zip(*lists)) if all(v == 1 for v in values)]

In [4]:
response_id_standard = get_response_id('results/llama_8b_xml.json')
response_id_artist = get_response_id('results/llama_8b_xml_artist.json')
response_id_chef = get_response_id('results/llama_8b_xml_chef.json')
response_id_detective = get_response_id('results/llama_8b_xml_detective.json')
response_id_judge = get_response_id('results/llama_8b_xml_judge.json')
positions = common_one_positions(response_id_standard, response_id_artist, response_id_chef, response_id_detective, response_id_judge)

In [5]:
# Structured answers
xml_standard_answers = read_xml_answers("results/llama_8b_xml.json", positions)
xml_artist_answers = read_xml_answers("results/llama_8b_xml_artist.json", positions)
xml_chef_answers = read_xml_answers("results/llama_8b_xml_chef.json", positions)
xml_detective_answers = read_xml_answers("results/llama_8b_xml_detective.json", positions)
xml_judge_answers = read_xml_answers("results/llama_8b_xml_judge.json", positions)

In [6]:
# Unstructured answers
unstructured_standard_answers = read_unstructured_parsed("results/llama_8b_unstructured_parsed.json", positions)
unstructured_artist_answers = read_unstructured_parsed("results/llama_8b_unstructured_artist_parsed.json", positions)
unstructured_chef_answers = read_unstructured_parsed("results/llama_8b_unstructured_chef_parsed.json", positions)
unstructured_detective_answers = read_unstructured_parsed("results/llama_8b_unstructured_detective_parsed.json", positions)
unstructured_judge_answers = read_unstructured_parsed("results/llama_8b_unstructured_judge_parsed.json", positions)

In [7]:
with open("../data/gsm8k_test.json", "r") as f:
    label = json.load(f)
label = [label[i] for i in positions]

unstructured_standard_results = []
unstructured_artist_results = []
unstructured_chef_results = []
unstructured_detective_results = []
unstructured_judge_results = []

xml_standard_results = []
xml_artist_results = []
xml_chef_results = []
xml_detective_results = []
xml_judge_results = []

for i in range(len(unstructured_standard_answers)):
    label_tmp = float(label[i]['answer'].split("\n#### ")[-1].replace(',', ''))
    # Unstructured answers
    if label_tmp == float(unstructured_standard_answers[i]):
        unstructured_standard_results.append(1)
    else:
        unstructured_standard_results.append(0)
        
    if label_tmp == float(unstructured_artist_answers[i]):
        unstructured_artist_results.append(1)
    else:
        unstructured_artist_results.append(0)
        
    if label_tmp == float(unstructured_chef_answers[i]):
        unstructured_chef_results.append(1)
    else:
        unstructured_chef_results.append(0)
        
    if label_tmp == float(unstructured_detective_answers[i]):
        unstructured_detective_results.append(1)
    else:
        unstructured_detective_results.append(0)
        
    if label_tmp == float(unstructured_judge_answers[i]):
        unstructured_judge_results.append(1)
    else:
        unstructured_judge_results.append(0)
    
    # Structured answers
    if label_tmp == float(xml_standard_answers[i]):
        xml_standard_results.append(1)
    else:
        xml_standard_results.append(0)   
        
    if label_tmp == float(xml_artist_answers[i]):
        xml_artist_results.append(1)
    else:
        xml_artist_results.append(0) 
        
    if label_tmp == float(xml_chef_answers[i]):
        xml_chef_results.append(1)
    else:
        xml_chef_results.append(0) 
        
    if label_tmp == float(xml_detective_answers[i]):
        xml_detective_results.append(1)
    else:
        xml_detective_results.append(0)
        
    if label_tmp == float(xml_judge_answers[i]):
        xml_judge_results.append(1)
    else:
        xml_judge_results.append(0) 

In [8]:
# Test whether output format significantly influences LLMs' generation
print(f"Averaged accuracy of unstructured format with diverse instruction: {format((sum(unstructured_standard_results)+sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results))/(5*len(unstructured_standard_results)),'.3f')}")
print(f"Averaged accuracy of XML format with diverse instruction: {format((sum(xml_standard_results)+sum(xml_artist_results)+sum(xml_chef_results)+sum(xml_detective_results)+sum(xml_judge_results))/(5*len(xml_standard_results)),'.3f')}")
data_subsets = [
    # Stratum of control on standard instruction
    (np.array(xml_standard_results), np.array(unstructured_standard_results)),
    # Stratum of control on artist instruction
    (np.array(xml_artist_results), np.array(unstructured_artist_results)),
    # Stratum of control on chef instruction
    (np.array(xml_chef_results), np.array(unstructured_chef_results)),
    # Stratum of control on detective instruction
    (np.array(xml_detective_results), np.array(unstructured_detective_results)),
    # Stratum of control on judge instruction
    (np.array(xml_judge_results), np.array(unstructured_judge_results)),
]

p_values = []

for i, (correct_D, correct_E) in enumerate(data_subsets, 1):
    # Build contingency table for this stratum
    table = np.zeros((2, 2), dtype=int)
    for d, e in zip(correct_D, correct_E):
        table[d, e] += 1

    # Run McNemar's test
    result = mcnemar(table, exact=True)
    p_values.append(result.pvalue)

    # print(f"Stratum {i} contingency table:\n{table}")
    # print(f"Stratum {i} McNemar p-value: {result.pvalue:.5f}\n")

# Combine p-values using Stouffer’s method with correlation adjustment
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# print(f"Combined Stouffer Z: {z_combined:.3f}")
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured format with diverse instruction: 0.875
Averaged accuracy of XML format with diverse instruction: 0.882
Combined p-value across strata (Stouffer’s method): 0.042


In [9]:
# Test whether instruction significantly influences LLMs' generation
print(f"Averaged accuracy of unstructured & XML format with standard instruction: {format((sum(unstructured_standard_results)+sum(xml_standard_results))/(2*len(xml_standard_results)),'.3f')}")
print(f"Averaged accuracy of unstructured & XML format with intervened instruction: {format((sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results)+sum(xml_artist_results)+sum(xml_chef_results)+sum(xml_detective_results)+sum(xml_judge_results))/(8*len(xml_standard_results)),'.3f')}")

stratum_xml = np.array([xml_standard_results, xml_artist_results, xml_chef_results, xml_detective_results, xml_judge_results]).transpose()
stratum_unstructured = np.array([unstructured_standard_results, unstructured_artist_results, unstructured_chef_results, unstructured_detective_results, unstructured_judge_results]).transpose()
data_strata = [stratum_xml, stratum_unstructured]

# Run Cochran's Q test on each stratum
p_values = []
# print("Cochran's Q test per stratum:")
for i, data in enumerate(data_strata):
    result = cochrans_q(data)
    p_values.append(result.pvalue)
    # print(f"  Stratum {i+1}: Q = {result.statistic:.4f}, p = {result.pvalue:.5f}")
    
# Combine p-values using Stouffer’s method with correlation adjustment
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# print(f"Combined Stouffer Z: {z_combined:.3f}")
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured & XML format with standard instruction: 0.899
Averaged accuracy of unstructured & XML format with intervened instruction: 0.873
Combined p-value across strata (Stouffer’s method): 0.000


In [10]:
# If statistical significance threshold is 0.1
# Test conditional association between output format and instruction type
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.genmod.bayes_mixed_glm import BinomialBayesMixedGLM
# Transform xml_standard_results into dataframe
# Comprising output format, instruction, and stratum of generation quality
# Output format: 1 structured 0 unstructured
# Instruction: 'a' standard 'b' artist 'c' chef 'd' detective 'e' judge
# Generation qiality: 1 true 0 false
df = pd.DataFrame({
    'Z': np.concatenate([np.repeat('a', len(xml_standard_results)),
                         np.repeat('b', len(xml_artist_results)),
                         np.repeat('c', len(xml_chef_results)),
                         np.repeat('d', len(xml_detective_results)),
                         np.repeat('e', len(xml_judge_results)),
                         # Unstructured
                         np.repeat('a', len(unstructured_standard_results)),
                         np.repeat('b', len(unstructured_artist_results)),
                         np.repeat('c', len(unstructured_chef_results)),
                         np.repeat('d', len(unstructured_detective_results)),
                         np.repeat('e', len(unstructured_judge_results)),
                        ]),
    'X': np.concatenate([np.repeat(1, len(xml_standard_results)),
                         np.repeat(1, len(xml_artist_results)),
                         np.repeat(1, len(xml_chef_results)),
                         np.repeat(1, len(xml_detective_results)),
                         np.repeat(1, len(xml_judge_results)),
                         # Unstructured
                         np.repeat(0, len(unstructured_standard_results)),
                         np.repeat(0, len(unstructured_artist_results)),
                         np.repeat(0, len(unstructured_chef_results)),
                         np.repeat(0, len(unstructured_detective_results)),
                         np.repeat(0, len(unstructured_judge_results)),
                        ]),
    'Y': np.concatenate([np.repeat(0, len(np.where(np.array(xml_standard_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(xml_standard_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(xml_artist_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(xml_artist_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(xml_chef_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(xml_chef_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(xml_detective_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(xml_detective_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(xml_judge_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(xml_judge_results)==1)[0])),
                         # Unstructured
                         np.repeat(0, len(np.where(np.array(unstructured_standard_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_standard_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_artist_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_artist_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_chef_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_chef_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_detective_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_detective_results)==1)[0])),
                         np.repeat(0, len(np.where(np.array(unstructured_judge_results)==0)[0])),
                         np.repeat(1, len(np.where(np.array(unstructured_judge_results)==1)[0])),
                        ])
})

In [11]:
df_dummy = pd.get_dummies(df, columns=['Z'], drop_first=True)
# Define model components
endog = df_dummy['X']
exog = sm.add_constant(df_dummy[[col for col in df_dummy.columns if col.startswith('Z_')]])  # fixed effects
groups = df_dummy['Y']
exog_re = pd.get_dummies(groups)  # random intercepts per stratum
ident = np.ones(exog_re.shape[1], dtype=int)
# Fit mixed logistic model
model = BinomialBayesMixedGLM(endog, exog, exog_re, ident=ident)
result = model.fit_vb()
# Print results
print(result.summary())

             Binomial Mixed GLM Results
      Type Post. Mean Post. SD   SD  SD (LB) SD (UB)
----------------------------------------------------
const    M    -0.0152   0.0465                      
Z_b      M     0.0009   0.1038                      
Z_c      M     0.0035   0.1038                      
Z_d      M     0.0004   0.1038                      
Z_e      M    -0.0005   0.1038                      
VC_1     V     0.0000   1.0000 1.000   0.135   7.389
VC_2     V    -1.4249   0.6820 0.241   0.061   0.941
Parameter types are mean structure (M) and variance
structure (V)
Variance parameters are modeled as log standard
deviations


In [12]:
# Extract coefficient names, values, and std errors
param_names = model.exog_names  # names of fixed effects (same order as fe_mean)
coefs = result.fe_mean          # posterior means of fixed effects
ses = result.fe_sd              # posterior standard deviations

# Compute z-scores and p-values
z_scores = coefs / ses
p_values = 2 * (1 - norm.cdf(np.abs(z_scores)))  # two-tailed p-values

# Display results
for name, coef, se, z, p in zip(param_names, coefs, ses, z_scores, p_values):
    print(f"{name:20} coef = {coef: .4f}, SE = {se:.4f}, z = {z:.2f}, p = {p:.4f}")

const                coef = -0.0152, SE = 0.0465, z = -0.33, p = 0.7435
Z_b                  coef =  0.0009, SE = 0.1038, z = 0.01, p = 0.9933
Z_c                  coef =  0.0035, SE = 0.1038, z = 0.03, p = 0.9729
Z_d                  coef =  0.0004, SE = 0.1038, z = 0.00, p = 0.9968
Z_e                  coef = -0.0005, SE = 0.1038, z = -0.00, p = 0.9961


In [14]:
def read_unstructured_parsed_all(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)
    answers = []
    if isinstance(data, list):
        for obj in data:
            if isinstance(obj, dict) and "parsed" in obj:
                number = str(obj["parsed"]).replace(',', '')
                number = re.findall(r'\d+\.\d+|\d+', number)
            if len(number) != 0:
                answers.append(number[0])
            else:
                answers.append('10086')
    return answers

# Unstructured answers (all samples)
unstructured_standard_answers = read_unstructured_parsed_all("results/llama_8b_unstructured_parsed.json")
unstructured_artist_answers = read_unstructured_parsed_all("results/llama_8b_unstructured_artist_parsed.json")
unstructured_chef_answers = read_unstructured_parsed_all("results/llama_8b_unstructured_chef_parsed.json")
unstructured_detective_answers = read_unstructured_parsed_all("results/llama_8b_unstructured_detective_parsed.json")
unstructured_judge_answers = read_unstructured_parsed_all("results/llama_8b_unstructured_judge_parsed.json")

unstructured_standard_results = []
unstructured_artist_results = []
unstructured_chef_results = []
unstructured_detective_results = []
unstructured_judge_results = []

with open("../data/gsm8k_test.json", "r") as f:
    label = json.load(f)

for i in range(len(unstructured_standard_answers)):
    label_tmp = float(label[i]['answer'].split("\n#### ")[-1].replace(',', ''))
    # Unstructured answers
    if label_tmp == float(unstructured_standard_answers[i]):
        unstructured_standard_results.append(1)
    else:
        unstructured_standard_results.append(0)
        
    if label_tmp == float(unstructured_artist_answers[i]):
        unstructured_artist_results.append(1)
    else:
        unstructured_artist_results.append(0)
        
    if label_tmp == float(unstructured_chef_answers[i]):
        unstructured_chef_results.append(1)
    else:
        unstructured_chef_results.append(0)
        
    if label_tmp == float(unstructured_detective_answers[i]):
        unstructured_detective_results.append(1)
    else:
        unstructured_detective_results.append(0)
        
    if label_tmp == float(unstructured_judge_answers[i]):
        unstructured_judge_results.append(1)
    else:
        unstructured_judge_results.append(0)

print(f"Averaged accuracy of unstructured format with diverse instruction (all samples): {format((sum(unstructured_standard_results)+sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results))/(5*len(unstructured_standard_results)),'.3f')}")

Averaged accuracy of unstructured format with diverse instruction (all samples): 0.836
