In [1]:
import pandas as pd
import numpy as np
import json
import xml.etree.ElementTree as ET
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.contingency_tables import cochrans_q
from scipy.stats import combine_pvalues
from scipy.stats import norm

In [2]:
def read_xml_answers(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    answers = []
    for obj in root.findall("root"):
        answer = obj.find("answer")
        if answer is not None:
            answers.append(answer.text)
    return answers

def read_unstructured_parsed(file_path):
    with open(file_path, "r") as f:
        data = json.load(f)

    answers = []
    if isinstance(data, list):
        for obj in data:
            if isinstance(obj, dict) and "parsed" in obj:
                answers.append(obj["parsed"])
    return answers

In [3]:
# Structured answers
xml_standard_answers = read_xml_answers("code_results/test_xml_0.xml")
xml_artist_answers = read_xml_answers("code_results/test_xml_artist_0.xml")
xml_chef_answers = read_xml_answers("code_results/test_xml_chef_2.xml")
xml_detective_answers = read_xml_answers("code_results/test_xml_detective_3.xml")
xml_judge_answers = read_xml_answers("code_results/test_xml_judge_0.xml")
# Unstructured answers
unstructured_standard_answers = list(pd.read_csv("code_results/test_unstructured_0.json.csv")['extracted'])
unstructured_artist_answers = list(pd.read_csv("code_results/test_unstructured_artist_0.json.csv")['extracted'])
unstructured_chef_answers = list(pd.read_csv("code_results/test_unstructured_chef_3.json.csv")['extracted'])
unstructured_detective_answers = list(pd.read_csv("code_results/test_unstructured_detective_2.json.csv")['extracted'])
unstructured_judge_answers = list(pd.read_csv("code_results/test_unstructured_judge_1.json.csv")['extracted'])

In [4]:
with open("data/code_compile.json", "r") as f:
    label = json.load(f)
unstructured_standard_results = []
unstructured_artist_results = []
unstructured_chef_results = []
unstructured_detective_results = []
unstructured_judge_results = []

xml_standard_results = []
xml_artist_results = []
xml_chef_results = []
xml_detective_results = []
xml_judge_results = []

for i in range(len(unstructured_standard_answers)):
    label_tmp = label[i]['compilation_error']
    # Unstructured answers
    if label_tmp == unstructured_standard_answers[i]:
        unstructured_standard_results.append(1)
    else:
        unstructured_standard_results.append(0)
        
    if label_tmp == unstructured_artist_answers[i]:
        unstructured_artist_results.append(1)
    else:
        unstructured_artist_results.append(0)
        
    if label_tmp == unstructured_chef_answers[i]:
        unstructured_chef_results.append(1)
    else:
        unstructured_chef_results.append(0)
        
    if label_tmp == unstructured_detective_answers[i]:
        unstructured_detective_results.append(1)
    else:
        unstructured_detective_results.append(0)
        
    if label_tmp == unstructured_judge_answers[i]:
        unstructured_judge_results.append(1)
    else:
        unstructured_judge_results.append(0)
    
    # Structured answers
    # if label_tmp == True if xml_standard_answers[i] == "True" else False:
    if str(label_tmp) == xml_standard_answers[i]:
        xml_standard_results.append(1)
    else:
        xml_standard_results.append(0)   
        
    # if label_tmp == True if xml_artist_answers[i] == "True" else False:
    if str(label_tmp) == xml_artist_answers[i]:
        xml_artist_results.append(1)
    else:
        xml_artist_results.append(0) 
        
    # if label_tmp == True if xml_chef_answers[i] == "True" else False:
    if str(label_tmp) == xml_chef_answers[i]:
        xml_chef_results.append(1)
    else:
        xml_chef_results.append(0) 
        
    # if label_tmp == True if xml_detective_answers[i] == "True" else False:
    if str(label_tmp) == xml_detective_answers[i]:
        xml_detective_results.append(1)
    else:
        xml_detective_results.append(0) 
        
    # if label_tmp == True if xml_judge_answers[i] == "True" else False:
    if str(label_tmp) == xml_judge_answers[i]:
        xml_judge_results.append(1)
    else:
        xml_judge_results.append(0) 

In [5]:
# Test whether output format significantly influences LLMs' generation
print(f"Averaged accuracy of unstructured format with diverse instruction: {format((sum(unstructured_standard_results)+sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results))/(5*len(unstructured_standard_results)),'.3f')}")
print(f"Averaged accuracy of XML format with diverse instruction: {format((sum(xml_standard_results)+sum(xml_artist_results)+sum(xml_chef_results)+sum(xml_detective_results)+sum(xml_judge_results))/(5*len(xml_standard_results)),'.3f')}")
data_subsets = [
    # Stratum of control on standard instruction
    (np.array(xml_standard_results), np.array(unstructured_standard_results)),
    # Stratum of control on artist instruction
    (np.array(xml_artist_results), np.array(unstructured_artist_results)),
    # Stratum of control on chef instruction
    (np.array(xml_chef_results), np.array(unstructured_chef_results)),
    # Stratum of control on detective instruction
    (np.array(xml_detective_results), np.array(unstructured_detective_results)),
    # Stratum of control on judge instruction
    (np.array(xml_judge_results), np.array(unstructured_judge_results)),
]

p_values = []

for i, (correct_D, correct_E) in enumerate(data_subsets, 1):
    # Build contingency table for this stratum
    table = np.zeros((2, 2), dtype=int)
    for d, e in zip(correct_D, correct_E):
        table[d, e] += 1

    # Run McNemar's test
    result = mcnemar(table, exact=True)
    p_values.append(result.pvalue)

    # print(f"Stratum {i} contingency table:\n{table}")
    # print(f"Stratum {i} McNemar p-value: {result.pvalue:.5f}\n")

# Combine p-values using Stouffer’s method with correlation adjustment
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# print(f"Combined Stouffer Z: {z_combined:.3f}")
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured format with diverse instruction: 0.831
Averaged accuracy of XML format with diverse instruction: 0.818
Combined p-value across strata (Stouffer’s method): 0.263


In [6]:
# Test whether instruction significantly influences LLMs' generation
print(f"Averaged accuracy of unstructured & XML format with standard instruction: {format((sum(unstructured_standard_results)+sum(xml_standard_results))/(2*len(xml_standard_results)),'.3f')}")
print(f"Averaged accuracy of unstructured & XML format with intervened instruction: {format((sum(unstructured_artist_results)+sum(unstructured_chef_results)+sum(unstructured_detective_results)+sum(unstructured_judge_results)+sum(xml_artist_results)+sum(xml_chef_results)+sum(xml_detective_results)+sum(xml_judge_results))/(8*len(xml_standard_results)),'.3f')}")

stratum_xml = np.array([xml_standard_results, xml_artist_results, xml_chef_results, xml_detective_results, xml_judge_results]).transpose()
stratum_unstructured = np.array([unstructured_standard_results, unstructured_artist_results, unstructured_chef_results, unstructured_detective_results, unstructured_judge_results]).transpose()
data_strata = [stratum_xml, stratum_unstructured]

# Run Cochran's Q test on each stratum
p_values = []
# print("Cochran's Q test per stratum:")
for i, data in enumerate(data_strata):
    result = cochrans_q(data)
    p_values.append(result.pvalue)
    # print(f"  Stratum {i+1}: Q = {result.statistic:.4f}, p = {result.pvalue:.5f}")

# Combine p-values using Stouffer’s method with correlation adjustment
# Convert p-values to Z-scores (two-sided)
z_scores = norm.isf(np.array(p_values) / 2)  # inverse survival function

# Assume weights (e.g., uniform or by number of samples per stratum)
weights = np.ones_like(z_scores)
# Assume correlation between strata (due to shared samples)
rho = 0.3
k = len(p_values)
R = np.full((k, k), rho)
np.fill_diagonal(R, 1)  # correlation matrix

# Stouffer's Z with correlation adjustment
numerator = np.sum(weights * z_scores)
denominator = np.sqrt(np.dot(weights, R @ weights))
z_combined = numerator / denominator
p_combined = 2 * norm.sf(abs(z_combined))

# print(f"Combined Stouffer Z: {z_combined:.3f}")
print(f"Combined p-value across strata (Stouffer’s method): {p_combined:.3f}")

Averaged accuracy of unstructured & XML format with standard instruction: 0.825
Averaged accuracy of unstructured & XML format with intervened instruction: 0.824
Combined p-value across strata (Stouffer’s method): 0.645
