In [98]:
import json
import re
import csv
import os

def extract_answer(response):
    """Extract the final numeric answer from a given response text."""
    match = re.findall(r'[\d,]+', response)
    if match:
        cleaned_number = match[-1].replace(',', '')  # Remove commas
        try:
            return int(cleaned_number)
        except ValueError:
            print(f"Warning: Unable to convert '{cleaned_number}' to an integer.")
            return None
    else:
        print("Warning: No numeric answer found in the response.")
    return None

def calculate_micro_accuracy(data):
    """Calculate micro accuracy based on original_response."""
    correct_count = 0
    total_count = len(data)

    for entry in data:
        expected_answer = int(str(entry["answer"]).replace(",", ""))
        original_response_answer = extract_answer(entry["original_response"])

        if original_response_answer == expected_answer:
            correct_count += 1

    return (correct_count / total_count * 100) if total_count > 0 else 0

def calculate_macro_accuracy(data):
    """Calculate macro accuracy based on the consistency of new_response for each unique base problem (id)."""
    base_problem_groups = {}

    # Group entries by base id
    for entry in data:
        base_id = entry["id"]
        expected_answer = int(str(entry["answer"]).replace(",", ""))
        new_response_answer = extract_answer(entry["new_response"])

        # Initialize group if not already in dictionary
        if base_id not in base_problem_groups:
            base_problem_groups[base_id] = {"expected_answer": expected_answer, "all_correct": True}

        # Check if the current answer is correct for this variant
        if new_response_answer != expected_answer:
            base_problem_groups[base_id]["all_correct"] = False

    # Calculate macro accuracy as the percentage of fully correct base problems
    macro_correct = sum(1 for group in base_problem_groups.values() if group["all_correct"])
    macro_total = len(base_problem_groups)

    return (macro_correct / macro_total * 100) if macro_total > 0 else 0

def calculate_and_save_accuracies(input_json, output_csv, method_type, context):
    # Load data and calculate micro and macro accuracy
    with open(input_json, 'r') as file:
        data = json.load(file)

    micro_accuracy = calculate_micro_accuracy(data)
    macro_accuracy = calculate_macro_accuracy(data)

    # Check if CSV exists, and append if it does; otherwise, create a new file
    file_exists = os.path.isfile(output_csv)

    with open(output_csv, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        
        # Write headers only if the file is new
        if not file_exists:
            writer.writerow(["File", "Method", "Context", "Micro Accuracy (%)", "Macro Accuracy (%)"])
        
        # Write the accuracy results
        writer.writerow([os.path.basename(input_json), method_type, context, f"{micro_accuracy:.2f}", f"{macro_accuracy:.2f}"])

    print(f"Micro Accuracy for {method_type} - {context}: {micro_accuracy:.2f}%")
    print(f"Macro Accuracy for {method_type} - {context}: {macro_accuracy:.2f}%")

# Example usage:
# calculate_and_save_accuracies(
#     "path/to/data.json",
#     "path/to/output_accuracy_summary.csv",
#     method_type="LTM",
#     context="Role Overlap: Yes"
# )


In [99]:
# CoT 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="CoT",
                              context="Overall 2")

Micro Accuracy for CoT - Overall 2: 92.00%
Macro Accuracy for CoT - Overall 2: 87.00%


In [100]:
# LTM 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="LTM",
                              context="Overall 2")

Micro Accuracy for LTM - Overall 2: 86.00%
Macro Accuracy for LTM - Overall 2: 82.00%


In [101]:
# Python 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall 2")

Micro Accuracy for Python - Overall 2: 80.00%
Macro Accuracy for Python - Overall 2: 76.00%


In [102]:
# CoT 0-shot accuracy w/ m steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="CoT",
                              context="Overall m")

Micro Accuracy for CoT - Overall m: 95.00%
Macro Accuracy for CoT - Overall m: 95.00%


In [103]:
# LTM 0-shot accuracy w/ m steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="LTM",
                              context="Overall m")

Micro Accuracy for LTM - Overall m: 89.00%
Macro Accuracy for LTM - Overall m: 94.00%


In [104]:
# Python 0-shot accuracy w/ m steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 95.00%
Macro Accuracy for Python - Overall m: 91.00%


In [105]:
# LTM 0-shot accuracy w/ m steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_instructed_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="LTM",
                              context="Overall m")

Micro Accuracy for LTM - Overall m: 94.00%
Macro Accuracy for LTM - Overall m: 87.00%


In [106]:
# CoT 0-shot accuracy w/ m steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_instructed_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="CoT",
                              context="Overall m")

Micro Accuracy for CoT - Overall m: 91.00%
Macro Accuracy for CoT - Overall m: 90.00%


In [107]:
# CoT 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_instructed_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="CoT",
                              context="Overall 2")

Micro Accuracy for CoT - Overall 2: 85.00%
Macro Accuracy for CoT - Overall 2: 84.00%


In [108]:
# LTM 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_instructed_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="LTM",
                              context="Overall 2")

Micro Accuracy for LTM - Overall 2: 81.00%
Macro Accuracy for LTM - Overall 2: 83.00%


In [109]:
# Python 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_instructed_2.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall 2")

Micro Accuracy for Python - Overall 2: 85.00%
Macro Accuracy for Python - Overall 2: 82.00%


In [110]:
# Python 0-shot accuracy w/ 2 steps
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_instructed_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 95.00%
Macro Accuracy for Python - Overall m: 95.00%


In [111]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_in_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 86.00%
Macro Accuracy for Python - Overall m: 85.00%


In [112]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_out_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 83.00%
Macro Accuracy for Python - Overall m: 76.00%


In [113]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_out_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 98.00%
Macro Accuracy for Python - Overall m: 96.00%


In [114]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_in_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 98.00%
Macro Accuracy for Python - Overall m: 95.00%


In [115]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_out_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 97.00%
Macro Accuracy for Python - Overall m: 99.00%


In [116]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_in_range_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 98.00%
Macro Accuracy for Python - Overall m: 94.00%


In [117]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_overlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 85.00%
Macro Accuracy for Python - Overall m: 85.00%


In [118]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_nonoverlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 86.00%
Macro Accuracy for Python - Overall m: 86.00%


In [119]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_overlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 96.00%
Macro Accuracy for Python - Overall m: 96.00%


In [120]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_nonoverlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 95.00%
Macro Accuracy for Python - Overall m: 98.00%


In [121]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_overlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 95.00%
Macro Accuracy for Python - Overall m: 95.00%


In [122]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_nonoverlapped_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 96.00%
Macro Accuracy for Python - Overall m: 93.00%


In [123]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_in_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 85.00%
Macro Accuracy for Python - Overall m: 90.00%


In [124]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_Python_out_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 94.00%
Macro Accuracy for Python - Overall m: 91.00%


In [125]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_in_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 99.00%
Macro Accuracy for Python - Overall m: 97.00%


In [126]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_CoT_out_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 97.00%
Macro Accuracy for Python - Overall m: 98.00%


In [127]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_in_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 96.00%
Macro Accuracy for Python - Overall m: 91.00%


In [128]:
calculate_and_save_accuracies("/Users/byfoot/Desktop/ECE570/reimplement/results/overall_result_LTM_out_topic_m.json",
                              "/Users/byfoot/Desktop/ECE570/reimplement/csvs/analyze.csv",
                              method_type="Python",
                              context="Overall m")

Micro Accuracy for Python - Overall m: 97.00%
Macro Accuracy for Python - Overall m: 96.00%
