In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import glob

from functools import partial
from tqdm import tqdm

import json

import os

In [None]:
# Collect every result file laid out as results/<model>/<perturbation>/<q_id>.json.
# glob returns matches in file-system order, which differs between machines;
# sorting keeps everything derived from this list (dict insertion order,
# table rows, the sampled record) reproducible across runs.
ALL_RESULTS_PATHS = sorted(glob.glob("results/*/*/*.json"))
ALL_RESULTS_PATHS[:5], len(ALL_RESULTS_PATHS)

In [None]:
RESULTS_DICT = {} # model -> perturbation -> q_id -> result record (parsed JSON)

In [None]:
# Load every result file into RESULTS_DICT[model][perturbation][q_id].
for result_file in tqdm(ALL_RESULTS_PATHS):
    # Normalise separators before splitting so the same code works with the
    # "/" paths glob yields on POSIX and the "\\" paths it yields on Windows.
    parts = os.path.normpath(result_file).split(os.sep)
    # The last three components are <model>/<perturbation>/<q_id>.json.
    model, perturbation, file_name = parts[-3:]
    # NOTE(review): "UnitConv" is skipped; presumably superseded by
    # "UnitConvFinal" -- confirm.
    if perturbation == "UnitConv":
        continue
    q_id = int(file_name.split(".")[0])

    with open(result_file, "r") as f:
        result_data = json.load(f)

    RESULTS_DICT.setdefault(model, {}).setdefault(perturbation, {})[q_id] = result_data

In [None]:
# Model names for which at least one result file was loaded.
RESULTS_DICT.keys()

In [None]:
# Model name -> model size. The scatter plot further down takes log10 of these
# values and labels the axis "Billion Parameters".
# NOTE(review): sizes for closed-weight models are presumably estimates --
# confirm the source before citing them.
import math  # math.log10 is used by the model-size scatter plot
MODEL_SIZE_MAP = {
    'openai_gpt_5_2': 1500,
    'anthropic_claude_sonnet_4_5': 1500,
    'anthropic_claude_haiku_4_5': 1000,
    # 'google_gemini_3_pro_preview': 2000,
    'openai_gpt_4o_mini': 1000,
    'google_gemini_3_flash_preview': 1000,
    'deepseek_deepseek_v3_2': 685,
    'mistralai_mistral_large_2512': 675,
    'qwen_qwen3_235b_a22b_2507': 235,
    'meta_llama_llama_3_1_8b_instruct': 8,
    'meta_llama_llama_4_scout': 109,
    'mistralai_ministral_8b_2512': 8,
    'mistralai_ministral_3b': 3,
    'google_gemma_3_4b_it': 4,
}

In [None]:
# RESULTS_DICT['azure_gpt_4o_mini'].keys()

In [None]:
# Spot-check: for each perturbation of one model, show the first loaded record.
for key, records in RESULTS_DICT['openai_gpt_5_2'].items():
    print(key)
    first_q_id = list(records)[0]
    sample = records[first_q_id]
    print("Question ID: ", first_q_id)
    print("Correct answer: ", sample['answer'])
    print("Model answer clean question: ", sample['answer_solution_clean'])
    print("Model answer perturbed question: ", sample['answer_solution_perturbed'])

In [None]:
import re

# Final-answer marker of the form "#### <number>".
_ANSWER_MARKER_RE = re.compile(r"#### (\-?[0-9\.\,]+)")
# A token made up entirely of an optional sign, digits, dots and commas.
_NUMERIC_TOKEN_RE = re.compile(r"\-?[0-9\.\,]+")


def _as_number_string(candidate):
    """Strip thousands separators and return the text if ``float`` accepts it.

    Returns the string ``'invalid'`` for text such as ``"1.2.3"`` or ``"."``
    that matches the character class but is not a parseable number.
    """
    cleaned = candidate.strip().replace(",", "")
    try:
        float(cleaned)
    except ValueError:
        return 'invalid'
    return cleaned


def extract_answer(s):
    """Extract the final numeric answer from a solution string.

    The answer following a ``"#### "`` marker is preferred. Without a marker,
    the last whitespace-separated token is used if the whole token is numeric.

    Args:
        s: Solution text.

    Returns:
        The number as a string without thousands separators (always accepted
        by ``float``), or the string ``'invalid'`` when no number is found.
        Empty and whitespace-only input yields ``'invalid'``.
    """
    marker = _ANSWER_MARKER_RE.search(s)
    if marker:
        return _as_number_string(marker.group(1))

    tokens = s.split()
    # fullmatch, not match: a prefix match would accept tokens like "18%".
    if tokens and _NUMERIC_TOKEN_RE.fullmatch(tokens[-1]):
        return _as_number_string(tokens[-1])
    return 'invalid'

In [None]:
def _answer_matches(raw_answer, reference):
    """Return True when ``raw_answer`` parses to a float equal to ``reference``.

    Values that cannot be parsed (``None``, non-string objects, non-numeric
    text) count as a wrong answer rather than raising. When ``reference`` is
    the string ``'invalid'`` the comparison is always False.
    """
    try:
        return float(raw_answer.strip().lower()) == reference
    except (AttributeError, TypeError, ValueError):
        return False


# Annotate every record in place with the parsed reference answer and with
# whether the model's clean / perturbed answers match it.
for model_results in RESULTS_DICT.values():
    for perturbation_results in model_results.values():
        for record in perturbation_results.values():
            reference = extract_answer(record['solution'])
            if reference != 'invalid':
                try:
                    # Drop thousands separators so "1,234" parses.
                    reference = float(reference.replace(",", ""))
                except ValueError:
                    # Digit/dot soup such as "1.2.3" is not a usable reference.
                    reference = 'invalid'
            record['correct_answer'] = reference

            # .get(): a record without a model answer is scored as incorrect.
            record['is_correct_clean'] = _answer_matches(
                record.get('answer_solution_clean'), reference
            )
            record['is_correct_perturbed'] = _answer_matches(
                record.get('answer_solution_perturbed'), reference
            )

In [None]:
def accuracy_and_confusion_matrix(model, perturbation):
    """Summarise clean vs. perturbed correctness for one model/perturbation.

    Reads ``RESULTS_DICT[model][perturbation]``, whose records must carry the
    ``is_correct_clean`` and ``is_correct_perturbed`` flags, and prints a
    short accuracy report.

    Args:
        model: Key into ``RESULTS_DICT``.
        perturbation: Key into ``RESULTS_DICT[model]``.

    Returns:
        ``(clean_acc, perturbed_acc, cm, cm_ids)`` where both matrices are
        2x2 nested lists indexed ``[clean][perturbed]`` with 0 = incorrect and
        1 = correct. ``cm`` holds counts, ``cm_ids`` the matching question ids
        in iteration order. Both accuracies are NaN when there are no records.

    Raises:
        KeyError: If the model or perturbation is not in ``RESULTS_DICT``.
    """
    results = RESULTS_DICT[model][perturbation]

    # Bucket question ids by outcome in a single pass.
    cm_ids = [[[], []], [[], []]]
    for q_id, record in results.items():
        clean_idx = int(bool(record['is_correct_clean']))
        perturbed_idx = int(bool(record['is_correct_perturbed']))
        cm_ids[clean_idx][perturbed_idx].append(q_id)

    cm = [[len(ids) for ids in row] for row in cm_ids]

    total = len(results)
    clean_correct = cm[1][0] + cm[1][1]
    perturbed_correct = cm[0][1] + cm[1][1]

    print(f"Model: {model}, Perturbation: {perturbation}")
    if total == 0:
        # Accuracy is undefined without data; NaN keeps that visible
        # downstream instead of dividing by zero here.
        print("No results available; accuracy is undefined.")
        clean_acc = perturbed_acc = float('nan')
    else:
        clean_acc = clean_correct / total
        perturbed_acc = perturbed_correct / total
        print(f"Clean Accuracy: {clean_acc:.2%} ({clean_correct}/{total})")
        print(f"Perturbed Accuracy: {perturbed_acc:.2%} ({perturbed_correct}/{total})")

    return clean_acc, perturbed_acc, cm, cm_ids

In [None]:
# accuracy_and_confusion_matrix('azure_gpt_4o_mini', 'ExtraSteps')

In [None]:
# accuracy_and_confusion_matrix('azure_gpt_4o_mini', 'MathError')

In [None]:
# accuracy_and_confusion_matrix('llama3_1_latest', 'MathError')

In [None]:
# accuracy_and_confusion_matrix('azure_gpt_4o_mini', 'SkippedSteps')

In [None]:
# accuracy_and_confusion_matrix('azure_gpt_4o_mini', 'Sycophancy')

In [None]:
# accuracy_and_confusion_matrix('azure_gpt_4o_mini', 'UnitConv')

In [None]:
# Print the accuracy report for a hand-picked set of models, in this order.
MODELS_TO_REPORT = [
    'openai_gpt_5_2',
    'openai_gpt_4o_mini',
    'google_gemini_3_flash_preview',
    'mistralai_mistral_large_2512',
    'mistralai_ministral_8b_2512',
    'qwen_qwen3_235b_a22b_2507',
    # 'google_gemma_3_27b_it',
    'deepseek_deepseek_v3_2',
]

for model_name in MODELS_TO_REPORT:
    # A model without loaded results is reported and skipped rather than
    # aborting the cell with a KeyError.
    if model_name not in RESULTS_DICT:
        print(f"Skipping {model_name}: no results loaded")
        continue
    for perturbation in RESULTS_DICT[model_name].keys():
        accuracy_and_confusion_matrix(model_name, perturbation)

In [None]:
# model -> perturbation -> accuracies, confusion-matrix counts and question-id lists
accuracy_results = {}

In [None]:
# (summary key, clean index, perturbed index) into the 2x2 matrices returned
# by accuracy_and_confusion_matrix; index 1 = correct, 0 = incorrect.
_OUTCOME_CELLS = (
    ('clean_corr_perturbed_corr', 1, 1),
    ('clean_corr_perturbed_incorr', 1, 0),
    ('clean_incorr_perturbed_corr', 0, 1),
    ('clean_incorr_perturbed_incorr', 0, 0),
)

for llm, llm_results in RESULTS_DICT.items():
    accuracy_results[llm] = {}
    for perturbation in llm_results:
        entry = accuracy_results[llm][perturbation] = {}
        clean_acc, perturbed_acc, cm, cm_ids = accuracy_and_confusion_matrix(llm, perturbation)

        entry['clean_accuracy'] = clean_acc
        entry['perturbed_accuracy'] = perturbed_acc
        # Counts first, then id lists, so the key order of the dumped JSON is stable.
        for outcome, clean_idx, perturbed_idx in _OUTCOME_CELLS:
            entry[outcome] = int(cm[clean_idx][perturbed_idx])
        for outcome, clean_idx, perturbed_idx in _OUTCOME_CELLS:
            entry[outcome + '_ids'] = cm_ids[clean_idx][perturbed_idx]
    print("="*50)

In [None]:
import json

# Persist the full summary, including the per-outcome question-id lists.
summary_path = "results/accuracy_summary.json"
with open(summary_path, "w") as summary_file:
    json.dump(accuracy_results, summary_file, indent=4)

In [None]:
# Compact copy of accuracy_results: accuracies and counts only, no id lists.
_SUMMARY_FIELDS = (
    'clean_accuracy',
    'perturbed_accuracy',
    'clean_corr_perturbed_corr',
    'clean_corr_perturbed_incorr',
    'clean_incorr_perturbed_corr',
    'clean_incorr_perturbed_incorr',
)

accuracy_summary_no_ids = {
    llm: {
        perturbation: {field: metrics[field] for field in _SUMMARY_FIELDS}
        for perturbation, metrics in llm_summary.items()
    }
    for llm, llm_summary in accuracy_results.items()
}

In [None]:
# Persist the compact summary (no question-id lists).
no_ids_path = "results/accuracy_summary_no_ids.json"
with open(no_ids_path, "w") as no_ids_file:
    json.dump(accuracy_summary_no_ids, no_ids_file, indent=4)

In [None]:
# Every perturbation name that appears for at least one model, alphabetically.
perturbation_types = sorted(
    {pert for model_data in accuracy_summary_no_ids.values() for pert in model_data.keys()}
)

# One LaTeX table per perturbation: a row per model plus an average row.
all_tables = []

for pert in perturbation_types:
    # Gather (display name, clean, perturbed, diff) for models that have both accuracies.
    rows = []
    for model_name, model_data in accuracy_summary_no_ids.items():
        metrics = model_data.get(pert)
        if not metrics:
            continue
        clean = metrics.get("clean_accuracy", "")
        perturbed = metrics.get("perturbed_accuracy", "")
        if clean == "" or perturbed == "":
            continue
        # Clean up model name for display
        display_name = model_name.replace("_", " ").title()
        rows.append((display_name, clean, perturbed, perturbed - clean))

    # A perturbation with no usable rows produces no table.
    if not rows:
        continue

    avg_clean = np.mean([row[1] for row in rows])
    avg_perturbed = np.mean([row[2] for row in rows])
    avg_diff = np.mean([row[3] for row in rows])

    latex = [
        "\\begin{table}[h]",
        "\\centering",
        "\\begin{tabular}{lccc}",
        "\\hline",
        "Model & Clean & Perturbed & Diff \\\\",
        "\\hline",
    ]
    latex.extend(
        f"{display_name} & {clean:.2f} & {perturbed:.2f} & {diff:+.2f} \\\\"
        for display_name, clean, perturbed, diff in rows
    )
    latex.extend([
        "\\hline",
        f"Average & {avg_clean:.3f} & {avg_perturbed:.3f} & {avg_diff:+.3f} \\\\",
        "\\hline",
        "\\end{tabular}",
        f"\\caption{{Model accuracy for {pert} perturbation}}",
        f"\\label{{tab:{pert.lower()}}}",
        "\\end{table}",
        "",  # Add blank line between tables
    ])

    all_tables.append("\n".join(latex))

# Print all tables
print("\n".join(all_tables))

In [None]:
# Perturbation key -> human-readable label for printed statistics and plot
# titles. Keys missing here fall back to the raw perturbation name at the
# call sites (they use PRESENTABLE_NAMES.get(perturbation, perturbation)).
PRESENTABLE_NAMES = {
    "ExtraSteps": "Extra Steps",
    "MathError": "Math Error",
    "SkippedSteps": "Skipped Steps",
    "Sycophancy": "Sycophancy",
    "UnitConvFinal": "Unit Conversion",
}

In [None]:
from scipy.stats import gaussian_kde

# Grid on which every density is evaluated; it matches the x-axis limits below.
# NOTE(review): accuracies here are correct/total fractions, so the upper
# half of this 0-2 range is presumably unused -- confirm the intended limit.
x_range = np.linspace(0, 2, 1000)


def _draw_density(ax, values, label, fill_color, line_color):
    """Draw a filled Gaussian KDE of ``values`` on ``ax``.

    Nothing is drawn for fewer than two values, or when the values are
    degenerate (e.g. all identical) and the KDE cannot be estimated.
    """
    if len(values) < 2:
        return
    try:
        kde = gaussian_kde(values)
    except np.linalg.LinAlgError:
        print(f"  {label}: values are degenerate, density not drawn")
        return
    density = kde(x_range)  # evaluate once, reuse for fill and outline
    ax.fill_between(x_range, density, alpha=0.5, label=label, color=fill_color)
    ax.plot(x_range, density, color=line_color, linewidth=2)


# perturbation wise density plot of accuracies for both clean and perturbed in different colours
for perturbation in perturbation_types:
    clean_accuracies = []
    perturbed_accuracies = []

    for model_name, model_data in accuracy_summary_no_ids.items():
        if perturbation in model_data and model_data[perturbation]:
            clean_acc = model_data[perturbation].get("clean_accuracy", None)
            perturbed_acc = model_data[perturbation].get("perturbed_accuracy", None)
            if clean_acc is not None and perturbed_acc is not None:
                clean_accuracies.append(clean_acc)
                perturbed_accuracies.append(perturbed_acc)

    # Print statistics
    print(f"\n{PRESENTABLE_NAMES.get(perturbation, perturbation)} Perturbation:")
    if not clean_accuracies:
        # Means of an empty list are NaN (with a warning); there is nothing to plot.
        print("  No accuracy data available")
        continue
    clean_mean = np.mean(clean_accuracies)
    perturbed_mean = np.mean(perturbed_accuracies)
    print(f"  Clean Accuracy    - Mean: {clean_mean:.4f}, Std: {np.std(clean_accuracies):.4f}")
    print(f"  Perturbed Accuracy - Mean: {perturbed_mean:.4f}, Std: {np.std(perturbed_accuracies):.4f}")

    # Plot density plot
    fig, ax = plt.subplots(figsize=(8, 5))

    _draw_density(ax, clean_accuracies, 'Clean Accuracy', 'skyblue', 'blue')
    _draw_density(ax, perturbed_accuracies, 'Perturbed Accuracy', 'salmon', 'red')

    ax.axvline(clean_mean, color='blue', linestyle='--', label=f'Clean Mean: {clean_mean:.2f}')
    ax.axvline(perturbed_mean, color='red', linestyle='--', label=f'Perturbed Mean: {perturbed_mean:.2f}')

    ax.set_xlabel('Accuracy')
    ax.set_ylabel('Density')
    ax.set_title(f'Accuracy Distribution for {PRESENTABLE_NAMES.get(perturbation, perturbation)} Perturbation')
    ax.legend()
    ax.set_xlim(0, 2)

    fig.tight_layout()
    plt.show()
    # Release the figure so repeated runs do not accumulate open figures.
    plt.close(fig)


In [None]:
# Plot accuracy scatter plot for each perturbation type, w.r.t model size
SCATTER_PLOT_DIR = "result_plots/accuracy_vs_model_size"
os.makedirs(SCATTER_PLOT_DIR, exist_ok=True)

for perturbation in perturbation_types:
    model_sizes = []       # log10 of the MODEL_SIZE_MAP value
    accuracy_diffs = []    # clean accuracy minus perturbed accuracy
    unsized_models = []    # models with accuracies but no MODEL_SIZE_MAP entry

    for model_name, model_data in accuracy_summary_no_ids.items():
        metrics = model_data.get(perturbation)
        if not metrics:
            continue
        clean_acc = metrics.get("clean_accuracy", None)
        perturbed_acc = metrics.get("perturbed_accuracy", None)
        if clean_acc is None or perturbed_acc is None:
            continue
        size = MODEL_SIZE_MAP.get(model_name, None)
        if size is None:
            unsized_models.append(model_name)
            continue
        model_sizes.append(math.log10(size))
        accuracy_diffs.append(clean_acc - perturbed_acc)

    if unsized_models:
        # Make the omission visible instead of silently dropping data points.
        print(f"{perturbation}: no size in MODEL_SIZE_MAP for {unsized_models}; omitted from plot")

    # Plot scatter plot
    fig, ax = plt.subplots(figsize=(8, 5))
    ax.scatter(model_sizes, accuracy_diffs, label='Accuracy Difference (Clean - Perturbed)', color='purple', s=100, alpha=0.7, edgecolor='black')

    # Add regression line; a slope needs at least two distinct x values.
    if len(set(model_sizes)) > 1:
        z = np.polyfit(model_sizes, accuracy_diffs, 1)
        p = np.poly1d(z)
        line_x = [min(model_sizes), max(model_sizes)]
        # "+.3f" renders a negative intercept as "-0.123" rather than "+-0.123".
        ax.plot(line_x, p(line_x), "r--", alpha=0.8, label=f'Regression: y={z[0]:.3f}x{z[1]:+.3f}')

    ax.set_xlabel('Log10(Model Size) (Billion Parameters)')
    ax.set_ylabel('Accuracy Difference (Clean - Perturbed)')
    ax.set_title(f'Accuracy vs Model Size for {PRESENTABLE_NAMES.get(perturbation, perturbation)} Perturbation')
    ax.legend()
    # Save before showing so the file does not depend on what the backend
    # does with the figure once it has been displayed.
    fig.savefig(f"{SCATTER_PLOT_DIR}/{perturbation}.png")
    plt.show()
    plt.close(fig)