# Confidence and Accuracy Analysis

In [1]:
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib.pyplot as plt


In [2]:
def get_confidence(file_path):
    """Extract every reported answer confidence from an HTML results file.

    The file is parsed with BeautifulSoup, reduced to its text content, and
    scanned for occurrences of ``Extracted Answer Confidence: <number>``.

    Args:
        file_path: Path to the HTML report; it is read as UTF-8.

    Returns:
        A float ``numpy`` array with the matched values in document order
        (empty when nothing matches).
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Match against the text content rather than the raw markup.
    text = BeautifulSoup(html_content, "html.parser").get_text()

    # Capture a plain decimal number (e.g. "0.85", "1", ".5"). The digit
    # classes must stay separate: merging them into a single character class
    # would also accept "]", "*", "?", "[" and repeated dots, none of which
    # survive the float conversion below.
    pattern = re.compile(r"Extracted Answer Confidence:\s([0-9]*\.?[0-9]+)")

    return np.array(pattern.findall(text), dtype=float)

def get_accuracy(file_path):
    """Collect the numeric ``Score:`` values found in an HTML results file.

    Args:
        file_path: Path to the HTML report; it is read as UTF-8.

    Returns:
        A float ``numpy`` array with one entry per ``Score: <number>``
        occurrence in the document text, in document order.
    """
    score_re = re.compile(r"Score:\s([0-1]*\.?[0-9]+)")

    with open(file_path, "r", encoding="utf-8") as handle:
        raw_html = handle.read()

    # Search the text content of the page, not the markup itself.
    page_text = BeautifulSoup(raw_html, "html.parser").get_text()
    scores = score_re.findall(page_text)

    return np.array(scores, dtype=float)

def get_subject(file_path):
    """Return the subject labels listed in an HTML results file.

    Args:
        file_path: Path to the HTML report; it is read as UTF-8.

    Returns:
        A list of strings, one per ``Subject: <text>`` line in the document
        text. A match requires a newline directly before ``Subject:`` and
        directly after the captured text.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        raw_html = handle.read()

    # Search the text content of the page, not the markup itself.
    page_text = BeautifulSoup(raw_html, "html.parser").get_text()

    return re.findall(r'\nSubject:\s(.+)\n', page_text)

## Confidence Distributions

- GPT-4.1-mini
- GPT-4.1-nano
- meta-llama/Llama-2-70b-hf (currently commented out of `all_models` in the code below)
- meta-llama/Llama-3.2-3B-Instruct-Turbo
- meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo
- meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo
- meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo

In [None]:
# Path template for one HTML report; fill in ``model`` and ``confidence``
# with ``str.format``.
html_filename = "../results/mmlu_pro_{model}_{confidence}_shared_sampling_None.html"

# Models to iterate over. The Llama-2-70b-hf entry is currently disabled.
all_models = [
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    # "meta-llama/Llama-2-70b-hf",
    "meta-llama/Llama-3.2-3B-Instruct-Turbo",
    "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo",
    "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo",
    "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo",
]

In [None]:

# For each model, draw the two confidence distributions side by side.
for model in all_models:
    # Load both confidence series before creating the figure, so a missing
    # report does not leave an empty figure behind.
    verbal_num = get_confidence(html_filename.format(model=model, confidence="verbal_numerical"))
    logits_perp = get_confidence(html_filename.format(model=model, confidence="logit_perplexity"))

    fig, axes = plt.subplots(1, 2, figsize=(12, 5))

    # (axis, samples, label used in the title). The logit-perplexity panel is
    # labelled without "Verbal": it does not show the verbal confidence.
    panels = (
        (axes[0], verbal_num, "Verbal Numerical"),
        (axes[1], logits_perp, "Logit Perplexity"),
    )
    for axis, samples, method_label in panels:
        axis.hist(samples, density=True, bins=30)
        axis.set_title(f'{model} {method_label} Confidence Distribution')
        axis.set_xlabel('Confidence')
        axis.set_ylabel('Density')

    plt.tight_layout()
    plt.show()

KeyboardInterrupt: 

## Confidence vs. Accuracy Scatter Plots

In [None]:
# Path template for one JSON results file; fill in ``model`` and
# ``confidence`` with ``str.format``.
json_filesname = "../results/mmlu_pro_{model}_{confidence}_shared_sampling_None.json"