In [25]:
import re
import numpy as np
from bs4 import BeautifulSoup

def get_success_failed_indexes(file_path):
    """
    Split the samples of an HTML report into successful and failed ones.

    Every ``<h3>`` whose stripped text is "Sampled message" counts as one
    sample; samples are numbered from 0 in document order. A sample is
    *failed* when the "Extracted Answer:" paragraph following its "Results"
    heading contains "None" or "Z" (presumably the placeholders written when
    no answer could be extracted -- confirm against the report generator).
    Every other sample, including one without a "Results" section, is a
    *success*.

    Args:
        file_path: Path to the HTML report; read as UTF-8.

    Returns:
        Tuple ``(success, failed)`` of integer NumPy arrays holding the sample
        indexes of each group in ascending order. Both arrays have an integer
        dtype even when empty, so they can always be used for indexing.
    """
    with open(file_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f, "html.parser")

    success = []
    failed = []
    sample_headers = (
        h3 for h3 in soup.find_all("h3") if h3.text.strip() == "Sampled message"
    )
    for index, h3 in enumerate(sample_headers):
        assistant_div = h3.find_next_sibling("div", class_="message assistant")
        # Fall back to the header itself when the assistant block is missing,
        # instead of crashing on None.
        anchor = assistant_div if assistant_div is not None else h3
        results_h3 = anchor.find_next_sibling("h3")

        extraction_failed = False
        # Only trust the next heading if it really is the "Results" section.
        if results_h3 is not None and results_h3.text.strip() == "Results":
            extracted_answer_p = results_h3.find_next_sibling(
                "p", string=lambda t: t and "Extracted Answer:" in t
            )
            if extracted_answer_p is not None:
                answer_text = extracted_answer_p.text
                extraction_failed = "None" in answer_text or "Z" in answer_text

        # Each sample lands in exactly one of the two lists.
        if extraction_failed:
            failed.append(index)
        else:
            success.append(index)

    # Explicit integer dtype: np.array([]) would otherwise be float64, which
    # NumPy refuses to use as an index array.
    return np.array(success, dtype=int), np.array(failed, dtype=int)

def get_confidence(file_path):
    """
    Extract every "Extracted Answer Confidence" value from an HTML report.

    Args:
        file_path: Path to the HTML report; read as UTF-8.

    Returns:
        1-D float NumPy array with one entry per matched confidence, in
        document order. Occurrences whose value is not numeric do not match
        and are skipped, so the array can be shorter than the number of
        samples in the report.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()

    # Work on the rendered text so markup between label and value is ignored.
    soup = BeautifulSoup(html_content, "html.parser")

    # Optional integer part, optional decimal point, then at least one digit.
    # Only a well-formed number is captured, so trailing punctuation such as
    # "0.9." no longer ends up in the match and breaks the float conversion.
    pattern = re.compile(r"Extracted Answer Confidence:\s([0-9]*\.?[0-9]+)")
    matches = pattern.findall(soup.get_text())

    return np.array(matches, dtype=float)

def get_accuracy(file_path):
    """
    Collect every "Score:" value found in an HTML report.

    Args:
        file_path: Path to the HTML report; read as UTF-8.

    Returns:
        1-D float NumPy array of the matched scores, in document order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        page_text = BeautifulSoup(handle.read(), "html.parser").get_text()

    # Search the rendered text rather than the raw markup.
    score_strings = re.findall(r"Score:\s([0-1]*\.?[0-9]+)", page_text)
    return np.array(score_strings, dtype=float)

def get_subject(file_path):
    """
    Collect the text following every "Subject:" label in an HTML report.

    Args:
        file_path: Path to the HTML report; read as UTF-8.

    Returns:
        List of strings, one per line of rendered text that starts with
        "Subject:", in document order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        page_text = BeautifulSoup(handle.read(), "html.parser").get_text()

    # The label must start a line and its value runs to the end of that line.
    subjects = re.findall(r'\nSubject:\s(.+)\n', page_text)
    return subjects

In [26]:
# Extract the confidence of each elicitation mode plus the accuracy, and save them to CSV
import pandas as pd
import numpy as np

def extract_confidence_and_acc(model_name,  linguistic_conf_file_path):
    """
    Load accuracy, confidences and success information for one model.

    The logit-perplexity and verbal-numerical reports are located from
    ``model_name`` under ``../results/``; the linguistic report path is passed
    in explicitly. Accuracy is read from the verbal-numerical report.

    Args:
        model_name: Model identifier substituted into the report file names.
        linguistic_conf_file_path: Path to the linguistic-confidence report.

    Returns:
        A 10-tuple, in this order:
        ``acc_list``, ``linguistic_conf``, ``logits_perp_conf``,
        ``verbal_num_conf`` (float arrays);
        ``linguistic_success``, ``logit_success``, ``numerical_success``
        (integer index arrays of successful samples);
        ``linguistic_success_bool``, ``logit_success_bool``,
        ``numerical_success_bool`` (boolean masks, one entry per confidence
        value of the matching report, True at the successful indexes).

    Raises:
        IndexError: If a success index is outside the matching confidence
            array, i.e. the report yielded fewer confidences than samples.
    """
    html_filename = """../results/mmlu_pro_{model}_{confidence}_shared_sampling_None.html"""
    # Format each report path once instead of repeating the call per use.
    logit_path = html_filename.format(model=model_name, confidence="logit_perplexity")
    numerical_path = html_filename.format(model=model_name, confidence="verbal_numerical")

    linguistic_conf = get_confidence(linguistic_conf_file_path)
    logits_perp_conf = get_confidence(logit_path)
    verbal_num_conf = get_confidence(numerical_path)
    acc_list = get_accuracy(numerical_path)

    # Only the successful indexes are needed here; the failed ones are implied.
    linguistic_success, _ = get_success_failed_indexes(linguistic_conf_file_path)
    numerical_success, _ = get_success_failed_indexes(numerical_path)
    logit_success, _ = get_success_failed_indexes(logit_path)

    # An empty index array may arrive as float64 (np.array([])), which NumPy
    # rejects for indexing, so force an integer dtype before building masks.
    linguistic_success = np.asarray(linguistic_success, dtype=int)
    logit_success = np.asarray(logit_success, dtype=int)
    numerical_success = np.asarray(numerical_success, dtype=int)

    def _success_mask(length, success_indexes):
        # Boolean vector of the given length, True at the successful indexes.
        mask = np.zeros(length, dtype=bool)
        mask[success_indexes] = True
        return mask

    linguistic_success_bool = _success_mask(linguistic_conf.shape[0], linguistic_success)
    logit_success_bool = _success_mask(logits_perp_conf.shape[0], logit_success)
    numerical_success_bool = _success_mask(verbal_num_conf.shape[0], numerical_success)

    return acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,\
                        linguistic_success, logit_success, numerical_success,\
                        linguistic_success_bool, logit_success_bool, numerical_success_bool

## gpt-4.1-mini

In [27]:
model_name = "gpt-4.1-mini"
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_gpt-4.1-mini_verbal_linguistic_shared_sampling_None_meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

## gpt-4.1-nano

In [28]:
model_name = "gpt-4.1-nano"
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_gpt-4.1-nano_verbal_linguistic_shared_sampling_None_meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

## Llama-3.2-3B-Instruct-Turbo

In [29]:
model_name = "Llama-3.2-3B-Instruct-Turbo"
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_Llama-3.2-3B-Instruct-Turbo_verbal_linguistic_shared_sampling_None_meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

## Meta-Llama-3.1-8B-Instruct-Turbo

In [30]:
model_name = "Meta-Llama-3.1-8B-Instruct-Turbo"
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_Meta-Llama-3.1-8B-Instruct-Turbo_verbal_linguistic_shared_sampling_None_meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

## Meta-Llama-3.1-70B-Instruct-Turbo

In [31]:
model_name = "Meta-Llama-3.1-70B-Instruct-Turbo"
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_Meta-Llama-3.1-70B-Instruct-Turbo_verbal_linguistic_shared_sampling_None_meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

## Meta-Llama-3.1-405B-Instruct-Turbo

In [32]:
model_name = "Meta-Llama-3.1-405B-Instruct-Turbo"
# NOTE(review): unlike the judge paths used for the other models, this one has
# no "meta-llama/" directory component before the judge name -- confirm that it
# matches the file on disk.
linguistic_conf_file_path = "../results/linguistic-judges/mmlu_pro_Meta-Llama-3.1-405B-Instruct-Turbo_verbal_linguistic_shared_sampling_None_Llama-4-Maverick-17B-128E-Instruct-FP8_dec_judge.html"

# Accuracy, the three confidence arrays, the success indexes and the
# per-sample success masks for this model.
(
    acc_list, linguistic_conf, logits_perp_conf, verbal_num_conf,
    linguistic_success, logit_success, numerical_success,
    linguistic_success_bool, logit_success_bool, numerical_success_bool,
) = extract_confidence_and_acc(model_name, linguistic_conf_file_path)

# One column per confidence type, plus accuracy and the success flags.
df = pd.DataFrame(
    {
        'Linguistic Confidence': linguistic_conf,
        'Logit Perplexity Confidence': logits_perp_conf,
        'Verbal Numerical Confidence': verbal_num_conf,
        'Accuracy': acc_list,
        'Linguistic Success': linguistic_success_bool,
        'Logit Success': logit_success_bool,
        'Numerical Success': numerical_success_bool,
    }
)

# Written relative to the working directory; the file name has no extension.
df.to_csv(f"{model_name}_confidence_acc_success", index=False)

In [33]:
model_name_lists = [
    "Llama-3.2-3B-Instruct-Turbo",
    "Meta-Llama-3.1-8B-Instruct-Turbo",
    "Meta-Llama-3.1-70B-Instruct-Turbo",
    "Meta-Llama-3.1-405B-Instruct-Turbo",
    "gpt-4.1-mini",
    "gpt-4.1-nano",
    ]
# NOTE(review): the per-model tables are read from ../analysis/ -- presumably
# the directory this notebook runs in; confirm it matches where they are saved.
path_format = """../analysis/{model_name}_confidence_acc_success"""
for model_name in model_name_lists:
    file_path = path_format.format(model_name=model_name)
    df = pd.read_csv(file_path)

    # Row-wise failure flags for each confidence type.
    linguistic_failed = df['Linguistic Success'] == 0
    logit_failed = df['Logit Success'] == 0
    numerical_failed = df['Numerical Success'] == 0
    # "crossfail" counts rows where at least one of the three failed.
    any_failed = linguistic_failed | logit_failed | numerical_failed

    print(
        f"Number of failed samples for {model_name}: "
        f"crossfail-{int(any_failed.sum())}, "
        f"linguistic-{int(linguistic_failed.sum())}, "
        f"logit-{int(logit_failed.sum())}, "
        f"numerical-{int(numerical_failed.sum())}"
    )
    

Number of failed samples for Llama-3.2-3B-Instruct-Turbo: crossfail-80, linguistic-58, logit-61, numerical-79
Number of failed samples for Meta-Llama-3.1-8B-Instruct-Turbo: crossfail-467, linguistic-258, logit-310, numerical-411
Number of failed samples for Meta-Llama-3.1-70B-Instruct-Turbo: crossfail-112, linguistic-63, logit-67, numerical-111
Number of failed samples for Meta-Llama-3.1-405B-Instruct-Turbo: crossfail-77, linguistic-44, logit-47, numerical-75
Number of failed samples for gpt-4.1-mini: crossfail-250, linguistic-225, logit-79, numerical-122
Number of failed samples for gpt-4.1-nano: crossfail-324, linguistic-126, logit-268, numerical-302
