# Scorer Notebook
The main notebook for postprocessing the results and scoring them. Postprocessing can be run either via the CLI or through this notebook.

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import argparse
import json
import os
import gc
import warnings
from typing import List, Dict, Any
import time
import torch
from datasets import load_dataset, load_from_disk
from tqdm import tqdm
from vllm import LLM, SamplingParams
from utils import postprocessing, scoring_function

# Put the sibling source directory and the parent directory at the front of the
# module search path so that `config` can be imported below. Both paths are
# relative, so they resolve against the current working directory.
# NOTE(review): `from utils import ...` above executes before these inserts; if
# `utils` lives under ../src/ or .., that import needs to move below them — confirm.
sys.path.insert(0, "../src/")
sys.path.insert(0, "..")
from config import REPO_ROOT

## Postprocessing

This step may already have been run by the bash script.

In [None]:
# Run postprocessing on a single results file.
# Other model files noted for this cell: o3, gpt4_1 (currently claude_3_7_sonnet).
standard_filepath = (
    REPO_ROOT / "results/heart_disease_minimal/claude_3_7_sonnet.json"
)
results = postprocessing(
    standard_filepath,
    force=True,
    distance_metric="gower",
    save=True,
    explicit_dataset="",
)

In [None]:
# Run postprocessing on the temperature_1 analysis file of every dataset variant
# for one model (other model files noted for this cell: o3, gpt4_1).
datasets = [
    "income",
    "income_minimal",
    "house_prices",
    "house_prices_minimal",
    "heart_disease",
    "heart_disease_minimal",
]
for dataset in datasets:
    standard_filepath = (
        REPO_ROOT / f"analysis/temperature_1/{dataset}/claude_3_7_sonnet.json"
    )
    # After the loop, `standard_filepath` and `results` hold the values for the
    # last entry in `datasets`.
    results = postprocessing(
        standard_filepath,
        force=True,
        distance_metric="gower",
        save=True,
        explicit_dataset="",
    )

## Scoring

Score the results for specific models and datasets

In [None]:
# Score the file that `standard_filepath` currently points to, i.e. whatever the
# most recently executed postprocessing cell above assigned, then show the result.
scores = scoring_function(
    standard_filepath,
    verbose=1,
    overwrite=False,
)
print(scores)

#### Summary stats

In [None]:
"""
Compute the mean and standard deviation (population and sample)
for the exact-match percentages reported in Table 2.
"""

# Table values: (Income, House prices, Heart disease) for each model
values = [
    0.00, 0.00, 0.15,   # Gemma 2 27B
    3.33, 0.00, 0.00,   # Llama 3 3 70B
    6.89, 0.06, 0.05,   # DeepSeek-R1 32B
    19.70, 0.19, 0.42,  # DeepSeek-R1 70B
    9.95, 0.00, 0.00,   # Claude Sonnet 3.7
    15.70, 0.00, 0.00,  # GPT-4.1
    4.27, 0.00, 0.00    # o3
]

# Convert to NumPy array (optional but convenient)
arr = np.array(values)

mean_val = np.mean(arr)
std_population = np.std(arr, ddof=0)  # Population SD (N)
std_sample = np.std(arr, ddof=1)      # Sample SD (N-1)

print(f"Values (n={len(arr)}):\n{arr}\n")
print(f"Mean                 : {mean_val:.6f}")
print(f"Population SD (σ)    : {std_population:.6f}")
print(f"Sample SD (s, unbiased): {std_sample:.6f}")
