# Evaluation

In [5]:
import json
import sys
import tqdm
import matplotlib.pyplot as plt

from transformers import AutoTokenizer
import numpy as np

sys.path.append("..")

from inference.utils.evaluators import extract_boxed_expressions

  from .autonotebook import tqdm as notebook_tqdm


## Evaluation Code

In [7]:
from concurrent.futures import as_completed
from utils.evaluators import evaluate_single, MathEvaluator, parse_single
import warnings
from concurrent.futures import TimeoutError as FuturesTimeoutError
from pebble import ProcessPool, ProcessExpired
from concurrent.futures import as_completed, TimeoutError as FuturesTimeoutError

# Silence any warning whose message matches "Timeout is disabled" so it does
# not flood the notebook output while the evaluation cells run.
warnings.filterwarnings(action='ignore', message='.*Timeout is disabled.*')


# Module-level evaluator instance; the rollout helpers in this notebook call
# its ``add_boxed`` method on every extracted answer.
evaluator = MathEvaluator()


def evaluate_rollouts(rollouts, timeout_seconds=5.0, max_workers=8, verbose=False):
    """Score every turn of every rollout against its label in a process pool.

    Each entry of ``sample["reasoning_store"]`` is passed through
    ``extract_boxed_expressions`` and ``evaluator.add_boxed``; the resulting
    prediction and ``sample["label"]`` are then handed to ``evaluate_single``
    inside a ``ProcessPool`` worker.

    Args:
        rollouts: Sized sequence of dicts providing the ``"reasoning_store"``
            (iterable of turn outputs) and ``"label"`` keys.
        timeout_seconds: Per-task timeout passed to ``ProcessPool.schedule``.
        max_workers: Number of worker processes in the pool.
        verbose: When true, print a diagnostic for every task that fails.

    Returns:
        One list per sample holding, for each turn, the value returned by
        ``evaluate_single``, or ``False`` when the task timed out, its worker
        expired, or it raised any other exception.
    """
    scores = [[] for _ in range(len(rollouts))]

    # Flatten all (sample, turn) pairs so a single pool can process them.
    jobs = []
    for sample_idx, sample in enumerate(rollouts):
        for turn_idx, output in enumerate(sample["reasoning_store"]):
            prediction = evaluator.add_boxed(extract_boxed_expressions(output))
            jobs.append((sample_idx, turn_idx, prediction, sample["label"]))

    with ProcessPool(max_workers=max_workers) as pool:
        # Map each future straight to the job it was created for.
        pending = {}
        for job in jobs:
            _, _, prediction, label = job
            future = pool.schedule(
                evaluate_single, args=(prediction, label), timeout=timeout_seconds
            )
            pending[future] = job

        for future in tqdm.tqdm(as_completed(pending), total=len(jobs)):
            sample_idx, turn_idx, prediction, label = pending[future]
            try:
                outcome = future.result()
            except (ProcessExpired, FuturesTimeoutError) as exc:
                # A timeout or a dead worker counts as an incorrect answer.
                outcome = False
                if verbose:
                    print(f"Evaluation timed out for sample {sample_idx}, step {turn_idx} (Reason: {type(exc).__name__})")
                    print(f"  Prediction: {prediction}")
                    print(f"  Label: {label}")
            except Exception as exc:
                # Best effort: an evaluator failure also counts as incorrect.
                outcome = False
                if verbose:
                    print(f"Evaluator error for sample {sample_idx}, step {turn_idx}: {exc}")

            # Futures complete in arbitrary order, so grow the row up to this
            # turn before storing the result at its position.
            row = scores[sample_idx]
            shortfall = turn_idx + 1 - len(row)
            if shortfall > 0:
                row.extend([False] * shortfall)
            row[turn_idx] = outcome
    return scores


def parse_rollouts(rollouts, timeout_seconds=5.0, max_workers=8, verbose=False):
    """Parse the boxed answer of every turn of every rollout in a process pool.

    Each entry of ``sample["reasoning_store"]`` is passed through
    ``extract_boxed_expressions`` and ``evaluator.add_boxed``; the resulting
    prediction is then handed to ``parse_single`` inside a ``ProcessPool``
    worker.

    Args:
        rollouts: Sized sequence of dicts providing the ``"reasoning_store"``
            (iterable of turn outputs) and ``"label"`` keys.
        timeout_seconds: Per-task timeout passed to ``ProcessPool.schedule``.
        max_workers: Number of worker processes in the pool.
        verbose: When true, print a diagnostic for every task that fails.

    Returns:
        One list per sample holding, for each turn, the value returned by
        ``parse_single``, or ``None`` when the task timed out, its worker
        expired, or it raised any other exception.
    """
    parsed = [[] for _ in range(len(rollouts))]

    # Flatten all (sample, turn) pairs so a single pool can process them.
    jobs = []
    for sample_idx, sample in enumerate(rollouts):
        for turn_idx, output in enumerate(sample["reasoning_store"]):
            prediction = evaluator.add_boxed(extract_boxed_expressions(output))
            # The label is not used for parsing but is still read here, as
            # in the original metadata tuple.
            jobs.append((sample_idx, turn_idx, prediction, sample["label"]))

    with ProcessPool(max_workers=max_workers) as pool:
        # Map each future straight to the job it was created for.
        pending = {}
        for job in jobs:
            prediction = job[2]
            future = pool.schedule(
                parse_single, args=(prediction,), timeout=timeout_seconds
            )
            pending[future] = job

        for future in tqdm.tqdm(as_completed(pending), total=len(jobs)):
            sample_idx, turn_idx, prediction, _label = pending[future]
            try:
                outcome = future.result()
            except (ProcessExpired, FuturesTimeoutError) as exc:
                # A timeout or a dead worker yields no parse result.
                outcome = None
                if verbose:
                    print(f"Parsing timed out for sample {sample_idx}, step {turn_idx} (Reason: {type(exc).__name__})")
                    print(f"  Prediction: {prediction}")
            except Exception as exc:
                # Best effort: a parser failure also yields no parse result.
                outcome = None
                if verbose:
                    print(f"Parser error for sample {sample_idx}, step {turn_idx}: {exc}")

            # Futures complete in arbitrary order, so grow the row up to this
            # turn before storing the result at its position.
            row = parsed[sample_idx]
            shortfall = turn_idx + 1 - len(row)
            if shortfall > 0:
                row.extend([None] * shortfall)
            row[turn_idx] = outcome
    return parsed



def compute_metrics(eval_results):
    """Return the mean of ``eval_results`` along axis 0 as plain Python values.

    With one row per sample and one column per turn, this yields one mean
    per turn index, averaged over samples.

    NOTE(review): assumes every row has the same length so that NumPy can
    build a rectangular array -- confirm against the callers.
    """
    return np.array(eval_results).mean(axis=0).tolist()


def extract_correct_predictions_and_labels(rollouts, eval_results, turn_idx=None):
    """Collect the extracted prediction and label for every truthy result.

    Args:
        rollouts: Iterable of dicts providing ``"reasoning_store"``,
            ``"label"`` and ``"problem_id"``.
        eval_results: Per-sample, per-turn results indexed as
            ``eval_results[sample][turn]``; only truthy entries are kept.
        turn_idx: When not ``None``, keep only the turn with this index.

    Returns:
        A list of ``{"prediction", "label", "id"}`` dicts, in sample order
        and then turn order.
    """
    kept = []
    for sample_no, sample in enumerate(rollouts):
        for turn_no, turn_text in enumerate(sample["reasoning_store"]):
            # Skip turns whose result is falsy.
            if not eval_results[sample_no][turn_no]:
                continue
            # Skip turns outside the requested index, if one was given.
            if turn_idx is not None and turn_no != turn_idx:
                continue
            kept.append(
                {
                    "prediction": extract_boxed_expressions(turn_text),
                    "label": sample["label"],
                    "id": sample["problem_id"],
                }
            )
    return kept


def reformat_baseline(rollouts):
    """Flatten baseline samples into one single-turn record per rollout.

    Every ``(key, value)`` pair in ``sample["rollouts"]`` becomes its own
    record: the value is wrapped in a one-element ``"reasoning_store"``
    list, the key is stored as ``"sample_id"``, and ``"answer"`` / ``"id"``
    are renamed to ``"label"`` / ``"problem_id"``.

    Args:
        rollouts: Iterable of dicts providing ``"problem"``, ``"answer"``,
            ``"id"`` and a ``"rollouts"`` mapping.

    Returns:
        A list of the flattened records, in sample order and then in the
        iteration order of each ``"rollouts"`` mapping.
    """
    return [
        {
            "problem": sample["problem"],
            "label": sample["answer"],
            "reasoning_store": [rollout_text],
            "problem_id": sample["id"],
            "sample_id": rollout_key,
        }
        for sample in rollouts
        for rollout_key, rollout_text in sample["rollouts"].items()
    ]

## Run Evaluation

In [8]:
# Load the saved rollouts. JSON text is UTF-8, so pin the encoding instead of
# relying on the platform's locale default (which is not UTF-8 everywhere).
with open("outputs/sample_outputs.json", "r", encoding="utf-8") as f:
    sample_outputs = json.load(f)


In [9]:
# Score every turn of every loaded sample (one row of results per sample),
# then average over samples to get one value per turn index.
eval_results = evaluate_rollouts(sample_outputs)
metrics = compute_metrics(eval_results)

100%|██████████| 5760/5760 [00:02<00:00, 2680.93it/s]


In [10]:
# Display the per-turn means as the cell output.
metrics

[0.45625,
 0.5229166666666667,
 0.5625,
 0.5979166666666667,
 0.59375,
 0.6020833333333333,
 0.6229166666666667,
 0.625,
 0.625,
 0.61875,
 0.6375,
 0.6375]