In [1]:
import os
# Hide every GPU from this process; set before other libraries are imported so that
# anything CUDA-aware sees an empty device list.
os.environ['CUDA_VISIBLE_DEVICES'] = ''
# All paths in this notebook are relative to the directory that contains `vrevals`.
# Only step up when that directory is not already visible, so re-running this cell
# does not keep climbing the directory tree.
if not os.path.isdir("vrevals"):
    os.chdir("..")

import sys
# Make the project's helper modules importable; avoid duplicate entries on re-run.
if "./vrevals" not in sys.path:
    sys.path.append("./vrevals")

In [2]:
import re
import json
import numpy as np
import pandas as pd
import yaml
import time
import glob

In [3]:
from collections import Counter
from utils.math_equivalence import is_equiv
from utils.pass_k_utils import estimate_pass_at_k

In [67]:
def _last_braced_argument(text, command):
    r"""Return the argument of the last complete ``\command{...}`` in ``text``.

    Braces are matched by depth, so nested groups such as ``\frac{1}{2}`` stay
    intact and the argument may span several lines. Backslash-escaped characters
    (e.g. ``\{``) are skipped and never counted as grouping braces. Occurrences
    nested inside an already matched argument are not reported separately.

    Returns ``None`` when no balanced occurrence exists.
    """
    marker = "\\" + command + "{"
    found = None
    search_from = 0
    while True:
        begin = text.find(marker, search_from)
        if begin == -1:
            return found
        content_start = begin + len(marker)
        depth = 1
        pos = content_start
        while pos < len(text) and depth > 0:
            ch = text[pos]
            if ch == "\\":
                # Skip the escaped character so "\{" / "\}" do not affect the depth.
                pos += 2
                continue
            if ch == "{":
                depth += 1
            elif ch == "}":
                depth -= 1
            pos += 1
        if depth == 0:
            found = text[content_start:pos - 1]
            search_from = pos
        else:
            # Unbalanced (e.g. truncated generation): ignore it, keep looking further on.
            search_from = content_start


def extract_math_answer(output, mode='gen'):
    r"""Extract the final ``\boxed{...}`` answer from a model response.

    Args:
        output: Raw model response. If it contains ``</think>``, only the segment
            between the first and second ``</think>`` (or the end) is searched.
            Non-string values (e.g. NaN from an empty CSV cell) yield ``''``.
        mode: ``'gen'`` returns the boxed content as is. ``'choose'`` and ``'qa'``
            additionally unwrap a ``\text{...}`` inside the box and strip
            surrounding parentheses.

    Returns:
        The content of the last balanced ``\boxed{...}``, or ``''`` if none is found.
    """
    if not isinstance(output, str):
        return ''
    if "</think>" in output:
        output = output.split("</think>")[1]

    # A greedy regex would run from the first "\boxed{" to the last "}" on the line;
    # brace matching isolates exactly one boxed argument instead.
    extracted_text = _last_braced_argument(output, "boxed")
    if extracted_text is None:
        return ''

    if mode in ['choose', 'qa']:
        inner_text = _last_braced_argument(extracted_text, "text")
        if inner_text is not None:
            extracted_text = inner_text
        extracted_text = extracted_text.strip("()")
    return extracted_text

def normalize_answer(text):
    """Lower-case ``text`` and collapse each run of whitespace into one space.

    Leading and trailing whitespace is removed as part of the collapse.
    """
    return " ".join(text.lower().split())

def evaluate_predictions(pred_answer, labeled_answer):
    """Score one predicted answer against its reference answer.

    Both strings are passed through ``normalize_answer`` before comparison.

    Returns a dict with:
        is_valid_answer: True when ``pred_answer`` is not the empty string.
        acc: 1 if the normalized reference is a substring of the normalized prediction.
        em: 1 if the normalized strings are identical.
        f1: whitespace-token overlap F1 (integer 0 when no token is shared).
        math_equal: whatever ``is_equiv`` returns for the two normalized strings.
    """
    pred = normalize_answer(pred_answer)
    gold = normalize_answer(labeled_answer)

    pred_tokens = pred.split()
    gold_tokens = gold.split()
    # Multiset intersection: a token counts as often as it appears in both sides.
    overlap = sum((Counter(pred_tokens) & Counter(gold_tokens)).values())

    if overlap:
        # A non-zero overlap implies both token lists are non-empty.
        precision = overlap / len(pred_tokens)
        recall = overlap / len(gold_tokens)
        f1 = (2 * precision * recall) / (precision + recall)
    else:
        f1 = 0

    return {
        "is_valid_answer": pred_answer != '',
        "acc": int(gold in pred),
        "em": int(pred == gold),
        "f1": f1,
        "math_equal": is_equiv(pred, gold),
    }

In [94]:
# Evaluation settings for this run. Only `dataset_name` and `split` are read by the
# cells visible in this notebook; the other fields are not referenced here.
class Args:
    dataset_name = "gsm8k"
    split = "test"
    k_list = [1,4,8,32]
    subset_num = None
    step_by_step_prompt = True
    n_threads = 1
args = Args()
# Directory of the run being evaluated, relative to the current working directory.
job_dir = f"vrevals/runs/default/{args.dataset_name}.qwen-1.5b-inst"
# NOTE(review): `prompt_csv_path` is defined but not used by the cells visible here.
prompt_csv_path = f'{job_dir}/{args.split}.prompts.csv'
# sampler_config_dir = f'{job_dir}/distilled-50.direct/sample_1'
# sampler_config_dir = f'{job_dir}/distilled-50.direct/sample_2'

# sampler_config_dir = f'{job_dir}/distilled-50.direct/sample_*'
# `sampler_config_dir` is a glob pattern, not necessarily a single directory: it is
# expanded with `glob.glob` when the generation CSVs are loaded.
sampler_config_dir = f'{job_dir}/distilled-50.direct/sample_[123]'
# sampler_config_dir = f'{job_dir}/direct/sample_*'

# sampler_config_dir = f'{job_dir}/direct/sample_1'
# with open(f"{sampler_config_dir}/sampler_config.yaml", "r") as f:
#     sampler_config = yaml.safe_load(f)
# sampler_config

In [95]:
# Collect the generation CSVs of every sampler directory matched by the pattern.
# glob returns paths in arbitrary order, so sort them to keep the concatenated
# row order reproducible across runs and machines.
generation_csv_pattern = f"{sampler_config_dir}/generations.*.csv"
all_generation_csv = sorted(glob.glob(generation_csv_pattern))
if not all_generation_csv:
    # Fail with the offending pattern instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No generation CSV files match: {generation_csv_pattern}")
all_generation_df = [pd.read_csv(p) for p in all_generation_csv]
# Concatenate all dataframes in all_generation_df into a single dataframe
generation_df = pd.concat(all_generation_df, ignore_index=True)
# read_csv turns empty response cells into NaN; treat them as empty strings so the
# answer extraction sees text and simply reports "no answer".
generation_df['pred_answer'] = generation_df.response.fillna('').apply(extract_math_answer)
metrics = generation_df.apply(lambda x: evaluate_predictions(str(x['pred_answer']), str(x['gt_answer'])), axis=1)
generation_df['is_correct'] = [e['math_equal'] for e in metrics]
generation_df['is_valid'] = [e['is_valid_answer'] for e in metrics]

In [96]:
# Show the scored generations (predicted vs. reference answer, correctness and validity flags).
generation_df

Unnamed: 0,question_id,prompt_id,response,pred_answer,gt_answer,sampler_config,is_correct,is_valid
0,0,0,"Okay, so Janet's ducks lay 16 eggs each day. H...",18,18,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
1,0,0,"Okay, so I need to figure out how much Janet m...",18,18,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
2,1,1,"Okay, let's see. I have this math problem here...",3,3,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
3,1,1,"Okay, so I need to solve this math problem. Le...",3,3,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
4,2,2,"Okay, so Josh is trying to make some money by ...",70000,70000,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
...,...,...,...,...,...,...,...,...
17142,1314,1314,"Okay, so I need to solve this math problem ste...",,8,{'tokenizer': {'pretrained_model_name_or_path'...,False,False
17143,1315,1315,"Okay, so I have this math problem here about o...",5,5,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
17144,1316,1316,"Okay, so Mark's car broke down, and he needs a...",230,230,{'tokenizer': {'pretrained_model_name_or_path'...,True,True
17145,1317,1317,"Okay, so Farmer Brown has 20 animals on his fa...",5,5,{'tokenizer': {'pretrained_model_name_or_path'...,True,True


In [97]:
# Per (question, prompt): how many samples were drawn and how many yielded an answer.
# NOTE(review): this cell aggregates `is_valid`, not `is_correct`. The column
# `num_math_equal` and the "pass@k" keys therefore describe the chance that at
# least one of k samples produced an extractable answer, not a correct one.
grouped_df = generation_df.groupby(['question_id', 'prompt_id'])

valid_grouped = grouped_df['is_valid'].apply(list).reset_index(name='valids')
valid_grouped['num_samples'] = valid_grouped['valids'].apply(len)
valid_grouped['num_math_equal'] = valid_grouped['valids'].apply(sum)

# Only report k values that every question has enough samples for.
min_num_samples = valid_grouped['num_samples'].min()
k_list = list(range(1, 8))
detail_pass_at_k = {
    f"pass@{k}": estimate_pass_at_k(
        valid_grouped['num_samples'].values,
        valid_grouped['num_math_equal'].values,
        k,
    )
    for k in k_list
    if (min_num_samples >= k).all()
}
# Average the per-question estimates into one number per k.
pass_at_k = {name: scores.mean() for name, scores in detail_pass_at_k.items()}
pass_at_k

{'pass@1': np.float64(0.8994576310724907),
 'pass@2': np.float64(0.9770124997570032),
 'pass@3': np.float64(0.9922700498894585),
 'pass@4': np.float64(0.9967224587391381),
 'pass@5': np.float64(0.9983334835495562),
 'pass@6': np.float64(0.9990156419269384),
 'pass@7': np.float64(0.9993536284286854)}

In [98]:
# {'pass@1': np.float64(0.9086429112964367),
#  'pass@2': np.float64(0.9840788476118272)}

In [99]:
# Filter the dataframe to keep only rows where the response is valid
# Keep only the responses from which an answer could be extracted.
valid_generation_df = generation_df.loc[generation_df['is_valid']]

# Per (question, prompt): correctness flags of the remaining (valid) samples.
# NOTE(review): because invalid rows are dropped before grouping, questions with no
# valid response disappear from the average, and the smallest group size limits
# which k values are reported below.
grouped_df = valid_generation_df.groupby(['question_id', 'prompt_id'])
corrects_grouped = grouped_df['is_correct'].apply(list).reset_index(name='corrects')
corrects_grouped['num_samples'] = corrects_grouped['corrects'].apply(len)
corrects_grouped['num_math_equal'] = corrects_grouped['corrects'].apply(sum)

# Only report k values that every remaining question has enough samples for.
min_num_samples = corrects_grouped['num_samples'].min()

k_list = list(range(1, 8))
detail_pass_at_k = {
    f"pass@{k}": estimate_pass_at_k(
        corrects_grouped['num_samples'].values,
        corrects_grouped['num_math_equal'].values,
        k,
    )
    for k in k_list
    if (min_num_samples >= k).all()
}
# Average the per-question estimates into one number per k.
pass_at_k = {name: scores.mean() for name, scores in detail_pass_at_k.items()}
pass_at_k

{'pass@1': np.float64(0.697746359023842)}

In [None]:
# {'pass@1': np.float64(0.694953077837839),
#  'pass@2': np.float64(0.8097839655079988)}

In [18]:
# Persist the most recently computed pass@k results as a timestamped JSON file.
overall_results = {
    'detail_pass_at_k': {k:v.tolist() for k,v in detail_pass_at_k.items()},
    'pass_at_k': pass_at_k,
}
final_metrics = {'overall': overall_results}

# `sampler_config_dir` may be a glob pattern (e.g. ".../sample_[123]") rather than a
# real directory. Joining a file name onto the pattern would point into a directory
# that does not exist, so climb to the nearest pattern-free parent, which is shared
# by all matched sample directories.
metrics_dir = sampler_config_dir
while glob.escape(metrics_dir) != metrics_dir:
    metrics_dir = os.path.dirname(metrics_dir)

t = time.localtime()
metrics_json_name = f'metrics.{t.tm_mon}.{t.tm_mday},{t.tm_hour}:{t.tm_min}.json'
with open(os.path.join(metrics_dir, metrics_json_name), mode='w', encoding='utf-8') as json_file:
    json.dump(final_metrics, json_file, indent=4, ensure_ascii=False)
