# Benchmark Evaluation Demo

This notebook walks you through running predictions on the LLM-AggreFact benchmark and obtaining the evaluation results.

In [1]:
import pandas as pd
from datasets import load_dataset
from sklearn.metrics import balanced_accuracy_score
from minicheck.minicheck import MiniCheck
import os
# Expose only GPU 0 to this process.
# NOTE(review): this is assigned after `minicheck` has already been imported
# above; it only has an effect if CUDA has not been initialized by then — confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# Load the test split of LLM-AggreFact and convert it through the `datasets`
# API (`Dataset.to_pandas()`) rather than letting pandas iterate over the
# dataset object row by row.
df = load_dataset("lytang/LLM-AggreFact", split="test").to_pandas()

# Documents and claims as NumPy arrays, row-aligned with `df`.
# `.to_numpy()` is the accessor pandas recommends over `.values` and always
# yields an ndarray regardless of the column's backing dtype.
docs = df["doc"].to_numpy()
claims = df["claim"].to_numpy()

In [None]:
# Name of the fact-checking model to evaluate.
model_name = 'Granite-Guardian-3.3-8B'

# Build the scorer once; it is reused by the prediction cell below.
# Prefix caching is explicitly turned off, and './ckpts' is passed as the
# cache directory (presumably where model weights are stored — verify
# against the MiniCheck documentation).
scorer = MiniCheck(
    model_name=model_name,
    enable_prefix_caching=False,
    cache_dir='./ckpts',
)

## Predict the labels
In this demo, running `Granite-Guardian-3.3-8B` (implemented with vLLM) on the entire test set (29K) takes ~50 minutes on a single NVIDIA H100 (48GB VRAM).

In [None]:
# Score every (document, claim) pair of the test set.
#   raw_prob   : the raw probability produced by the scorer
#   pred_label : raw_prob binarized into 1/0 using the threshold 0.5
# The scorer returns four values; the last two are discarded here.
pred_label, raw_prob, _, _ = scorer.score(
    docs=docs,
    claims=claims,
)

## Check performance on LLM-AggreFact

In [4]:
# Attach the predictions to the benchmark frame so each row carries both its
# gold `label` and its predicted label.
df['preds'] = pred_label

# Balanced accuracy (in percent) for every sub-dataset.
# A single groupby pass replaces one boolean scan of `df` per dataset, and
# `sort=False` keeps the datasets in order of first appearance. The rows are
# collected first and the frame is built once, instead of growing an empty
# (object-dtype) DataFrame row by row.
records = [
    (name, balanced_accuracy_score(group['label'], group['preds']) * 100)
    for name, group in df.groupby('dataset', sort=False)
]
per_dataset_df = pd.DataFrame(records, columns=['Dataset', 'BAcc'])

# Final row: unweighted mean of the per-dataset scores (every dataset counts
# equally, regardless of how many examples it contains).
average_df = pd.DataFrame(
    [('Average', per_dataset_df['BAcc'].mean())],
    columns=['Dataset', 'BAcc'],
)
result_df = pd.concat([per_dataset_df, average_df], ignore_index=True)

# Display the table rounded to one decimal; `result_df` keeps full precision.
result_df.round(1)

Unnamed: 0,Dataset,BAcc
0,AggreFact-CNN,67.0
1,AggreFact-XSum,74.9
2,TofuEval-MediaS,74.0
3,TofuEval-MeetB,78.6
4,Wice,76.6
5,Reveal,89.6
6,ClaimVerify,75.9
7,FactCheck-GPT,76.1
8,ExpertQA,59.6
9,Lfqa,86.9
