In [1]:
%load_ext dotenv
%dotenv
!deepeval set-local-model --model-name="llama-3.1-70b-instruct" --base-url="https://api.scaleway.ai/aad43137-8aab-4869-a46d-26760fafaec8/v1" --api-key="2c393978-62ac-4190-ae56-93cbe4e3561c"

/bin/bash: /home/hessel/miniconda3/envs/thesis/lib/libtinfo.so.6: no version information available (required by /bin/bash)
🙌 Congratulations! You're now using a local model for all evals that require an
LLM.


In [3]:
import logging
import pandas as pd
import nest_asyncio
from deepeval.metrics import HallucinationMetric
from deepeval.test_case import LLMTestCase
from deepeval import evaluate

from tqdm.notebook import tqdm
import warnings
warnings.filterwarnings('ignore')

logging.getLogger('deepeval').setLevel(logging.WARNING)
nest_asyncio.apply()

class HallucinationEvaluator:
    def __init__(self, threshold: float = 0.5):
        self.metric = HallucinationMetric(threshold=threshold, async_mode=True)
        self.models = ['chatgpt', 'claude', 'gemma', 'llama32', 'mistralnemo', 'phi']

    async def evaluate_batch(self, chunk: pd.DataFrame) -> list:
        results = []
        for model in self.models:
            test_cases = []
            prompts = []
            
            for _, row in chunk.iterrows():
                response = row.get(f'{model}_response', '')
                prompt = row['Prompt'].strip()
                
                if isinstance(response, str) and response.strip() and prompt:
                    test_cases.append(LLMTestCase(
                        input=prompt,
                        actual_output=response,
                        context=[prompt]
                    ))
                    prompts.append(prompt)
            
            if test_cases:
                scores = await evaluate(test_cases, [self.metric])
                results.extend([{
                    'prompt': prompt,
                    'model': model,
                    'hallucination_score': float(score.score),
                    'reason': str(score.reason)
                } for prompt, score in zip(prompts, scores)])
                
        return results

async def analyze_hallucinations(df_path: str, chunk_size: int = 50):
    df = pd.read_csv(df_path, sep=";", encoding='utf-8', on_bad_lines='warn')
    evaluator = HallucinationEvaluator()
    chunks = [df[i:i + chunk_size] for i in range(0, len(df), chunk_size)]
    
    all_results = []
    for chunk in tqdm(chunks, desc="Processing"):
        results = await evaluator.evaluate_batch(chunk)
        all_results.extend(results)
    
    return pd.DataFrame(all_results)

# Execute
results_df = await analyze_hallucinations('../../results/combined_model_responses.ssv')
print(results_df.groupby('model')['hallucination_score'].agg(['mean', 'std', 'count']))

Processing:   0%|          | 0/35 [00:00<?, ?it/s]

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 50 test case(s) in parallel: |██████▊   | 68% (34/50) [Time Taken: 00:29,  1.15test case/s]


ValueError: Evaluation LLM outputted an invalid JSON. Please use a better evaluation model.