## Step # 01: Install necessary libraries

In [None]:
%pip install groq
%pip install datasets
%pip install deepeval
#%pip install --upgrade httpx

## Step # 02: Import necessary libraries

In [56]:
import json
import os
import pickle
import random
import time
import typing as t
from dataclasses import dataclass, field
from pathlib import Path
from textwrap import dedent
from typing import Any, Dict, List, Optional

import groq
import numpy as np
import pandas as pd
from datasets import Dataset, load_dataset
from deepeval.metrics import (
    AnswerRelevancyMetric,
    FaithfulnessMetric,
    HallucinationMetric,
)
from deepeval.models.base_model import DeepEvalBaseLLM
from deepeval.test_case import LLMTestCase
from groq import Groq
from IPython.display import display
from tqdm import tqdm


## Step # 03: Setting Environment Variables

In [None]:

os.environ["GROQ_API_KEY"] = "<YOUR_GROQ_API_KEY>"
os.environ["DEEPEVAL_TELEMETRY_OPT_OUT"] = "YES"
os.environ["ERROR_REPORTING"] = "NO"

## Step # 04: Setting Experiment Parameters

In [58]:
N_ITEMS = 50
CRITIC_MODEL = "llama3-70b-8192"   #Model used to evaluate LLMs
np.random.seed(42)

In [59]:
#Function for Saving and Loading Data with Pickle
def save_list(items, file_path):
    try:
        with Path(file_path).open("wb") as f:
            pickle.dump(items, f)
    except Exception as e:
        print(f"An error occurred while saving the list of objects to file: {e}")


def load_list(file_path):
    try:
        with Path(file_path).open("rb") as file:
            items = pickle.load(file)
        return items
    except Exception as e:
        print(f"An error occurred while loading the list of objects from file: {e}")
        return None

## Step # 05: Load Dataset from Hugging-Face

In [60]:
dataset = load_dataset("virattt/financial-qa-10K")
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'context', 'ticker', 'filing'],
        num_rows: 7000
    })
})

In [7]:
dataset["train"][0]

{'question': 'What area did NVIDIA initially focus on before expanding to other computationally intensive fields?',
 'answer': 'NVIDIA initially focused on PC graphics.',
 'context': 'Since our original focus on PC graphics, we have expanded to several other large and important computationally intensive fields.',
 'ticker': 'NVDA',
 'filing': '2023_10K'}

In [61]:
#shuffle rows in the dataset and pick 50 rows
sample = dataset["train"].shuffle().select(range(N_ITEMS))
#sample["answer"][4]

## Step # 06: Define Functions

In [62]:
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [63]:
#Predict Function: sends a chat prompt to the Groq model, retrieves the model’s response, and handles errors like connection failures, rate limiting, and API errors with useful debug messages.
def predict(prompt: str, model: str, client: Groq = client):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except groq.APIConnectionError as e:
        print("The server could not be reached")
        print(e.__cause__)
    except groq.RateLimitError as e:
        print("A 429 status code was received; we should back off a bit.")
    except groq.APIStatusError as e:
        print("Another non-200-range status code was received")
        print(e.status_code)
        print(e.response)

In [64]:
def predict(prompt: str, model: str, client: Groq = client):
    try:
        chat_completion = client.chat.completions.create(
            messages=[
                {
                    "role": "user",
                    "content": prompt,
                }
            ],
            model=model,
        )
        return chat_completion.choices[0].message.content
    except groq.APIConnectionError as e:
        print(f"Error with {model}: The server could not be reached")
        print(e.__cause__)
        return f"[Error: Connection failed for {model}]"
    except groq.RateLimitError as e:
        print(f"Error with {model}: Rate limit exceeded (429)")
        return f"[Error: Rate limit exceeded for {model}]"
    except groq.APIStatusError as e:
        print(f"Error with {model}: Status code {e.status_code}")
        print(e.response)
        return f"[Error: API status error {e.status_code} for {model}]"
    except Exception as e:
        print(f"Unexpected error with {model}: {str(e)}")
        return f"[Error: Unexpected issue with {model}]"

In [65]:
#Function to format the prompt
"""
Dedent Function:
- Dedent is a handy function (probably imported from Python’s textwrap module)
- It removes extra indentation (spaces) from the multi-line string so it’s cleanly formatted when sent to the model.
"""
def format_prompt(question:str, context:str):
    return dedent(
        f"""
        Use the following context:
        ```
        {context}
        ```
        to answer the question:
        ```
        {question}
        ```
        Your answer must be succinct!
        Answer:
        """
    )

In [66]:
#This defines a data structure with 4 string fields:
@dataclass
class QuestionAnswer:
    question: str
    answer: str
    true_answer: str
    context: str

In [67]:
"""
Extract Predictions Function:
- Takes a dataset of questions, their true answers, and their contexts 
- Asks an AI model to predict the answer to each question using the provided context 
- Collects the AI’s answers along with the original info into QuestionAnswer objects
- Returns a list of these objects
"""
def extract_predictions(dataset: Dataset, model: str) -> List[QuestionAnswer]:
    return [
        QuestionAnswer(
            question=item["question"],
            answer=predict(format_prompt(item["question"], item["context"]), model),
            true_answer=item["answer"],
            context=item["context"],
        )
        for item in tqdm(dataset)
    ]

## Step # 08: Predict answers through various Large Language Models(LLM)

In [68]:
models = ["gemma2-9b-it", "llama3-8b-8192", "meta-llama/llama-4-maverick-17b-128e-instruct"]   #select available models from Groq website
predictions={}
for model in models:
    predictions[model]=extract_predictions(sample,model=model)

100%|██████████| 50/50 [01:06<00:00,  1.34s/it]
100%|██████████| 50/50 [01:07<00:00,  1.36s/it]
100%|██████████| 50/50 [01:18<00:00,  1.58s/it]


In [30]:
save_list=(predictions,"predictions.pkl")

## Step # 09: Evaluating Large Language Models(LLM)

In [69]:
#Define a evaluation promp
eval_prompt = """
Consider the question: {question}
and answer: {answer}
based on the context: {context}
compare with the true answer: {true_answer}

Score how correct the response is on a scale from 0 to 10
Respond with the integer number only.
"""

In [70]:
#Shows the 6th prediction made by gemma model, and convert it into python dict to show its content
prediction = predictions["gemma2-9b-it"][8]
prediction.__dict__

{'question': "What percentage of Intuit's revenues came from the Small Business & Self-Employed segment in fiscal 2023?",
 'answer': '56% \n',
 'true_answer': '56%',
 'context': "In fiscal 2023, the Small Business & Self-Employed segment contributed 56% of Intuit's total revenues."}

In [71]:
"""
Predict Function:
- Take the question, answer, true answer, and context from a prediction result. 
- Plug them into an evaluation prompt template. 
- Send that prompt to a model (our critic) and ask it to evaluate how good the answer is.
"""
predict(
    eval_prompt.format(
        question=prediction.question,
        answer=prediction.answer,
        context=prediction.context,
        true_answer=prediction.true_answer,
    ),
    CRITIC_MODEL,
)

'10'

In [72]:
#Evaluates the predictions made by each model using a critic model, assigns a score to each, and stores the scores for each model.
scores={}
for model, model_predictions in predictions.items():
    model_scores=[]
    for prediction in tqdm(model_predictions):
        score=predict(
        eval_prompt.format(
        question=prediction.question,
        answer=prediction.answer,
        context=prediction.context,
        true_answer=prediction.true_answer,
    ),
    CRITIC_MODEL,
    )
        model_scores.append(int(score))
        sleep_time =  random.uniform(0.5,2.0)   #Avoids hitting API rate limits
        time.sleep(sleep_time)
    scores[model]=model_scores    #store model's score in dictionary
    

100%|██████████| 50/50 [01:36<00:00,  1.94s/it]
100%|██████████| 50/50 [01:44<00:00,  2.08s/it]
100%|██████████| 50/50 [01:59<00:00,  2.38s/it]


In [73]:
#Create a dataframe summarizing the average score of each model
rows=[]
for model, model_scores in scores.items():
    rows.append({"model":model,"score":np.mean(model_scores)})
pd.DataFrame(rows)

Unnamed: 0,model,score
0,gemma2-9b-it,8.82
1,llama3-8b-8192,8.84
2,meta-llama/llama-4-maverick-17b-128e-instruct,9.1


## Step # 10: Define Critic Model

In [74]:
class GroqCriticModel(DeepEvalBaseLLM):
    #Constructor
    def __init__(self,model:str):
        self.model=model
    def load_model(self):
        pass
    def generate(self,prompt:str) -> str:
        return predict(prompt,self.model)
    async def a_generate(self, prompt:str) -> str:
        return self.generate(prompt)
    def get_model_name(self):
        return self.model
    

In [75]:
critic_model = GroqCriticModel(CRITIC_MODEL)

## Step # 11: Metrics for Evaluating LLMs

In [76]:
#LLMTestCase is a class (from deepeval.test_case) that holds all the information needed to evaluate a language model's output
test_case = LLMTestCase(
    input=prediction.question,
    actual_output=prediction.answer,
    context=[prediction.context],
    retrieval_context=[prediction.context],
)

In [77]:
#AnswerRelevenacyMetric: Evaluating how relevant a model’s answer is to the question and context 
relevancy_matric = AnswerRelevancyMetric(threshold=0.7, model=critic_model, include_reason=True)
relevancy_matric.measure(test_case)
print(relevancy_matric.score, relevancy_matric.reason)

1.0 The score is 1.00 because the actual output perfectly aligns with the input, providing a clear and concise answer to the question, with no irrelevant statements present!


In [78]:
#FaithfulnessMetric: Faithfulness is how accurately the model’s response reflects the source input and facts — without hallucinations or unrelated content.
faithfulness_metric = FaithfulnessMetric(threshold=0.7, model=critic_model, include_reason=True)
faithfulness_metric.measure(test_case)
print(faithfulness_metric.score,faithfulness_metric.reason) 

1.0 The score is 1.00 because there are no contradictions, the actual output perfectly aligns with the retrieval context!


In [79]:
#HallucinationMetric: 
hallucination_metric = HallucinationMetric(threshold=0.7,model=critic_model,include_reason=True)
hallucination_metric.measure(test_case)
print(hallucination_metric.score,hallucination_metric.reason)

0.0 The score is 0.00 because the actual output perfectly aligns with the provided context, indicating no hallucination.


## Step # 12: Evaluation

In [80]:
#Define ModelEvaluation class to store the results of evaluating AI models
@dataclass
class ModelEvaluation:
    model:str
    relevancy: List[float]=field(default_factory=list)              #relevancy_score
    faithfulness: List[float]=field(default_factory=list)           #faithfulness_score 
    hallucination: List[float]=field(default_factory=list)          #hallucination_score
    no_relevancy_reasons: List[str]=field(default_factory=list)     #reason why outputs were not relevant
    no_faithfulness_reasons: List[str]=field(default_factory=list)  #reason why outputs were not faithful
    hallucination_reasons:List[str]=field(default_factory=list)     #reason why output is considered hallucination


In [None]:
#Evaluating the performance of multiple models 
"""
Evaluate each model in the predictions dictionary by measuring:
- Relevancy
- Faithfulness
- Hallucination
For each model's prediction, it stores:
1. The scores for the above metrics
The reasons when scores are poor (e.g. low relevancy, high hallucination
"""
evaluations = []
for model, model_predictions in predictions.items():
    evaluation = ModelEvaluation(model)
    for prediction in tqdm(model_predictions):
        test_case = LLMTestCase(
            input=prediction.question,
            actual_output=prediction.answer,
            context=[prediction.context],
            retrieval_context=[prediction.context],
        )
        try:
            relevancy_matric.measure(test_case)
            faithfulness_metric.measure(test_case)
            hallucination_metric.measure(test_case)

            evaluation.relevancy.append(relevancy_matric.score)
            evaluation.faithfulness.append(faithfulness_metric.score)
            evaluation.hallucination.append(hallucination_metric.score)

            if relevancy_matric.score < 0.5:
                evaluation.no_relevancy_reasons.append(relevancy_matric.reason)
            if faithfulness_metric.score < 0.5:
                evaluation.no_faithfulness_reasons.append(faithfulness_metric.reason)
            if hallucination_metric.score > 0.5:
                evaluation.hallucination_reasons.append(hallucination_metric.reason)
        except:
            continue

        sleep_time = random.uniform(0.5, 2.0)
        time.sleep(sleep_time)
    evaluations.append(evaluation)

In [82]:
evaluations[2]

ModelEvaluation(model='meta-llama/llama-4-maverick-17b-128e-instruct', relevancy=[1.0, 1.0, 1.0, 1.0, 1.0], faithfulness=[1.0, 1.0, 1.0, 1.0, 1.0], hallucination=[0.0, 0.0, 0.0, 0.0, 1.0], no_relevancy_reasons=[], no_faithfulness_reasons=[], hallucination_reasons=['The score is 1.00 because the actual output contains no relevant information from the context, particularly the crucial detail about DJS-002, resulting in a complete contradiction.'])

In [50]:
save_list(evaluations,"evaluations.pkl")

## Step # 13: Create a Report

In [83]:
#Create a list of summary list
rows=[]
for evaluation in evaluations:
    rows.append(
        {
            "model": evaluation.model,
            "relevancy": np.mean(evaluation.relevancy),  #calculate average score
            "faithfulness": np.mean(evaluation.faithfulness),
            "hallucination": np.mean(evaluation.hallucination),
        }
    )

In [84]:
#Display the dataframe
pd.DataFrame(rows)

Unnamed: 0,model,relevancy,faithfulness,hallucination
0,gemma2-9b-it,0.960526,0.973684,0.578947
1,llama3-8b-8192,0.916667,0.953704,0.5
2,meta-llama/llama-4-maverick-17b-128e-instruct,1.0,1.0,0.2


In [85]:
report_prompt="""
Summarize the most common reasons in a paragraph based on the texts:
{relevancy}
{faithfulness}
{hallucination}
If the reasons are empty - conclude that the model did a great job.
"""

In [92]:
reports={}
for evaluation in tqdm(evaluations):
    reports[evaluation.model] = predict(
        report_prompt.format(
            relevancy="\n".join(evaluation.no_relevancy_reasons),
            faithfulness="\n".join(evaluation.no_faithfulness_reasons),
            hallucination="\n".join(evaluation.hallucination_reasons),
        ),
        CRITIC_MODEL,
    )

100%|██████████| 3/3 [01:45<00:00, 35.08s/it]


In [93]:
for model, report in reports.items():
    print(model)
    print()
    display(report)
    print()
    print("-"*50)
    print()

gemma2-9b-it



'Here is a summary of the most common reasons in a paragraph:\n\nThe majority of the actual outputs scored 1.00, indicating that they failed to provide crucial information or context, leading to contradictions, hallucinations, or complete mismatches with the expected context. The common reasons for these scores include omitting significant details, lacking critical context, providing incomplete or inaccurate information, and failing to mention specific dates, types, or amounts. Additionally, some outputs contained contradictions, misrepresentations, or only provided numerical values without proper context, leading to a significant disparity between the output and the expected context.'


--------------------------------------------------

llama3-8b-8192



'Based on the texts, the most common reasons for a low score are that the output lacks crucial information, omits specific details, and contradicts the context. The outputs often fail to provide sufficient detail, leading to a mismatch or contradiction with the provided context. Specifically, the outputs may omit important figures, neglect to mention key terms, or even suggest the opposite of what the context states. These issues result in a complete contradiction between the output and the context, leading to a score of 1.00.'


--------------------------------------------------

meta-llama/llama-4-maverick-17b-128e-instruct



"Here is a summary of the most common reasons in a paragraph:\n\nThe most common reasons for a low score include the model's failure to incorporate relevant information from the context into its output. Specifically, the absence of crucial details, such as DJS-002, leads to contradictions and inaccurate responses. On the other hand, if the reasons are empty, it indicates that the model has performed well, suggesting that it has successfully addressed the task at hand."


--------------------------------------------------

