In [1]:
import os
import pandas as pd
from pathlib import Path
from dotenv import load_dotenv
# Do not truncate long text columns when DataFrames are displayed.
pd.set_option("display.max_colwidth", None)

# Construct Evaluation Model
from langchain_groq import ChatGroq
from deepeval.models import DeepEvalBaseLLM

# Evaluation data and metrics
from deepeval.metrics import GEval
from deepeval.test_case import LLMTestCase
from deepeval.dataset import EvaluationDataset
from deepeval.test_case import LLMTestCaseParams

# Load environment variables from the .env file one directory above the notebook;
# it is expected to define GROQ_API_KEY, which is read right below.
dotenv_path = Path('..') / '.env'
load_dotenv(dotenv_path=dotenv_path)

# Module-level Groq chat client (raises KeyError if GROQ_API_KEY is not set).
model = ChatGroq(
    model_name="llama-3.1-70b-versatile",
    temperature=0.8,
    max_retries=2,
    groq_api_key=os.environ["GROQ_API_KEY"],
)

### Build Custom Model for Evaluation

In [2]:
class CustomGroqLLM(DeepEvalBaseLLM):
    """DeepEval model adapter backed by a Groq-hosted LangChain chat model.

    The same instance is used both to produce answers for the test cases and
    as the judge model handed to the DeepEval metrics.
    """

    def __init__(self, model=None):
        """Store the chat client used for every generation.

        Args:
            model: Optional, already-configured chat client exposing
                ``invoke`` / ``ainvoke``. When omitted, a ``ChatGroq`` client
                is built from the ``GROQ_API_KEY`` environment variable
                (raises ``KeyError`` if the variable is missing).
        """
        if model is None:
            model = ChatGroq(
                temperature=0.8,
                groq_api_key=os.environ["GROQ_API_KEY"],
                model_name="llama-3.1-70b-versatile",
                max_retries=2,
            )
        self.model = model

    def load_model(self):
        """Return the underlying chat client."""
        return self.model

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` to the model synchronously and return the reply text."""
        model = self.load_model()
        response = model.invoke(prompt)
        return response.content

    async def a_generate(self, prompt: str) -> str:
        """Send ``prompt`` to the model asynchronously and return the reply text."""
        model = self.load_model()
        # Await the client's native async call: falling back to the blocking
        # ``generate`` here would stall the event loop and serialize the
        # test cases that DeepEval evaluates concurrently.
        response = await model.ainvoke(prompt)
        return response.content

    def get_model_name(self):
        """Return the label reported as the evaluation model in results."""
        return "ChatGroq"


llm = CustomGroqLLM()

### Prepare Synthetic Data

In [3]:
# Create the Evaluation Dataset
# NOTE(review): the prompt text below is reproduced verbatim, including its
# spelling (e.g. "RAISEC", "suites", "queryied", "anf"). Confirm whether it has
# to mirror the prompt used by the application before correcting it.
evaluation_prompt = "Task: You are a career guide. Your job is to ask me up to 2 questions to uncover my personality traits according to the RAISEC model.\
  You will ask these questions in a conversational flow where you will ask the second question after I answer the first.\
  At any point, I can ask you questions and you will answer normally, then you will continue your personality test. when asking your questions, please number them to keep track of the number of questions asked.\
  Also, I will give you data from a knowledge graph to supplement your answer\
  Your final output: Interpret all the graph data, choose up to 6 suitable careers for me, list them in bullet points and include a brief explanation of how each path suites my personality. Do not include Cypher code in your answer.\
  Output's tone: Make your output friendly, fun and easy to read.\
  ...I queryied the graph and extracted the relevant data. Here are the extracted data: {extracted_data}\
  Here is our conversation's history: {conversation}.\
  Use the conversation anf the extracted data to give me suitable career tracks."

df = pd.read_csv("gpt_synthetic_data.csv")
test_cases = []

# Iterate the three relevant columns in lockstep, keeping the row label so
# progress can be printed as each test case is completed.
rows = zip(df.index, df["conversation"], df["extracted_data"], df["output"])
for i, conversation, extracted_data, output in rows:
    prompt = evaluation_prompt.format(extracted_data=extracted_data, conversation=conversation)

    # Test-case prep: the LLM receives a synthetically generated conversation
    # plus the extracted graph data and returns its career suggestions.
    actual_output = llm.generate(prompt)

    test_cases.append(
        LLMTestCase(
            input=f"Conversation: {conversation}",
            actual_output=actual_output,
            expected_output=output,
            retrieval_context=[extracted_data],
        )
    )
    print(i, end=" ")

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 

### Prepare Custom Evaluation Metric

In [4]:
# Create custom metric using chain-of-thought
from deepeval import evaluate
from deepeval.metrics import AnswerRelevancyMetric, FaithfulnessMetric

# Built-in metrics, judged by the custom Groq-backed model.
answer_relevancy = AnswerRelevancyMetric(model=llm)
faithfulness = FaithfulnessMetric(model=llm)

# Custom GEval metric: the judge follows the evaluation steps below to decide
# whether the suggested careers are logical for the person in the conversation.
suggestion_logic_metric = GEval(
    name="Logical_Suggestions",
    evaluation_steps=[
        "Check if the suggested careers are suitable for the person based on the conversation in the 'input'.",
        "Check if career suggestions in the 'actual output' are compatible with the extracted data in the 'retrieval_context' and with the user's character in the 'input' conversation.",
        "Slightly contradicting opinions between 'actual_output' and 'expected_output' are okay, as long as they seem logical and align with the person's character."
    ],
    # Every test-case field the steps refer to has to be listed here; the
    # second step relies on the retrieval context, so it must be included
    # for the judge to actually see the extracted graph data.
    evaluation_params=[
        LLMTestCaseParams.INPUT,
        LLMTestCaseParams.ACTUAL_OUTPUT,
        LLMTestCaseParams.EXPECTED_OUTPUT,
        LLMTestCaseParams.RETRIEVAL_CONTEXT,
    ],
    model=llm,
)

# Initialize evaluation dataset
dataset = EvaluationDataset(test_cases=test_cases)

### Running Evaluation of LLM on synthetic data

In [5]:
# Run logic evaluation: score every test case in the dataset with the custom
# "Logical_Suggestions" GEval metric.
dataset.evaluate(metrics=[suggestion_logic_metric])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 22 test case(s) in parallel: |██████████|100% (22/22) [Time Taken: 03:11,  8.69s/test case]



Metrics Summary

  - ✅ Logical_Suggestions (GEval) (score: 0.8, threshold: 0.5, strict: False, evaluation model: ChatGroq, reason: Actual Output provides suitable careers like Software Architect and UX Designer, aligning with problem-solving and designing solutions preferences, but some suggested careers like IT Project Manager and Cybersecurity Consultant slightly deviate from the Investigative trait., error: None)

For test case:

  - input: Conversation: ai message: Hello. Let's discover career paths based on your personality traits. Here's my first question: What do you prefer when working: teamwork, problem-solving, creativity, or data analysis? human message: problem-solving. ai message: Great! Next, do you prefer hands-on tasks, planning, or designing solutions? human message: designing solutions.
  - actual output: I'm excited to help you discover your ideal career path. 

Based on your preferences for problem-solving and designing solutions, I can infer that you have a stron




EvaluationResult(test_results=[TestResult(success=True, metrics_data=[MetricData(name='Logical_Suggestions (GEval)', threshold=0.5, success=True, score=0.8, reason='Actual Output provides suitable careers like Software Architect and UX Designer, aligning with problem-solving and designing solutions preferences, but some suggested careers like IT Project Manager and Cybersecurity Consultant slightly deviate from the Investigative trait.', strict_mode=False, evaluation_model='ChatGroq', error=None, evaluation_cost=None, verbose_logs='Criteria:\nNone \n \nEvaluation Steps:\n[\n    "Check if the suggested careers are suitable for the person based on the conversation in the \'input\'.",\n    "Check if career suggestions in the \'actual output\' are compatible with the extracted data in the \'retrieval_context\' and with the user\'s character in the \'input\' conversation.",\n    "Slightly contradicting opinions between \'actual_output\' and \'expected_output\' are okay, as long as they seem

In [6]:
# Run answer relevancy evaluation on every test case in the dataset
dataset.evaluate(metrics=[answer_relevancy])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 22 test case(s) in parallel: |██████████|100% (22/22) [Time Taken: 09:31, 25.99s/test case]



Metrics Summary

  - ✅ Answer Relevancy (score: 1.0, threshold: 0.5, strict: False, evaluation model: ChatGroq, reason: The score is 1.00 because the actual output perfectly addresses the conversation flow and provides relevant follow-up questions based on the human's inputs., error: None)

For test case:

  - input: Conversation: ai message: Hello. Let's discover career paths based on your personality traits. Here's my first question: What do you prefer when working: teamwork, problem-solving, creativity, or data analysis? human message: problem-solving. ai message: Great! Next, do you prefer hands-on tasks, planning, or designing solutions? human message: designing solutions.
  - actual output: I'm excited to help you discover your ideal career path. 

Based on your preferences for problem-solving and designing solutions, I can infer that you have a strong Investigative and Artistic (or possibly Enterprising) personality type according to the RAISEC model. However, since designing 




EvaluationResult(test_results=[TestResult(success=True, metrics_data=[MetricData(name='Answer Relevancy', threshold=0.5, success=True, score=1.0, reason="The score is 1.00 because the actual output perfectly addresses the conversation flow and provides relevant follow-up questions based on the human's inputs.", strict_mode=False, evaluation_model='ChatGroq', error=None, evaluation_cost=None, verbose_logs='Statements:\n[\n    "I\'m excited to help you discover your ideal career path.",\n    "Based on your preferences for problem-solving and designing solutions, I can infer that you have a strong Investigative and Artistic (or possibly Enterprising) personality type according to the RAISEC model.",\n    "However, since designing solutions often involves planning and organization, I will also consider the Realistic and Conventional aspects.",\n    "Now, let\'s examine the data from the knowledge graph.",\n    "It seems that the Investigative trait is a common theme among the careers liste

In [7]:
# Run faithfulness evaluation on every test case in the dataset
dataset.evaluate(metrics=[faithfulness])

Event loop is already running. Applying nest_asyncio patch to allow async execution...


Evaluating 22 test case(s) in parallel: |██████████|100% (22/22) [Time Taken: 10:05, 27.50s/test case]



Metrics Summary

  - ✅ Faithfulness (score: 0.8888888888888888, threshold: 0.5, strict: False, evaluation model: ChatGroq, reason: The score is 0.89 because the actual output incorrectly implies a direct connection between designing solutions and the Realistic trait, which is not explicitly stated in the retrieval context., error: None)

For test case:

  - input: Conversation: ai message: Hello. Let's discover career paths based on your personality traits. Here's my first question: What do you prefer when working: teamwork, problem-solving, creativity, or data analysis? human message: problem-solving. ai message: Great! Next, do you prefer hands-on tasks, planning, or designing solutions? human message: designing solutions.
  - actual output: I'm excited to help you discover your ideal career path. 

Based on your preferences for problem-solving and designing solutions, I can infer that you have a strong Investigative and Artistic (or possibly Enterprising) personality type accordin




EvaluationResult(test_results=[TestResult(success=True, metrics_data=[MetricData(name='Faithfulness', threshold=0.5, success=True, score=0.8888888888888888, reason='The score is 0.89 because the actual output incorrectly implies a direct connection between designing solutions and the Realistic trait, which is not explicitly stated in the retrieval context.', strict_mode=False, evaluation_model='ChatGroq', error=None, evaluation_cost=None, verbose_logs='Truths (limit=None):\n[\n    "Some Software Developers have Investigative traits.",\n    "Some Software Developers have Realistic traits.",\n    "Some System Analysts have Conventional traits.",\n    "Some System Analysts have Investigative traits.",\n    "Some Database Administrators have Conventional traits.",\n    "Some Database Administrators have Investigative traits."\n] \n \nClaims:\n[\n    "Based on your preferences for problem-solving and designing solutions, I can infer that you have a strong Investigative and Artistic (or poss