In [1]:
import os
import time
import json
from groq import Groq

In [2]:
# Shared Groq client used by the judge helper; the key comes from the
# GROQ_API_KEY environment variable (None is passed through if it is unset).
groq_api_key = os.environ.get("GROQ_API_KEY")
judge_client = Groq(api_key=groq_api_key)

In [3]:
def judge_llm(prompt, model="llama-3.1-70b-versatile", client=None):
    """Send a single-turn prompt to the judge model and collect usage stats.

    Args:
        prompt: Text sent as the only ``user`` message of the conversation.
        model: Model identifier passed to the chat-completions endpoint.
            NOTE(review): confirm the default ID is still served by the
            provider; hosted model IDs get retired over time.
        client: Object exposing ``chat.completions.create(...)``. When omitted,
            the module-level ``judge_client`` is used.

    Returns:
        A ``(answer, tokens, response_time)`` tuple where ``answer`` is the
        content of the first choice, ``tokens`` is a dict with the keys
        ``prompt_tokens``, ``completion_tokens`` and ``total_tokens``, and
        ``response_time`` is the duration of the API call in seconds.
    """
    active_client = judge_client if client is None else client

    # perf_counter is monotonic, so the measured latency cannot go negative or
    # jump when the system clock is adjusted (time.time() offers no such
    # guarantee). The timer stops right after the call so that only the
    # request itself is measured.
    start_time = time.perf_counter()
    response = active_client.chat.completions.create(
        model=model,
        messages=[
            {
                "role": "user",
                "content": prompt,
            }
        ],
    )
    response_time = time.perf_counter() - start_time

    answer = response.choices[0].message.content
    usage = response.usage
    tokens = {
        'prompt_tokens': usage.prompt_tokens,
        'completion_tokens': usage.completion_tokens,
        'total_tokens': usage.total_tokens,
    }
    return answer, tokens, response_time

In [4]:
def _parse_evaluation(evaluation):
    """Return ``(feedback, rating)`` parsed from the judge reply, or ``None`` if unusable."""
    if not isinstance(evaluation, str):
        # e.g. the API returned no message content
        return None

    text = evaluation.strip()
    candidates = [text]
    # Models sometimes wrap the JSON in code fences or add prose around it
    # despite the instructions, so also try the outermost {...} span.
    first_brace, last_brace = text.find("{"), text.rfind("}")
    if first_brace != -1 and last_brace > first_brace:
        candidates.append(text[first_brace:last_brace + 1])

    for candidate in candidates:
        try:
            parsed = json.loads(candidate)
        except json.JSONDecodeError:
            continue
        if isinstance(parsed, dict) and 'Feedback' in parsed and 'Rating' in parsed:
            return parsed['Feedback'], parsed['Rating']
    return None


def evaluate_response(question, answer, judge=None):
    """Ask the judge model to rate how well ``answer`` addresses ``question``.

    Args:
        question: The user question shown to the judge.
        answer: The generated answer to be rated.
        judge: Optional callable taking the prompt and returning an
            ``(evaluation_text, tokens, response_time)`` tuple. Defaults to
            ``judge_llm``.

    Returns:
        A ``(feedback, rating, tokens)`` tuple. ``feedback`` and ``rating`` are
        the ``"Feedback"`` and ``"Rating"`` values of the JSON object returned
        by the judge (the rating is passed through unconverted). If the reply
        is not text, contains no parsable JSON object, or lacks either key, the
        result is ``("Failed to parse evaluation", 0, tokens)`` instead of an
        exception.
    """
    # NOTE(review): the prompt describes two output layouts (a "Feedback:::"
    # text block and a JSON object); only the JSON object is parsed below.
    prompt_template = """
    You will be given a user_question and system_answer couple.
    Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
    Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.

    Here is the scale you should use to build your answer:
    1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
    2: The system_answer is mostly not helpful: misses some key aspects of the question
    3: The system_answer is mostly helpful: provides support, but still could be improved
    4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question

    Provide your feedback as follows:

    Feedback:::
    Evaluation: (your rationale for the rating, as a text)
    Total rating: (your rating, as a number between 1 and 4)

    You MUST provide values for 'Feedback' and 'Rating' in your answer.

    Now here are the question and answer.

    Question: {question}
    Generated Answer: {answer}

    Please analyze the content and context of the generated answer in relation to the question
    and provide your evaluation in parsable JSON without using code blocks:

    {{
      "Feedback": "[Provide a brief explanation for your evaluation]",
      "Rating": [Provide the evaluation between 1 to 4]
    }}
    """.strip()
    prompt = prompt_template.format(question=question, answer=answer)

    ask_judge = judge_llm if judge is None else judge
    evaluation, tokens, _ = ask_judge(prompt)

    parsed = _parse_evaluation(evaluation)
    if parsed is None:
        return "Failed to parse evaluation", 0, tokens
    feedback, rating = parsed
    return feedback, rating, tokens

In [5]:
# Sample question/answer pair used to try out the judge.
question = "Explain how a convolutional network works"
answer = (
    "A convolutional network works by applying multiple convolutions to the input data, creating multiple channels at each spatial position. "
    "These channels are then combined and downsampled by a factor of two, increasing the number of channels. "
    "This process is repeated, with the spatial dimensions decreasing and the channels increasing. "
    "The network typically ends with fully connected layers that integrate information from across the input to create the desired output."
)

In [11]:
# Run the judge on the sample pair and unpack its verdict.
feedback, rating, tokens = evaluate_response(question, answer)

In [12]:
# Show the judge's textual rationale for the sample pair.
print(feedback)

The system answer provides a general overview of the components and process involved in a convolutional network, but it is somewhat technical and lacks specific details and explanations about key concepts such as convolutions, downsampling, and fully connected layers. However, it still addresses the main question and provides a clear framework of how the network works.


In [13]:
# Show the rating returned by evaluate_response (0 is its parse-failure sentinel).
print(rating)

3


In [14]:
# Show the token-usage dict reported for the judge call.
print(tokens)

{'prompt_tokens': 446, 'completion_tokens': 80, 'total_tokens': 526}
