In [1]:
from dotenv import load_dotenv
from anthropic import Anthropic
import json

# Load environment variables from a local .env file (python-dotenv) before the client is built.
load_dotenv()
# No credentials are passed explicitly here; presumably the SDK reads the API key
# from the environment populated above — confirm against the SDK docs.
client = Anthropic()
# Model identifier used by every request issued through chat() below.
model = "claude-sonnet-4-0"

In [2]:
def add_user_message(messages, text):
    """Append a user turn containing `text` to `messages` (in place) and return that same list."""
    user_turn = dict(role="user", content=text)
    messages.append(user_turn)
    return messages

def add_assistant_message(messages, text):
    """Append an assistant turn containing `text` to `messages` (in place) and return that same list."""
    assistant_turn = dict(role="assistant", content=text)
    messages.append(assistant_turn)
    return messages

def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of the first content block.

    Args:
        messages: List of {"role": ..., "content": ...} dicts, e.g. as built by
            add_user_message / add_assistant_message.
        system: Optional system prompt; only included in the request when truthy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of stop strings forwarded to the API.
            None (the default) sends an empty list.

    Returns:
        The `text` attribute of the first content block of the response.
    """
    # A None sentinel replaces the former `[]` default: a mutable default is a
    # single object shared by every call, so any mutation would leak between calls.
    if stop_sequences is None:
        stop_sequences = []

    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        "stop_sequences": stop_sequences,
    }
    # The system prompt is optional; leave the key out entirely when not provided.
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text


In [3]:
def run_prompt(test_case):
    """Wrap the test case's task in the prompt template, send it as a single user turn, and return the reply text."""
    task_prompt = f"""Please solve the following task:

    {test_case["task"]}
    """
    # add_user_message returns the list it appended to, so it can feed chat() directly.
    return chat(add_user_message([], task_prompt))
    

def run_test_case(test_case):
    """Run one test case through run_prompt, grade the output with grade_by_model, and bundle the outcome.

    Returns a dict with keys "output", "test_case", "score" and "reasoning".
    """
    output = run_prompt(test_case)

    # Model-based grading: the grader's JSON supplies the score and its justification.
    grade = grade_by_model(test_case, output)
    return dict(
        output=output,
        test_case=test_case,
        score=grade["score"],
        reasoning=grade["reasoning"],
    )
    
def run_eval(dataset):
    """Call run_test_case on each item of the already-loaded `dataset` and return the list of results.

    Note: a later cell in this notebook defines run_eval again; once that cell
    has run, it replaces this version.
    """
    return [run_test_case(item) for item in dataset]

In [4]:
# Load the evaluation test cases from disk. JSON text is UTF-8, so decode it
# explicitly instead of relying on the platform's default encoding.
with open("data/dataset.json", encoding="utf-8") as f:
    dataset = json.load(f)
# Bare last expression so the notebook renders the loaded test cases.
dataset

[{'task': 'Write a Python function to create an AWS S3 bucket with a given name.'},
 {'task': 'Create a JSON object to configure an AWS Lambda function with a specified runtime, memory size, and timeout.'},
 {'task': "Write a regular expression to validate an AWS EC2 instance ID in the format 'i-0123456789abcdef'."}]

In [5]:
def grade_by_model(test_case, output):
    """Ask the model to review `output` against the test case's task and return the parsed JSON verdict.

    The grading prompt requests "strengths", "weaknesses", "reasoning" and
    "score" fields; whatever JSON the model returns is parsed and returned as-is.
    """
    # Grading prompt: task and candidate solution are wrapped in XML-style tags.
    grading_prompt = f'''
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Original Task:
    <task>
    {test_case["task"]}
    </task>
    
    Solution to Evaluate:
    <solution>
    {output}
    </solution>
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    '''

    # Prefill the assistant turn with an opening ```json fence and stop at the
    # closing fence, so the reply text is just the JSON body.
    conversation = add_user_message([], grading_prompt)
    conversation = add_assistant_message(conversation, "```json")

    raw_verdict = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [6]:
from statistics import mean

def run_eval(dataset):
    """Run every test case in `dataset`, print the average score, and return the results.

    Args:
        dataset: Iterable of test-case dicts; each one is passed to run_test_case.

    Returns:
        A list with one result dict per test case (as produced by
        run_test_case), in dataset order. Empty when the dataset is empty.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on empty input, so only compute
    # the average when at least one test case was graded.
    if results:
        average_score = mean([result["score"] for result in results])
        print(f"Average score: {average_score}")
    else:
        print("Average score: n/a (no test cases)")

    return results

In [7]:
# Run the evaluation over the loaded dataset (run_eval also prints the average score),
# then show the per-test-case results as the cell's output.
eval_results = run_eval(dataset)
eval_results

Average score: 6


  'test_case': {'task': 'Write a Python function to create an AWS S3 bucket with a given name.'},
  'score': 6,
  'reasoning': "The solution demonstrates solid understanding of AWS S3 bucket creation with proper error handling and region considerations. However, it's incomplete with missing implementations and referenced functions. The basic function is well-structured and would work correctly, but the advanced version appears truncated."},
 {'output': 'Here\'s a JSON object to configure an AWS Lambda function with runtime, memory size, and timeout specifications:\n\n```json\n{\n  "FunctionName": "my-lambda-function",\n  "Runtime": "python3.11",\n  "Role": "arn:aws:iam::123456789012:role/lambda-execution-role",\n  "Handler": "lambda_function.lambda_handler",\n  "Code": {\n    "ZipFile": "def lambda_handler(event, context): return {\'statusCode\': 200, \'body\': \'Hello World\'}"\n  },\n  "Description": "Sample Lambda function configuration",\n  "Timeout": 30,\n  "MemorySize": 256,\n  "