In [1]:
import ast
import json
import re
from pathlib import Path
from statistics import mean

from anthropic import Anthropic
from dotenv import load_dotenv


# Load settings via python-dotenv before the client is built
# (presumably this is where the API key comes from — confirm).
load_dotenv()
# Shared API client, constructed with no explicit arguments; chat() sends
# every request through it.
client = Anthropic()
# Model identifier passed as "model" on every request made by chat().
model = "claude-sonnet-4-0"

In [2]:
def add_user_message(messages, text):
    """Append a user turn carrying ``text`` to ``messages`` and return the same list.

    The list is mutated in place; the return value is the list that was passed in.
    """
    user_turn = dict(role="user", content=text)
    messages.append(user_turn)
    return messages

def add_assistant_message(messages, text):
    """Append an assistant turn carrying ``text`` to ``messages`` and return the same list.

    The list is mutated in place; the return value is the list that was passed in.
    """
    assistant_turn = dict(role="assistant", content=text)
    messages.append(assistant_turn)
    return messages

def chat(messages, system=None, temperature=1.0, stop_sequences=None):
    """Send a conversation to the model and return the text of the first content block.

    Args:
        messages: List of ``{"role": ..., "content": ...}`` dicts, as built by
            ``add_user_message`` / ``add_assistant_message``.
        system: Optional system prompt; only sent when truthy.
        temperature: Sampling temperature forwarded to the API.
        stop_sequences: Optional list of strings at which generation stops.
            ``None`` (the default) means no stop sequences.

    Returns:
        The ``text`` attribute of the first item in the response's ``content``.
    """
    params = {
        "model": model,
        "max_tokens": 1000,
        "messages": messages,
        "temperature": temperature,
        # A ``None`` sentinel replaces the former mutable ``[]`` default, which
        # was a single list object shared by every call. The request still
        # always carries a list, exactly as before.
        "stop_sequences": stop_sequences if stop_sequences is not None else [],
    }
    if system:
        params["system"] = system

    message = client.messages.create(**params)
    return message.content[0].text


In [3]:
def generate_dataset(num_cases=3):
    """Ask the model for an evaluation dataset and return it as parsed JSON.

    The assistant turn is prefilled with an opening ```json fence and generation
    stops at the next fence, so the reply is expected to be bare JSON.

    Args:
        num_cases: How many task objects to request. Defaults to 3, which
            produces exactly the prompt this function used before the
            parameter existed.

    Returns:
        The value produced by ``json.loads`` on the model's reply; the prompt
        asks for a list of ``{"task": ..., "format": ...}`` objects.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    # Kept as a plain (non-f) string so the literal braces in the JSON example
    # need no escaping; the only variable part is appended below.
    instructions = """
Generate a evaluation dataset for a prompt evaluation. The dataset will be used to evaluate prompts
that generate Python, JSON, or Regex specifically for AWS-related tasks. Generate an array of JSON objects,
each representing task that requires Python, JSON, or a Regex to complete.

Example output:
```json
[
    {
        "task": "Description of task",
        "format": "json" or "python" or "regex"
    },
    ...additional
]
```

* Focus on tasks that can be solved by writing a single Python function, a single JSON object, or a regular expression.
* Focus on tasks that do not require writing much code

"""
    prompt = f"{instructions}Please generate {num_cases} objects.\n"

    messages = []
    add_user_message(messages, prompt)
    add_assistant_message(messages, "```json")
    text = chat(messages, stop_sequences=["```"])
    return json.loads(text)

In [4]:
# Generate a fresh evaluation dataset and save it to disk so it can be
# inspected or reused outside this notebook.
dataset = generate_dataset()

dataset_path = Path("data/dataset.json")
# On a fresh checkout the data/ directory may not exist yet; a plain open()
# in write mode would then raise FileNotFoundError.
dataset_path.parent.mkdir(parents=True, exist_ok=True)
# Pin the encoding so the file does not depend on the platform default.
with dataset_path.open("w", encoding="utf-8") as f:
    json.dump(dataset, f, indent=2)
dataset

[{'task': 'Create a JSON policy document that allows read-only access to all S3 buckets and objects for a specific IAM user',
  'format': 'json'},
 {'task': 'Write a Python function that extracts the region from an AWS ARN string',
  'format': 'python'},
 {'task': 'Create a regular expression that validates AWS S3 bucket names according to AWS naming conventions (lowercase letters, numbers, hyphens, 3-63 characters)',
  'format': 'regex'}]

In [5]:
def grade_by_model(test_case, output):
    """Have the model review ``output`` against the task and return its parsed verdict.

    The assistant turn is prefilled with an opening ```json fence and generation
    stops at the next fence, so the reply is parsed directly with ``json.loads``.

    Args:
        test_case: Mapping with at least a ``"task"`` key describing the task.
        output: The generated solution to be reviewed.

    Returns:
        The parsed JSON reply; the prompt requests the keys ``"strengths"``,
        ``"weaknesses"``, ``"reasoning"`` and ``"score"``.

    Raises:
        json.JSONDecodeError: If the reply is not valid JSON.
    """
    task_description = test_case["task"]

    # Review instructions with the task and the candidate solution embedded.
    eval_prompt = f'''
    You are an expert code reviewer. Evaluate this AI-generated solution.
    
    Original Task:
    <task>
    {task_description}
    </task>
    
    Solution to Evaluate:
    <solution>
    {output}
    </solution>
    
    Provide your evaluation as a structured JSON object with:
    - "strengths": An array of 1-3 key strengths
    - "weaknesses": An array of 1-3 key areas for improvement  
    - "reasoning": A concise explanation of your assessment
    - "score": A number between 1-10
    Respond with JSON. Keep your response concise and direct.
    Example response shape:
    {{
        "strengths": string[],
        "weaknesses": string[],
        "reasoning": string,
        "score": number
    }}
    '''

    conversation = []
    add_user_message(conversation, eval_prompt)
    add_assistant_message(conversation, "```json")

    raw_verdict = chat(conversation, stop_sequences=["```"])
    return json.loads(raw_verdict)

In [6]:
def validate_json(text):
    """Return 10 if ``text`` (with surrounding whitespace trimmed) parses as JSON, else 0."""
    candidate = text.strip()
    try:
        json.loads(candidate)
    except json.JSONDecodeError:
        return 0
    return 10


def validate_python(text):
    """Return 10 if ``text`` (with surrounding whitespace trimmed) parses as Python source, else 0.

    Only syntax is checked via ``ast.parse``; the code is never executed.
    """
    source = text.strip()
    try:
        ast.parse(source)
    except SyntaxError:
        return 0
    return 10


def validate_regex(text):
    """Return 10 if ``text`` (with surrounding whitespace trimmed) compiles as a regex, else 0."""
    pattern = text.strip()
    try:
        re.compile(pattern)
    except re.error:
        return 0
    return 10


def grade_syntax(response, test_case):
    """Score ``response`` with the syntax validator matching the test case's format.

    Args:
        response: The generated text to validate.
        test_case: Mapping with a ``"format"`` key; ``"json"`` and ``"python"``
            select their validators, and any other value is validated as a regex.

    Returns:
        Whatever the selected validator returns (10 for valid, 0 for invalid).
    """
    # The format label comes from a model-generated dataset, so tolerate case
    # and whitespace differences such as "JSON" or " Python ". The local is
    # named ``fmt`` to avoid shadowing the ``format`` builtin.
    fmt = str(test_case["format"]).strip().lower()
    if fmt == "json":
        return validate_json(response)
    if fmt == "python":
        return validate_python(response)
    # Unrecognized formats keep the original fallback of being graded as regex.
    return validate_regex(response)


In [7]:
def run_prompt(test_case):
    """Build the prompt for one test case, send it to the model, and return the raw reply.

    The assistant turn is prefilled with an opening ```code fence and generation
    stops at the next fence, so the reply should contain only the solution body.
    """
    task = test_case["task"]

    prompt = f"""Please solve the following task:

    {task}
    * Respond only with Python, JSON, or a plain Regex
    * Do not add any comments or commentary or explanation
    """

    conversation = []
    add_user_message(conversation, prompt)
    add_assistant_message(conversation, "```code")
    return chat(conversation, stop_sequences=["```"])
    

def run_test_case(test_case):
    """Run one test case through the prompt and grade the result.

    The final score is the plain average of the model grader's score and the
    syntax validator's score.

    Returns:
        A dict with the raw ``"output"``, the ``"test_case"`` itself, the
        combined ``"score"``, and the model grader's ``"reasoning"``.
    """
    output = run_prompt(test_case)

    # Model-based grade first, then the syntax check.
    verdict = grade_by_model(test_case, output)
    model_score, reasoning = verdict["score"], verdict["reasoning"]
    syntax_score = grade_syntax(output, test_case)

    return dict(
        output=output,
        test_case=test_case,
        score=(model_score + syntax_score) / 2,
        reasoning=reasoning,
    )
    
def run_eval(dataset):
    """Run every test case in ``dataset``, print the average score, and return the results.

    Args:
        dataset: Iterable of test cases, each passed to ``run_test_case``.

    Returns:
        A list with one result dict per test case, in dataset order. An empty
        dataset yields an empty list.
    """
    results = [run_test_case(test_case) for test_case in dataset]

    # statistics.mean raises StatisticsError on an empty sequence, so report
    # the empty case explicitly instead of crashing.
    if not results:
        print("Average score: n/a (no test cases)")
        return results

    average_score = mean([result["score"] for result in results])
    print(f"Average score: {average_score}")

    return results

In [9]:
# Evaluate every test case in the generated dataset; run_eval prints the
# average score and returns the per-case result dicts.
results = run_eval(dataset)

Average score: 7.833333333333333


In [10]:
# Pretty-print each result (output, test case, score, reasoning) as JSON.
print(json.dumps(results, indent=2))

[
  {
    "output": "\n{\n    \"Version\": \"2012-10-17\",\n    \"Statement\": [\n        {\n            \"Effect\": \"Allow\",\n            \"Action\": [\n                \"s3:GetObject\",\n                \"s3:GetObjectVersion\",\n                \"s3:ListBucket\",\n                \"s3:ListBucketVersions\",\n                \"s3:GetBucketLocation\",\n                \"s3:GetBucketVersioning\",\n                \"s3:ListAllMyBuckets\"\n            ],\n            \"Resource\": [\n                \"arn:aws:s3:::*\",\n                \"arn:aws:s3:::*/*\"\n            ]\n        }\n    ]\n}\n",
    "test_case": {
      "task": "Create a JSON policy document that allows read-only access to all S3 buckets and objects for a specific IAM user",
      "format": "json"
    },
    "score": 8.5,
    "reasoning": "The policy correctly provides read-only S3 access with proper JSON structure and relevant permissions. However, it doesn't address the 'specific IAM user' requirement and could include