In [1]:
import sys
import os

# Make the parent directory importable when the kernel's working directory is
# the notebooks folder (notebooks have no __file__ to anchor the path on).
current_dir = os.getcwd()
if current_dir.endswith('notebooks'):
    parent_dir = os.path.dirname(current_dir)
    # Re-running this cell must not stack duplicate entries on sys.path.
    if parent_dir not in sys.path:
        sys.path.append(parent_dir)

from utils.chat_utils import text_to_json
from prompts.eval_dataset import EVALUATION_DATASET_1_PROMPT, EVALUATION_DATASET_2_PROMPT, EVALUATION_DATASET_3_PROMPT
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
from utils.anthropic_client import ChatClient
from utils.evaluator import Evaluator, FormatAwareEvaluator

# Stop here if the credential was not provided through the environment.
api_key = os.environ.get('ANTHROPIC_API_KEY')
if api_key is None:
    raise ValueError("ANTHROPIC_API_KEY not found in environment variables")

# Request settings bundled into `params` and handed to the client.
messages = []
stop_sequences = ["```"]
max_tokens = 4096
params = dict(
    messages=messages,
    stop_sequences=stop_sequences,
    max_tokens=max_tokens,
)

# A single client instance is passed to both evaluators below.
chat_client = ChatClient(
    model="claude-3-haiku-20240307",
    api_key=api_key,
    params=params,
)

# Plain evaluator and its format-aware counterpart.
evaluator = Evaluator(chat_client=chat_client)
format_evaluator = FormatAwareEvaluator(chat_client=chat_client)

## Generate Dataset (if needed)

In [3]:
# Flip a flag to True to (re)generate the corresponding dataset file.
GENERATE_DATASET = False
GENERATE_DATASET_FORMAT_SPECIFICATIONS = False


def generate_and_report(client, prompt, save_path, text="```code"):
    """Generate a dataset through ``client`` and print how many cases came back.

    Args:
        client: Object exposing
            ``generate_dataset(prompt=..., text=..., save_path=...)``.
        prompt: Prompt forwarded to ``generate_dataset``.
        save_path: Destination forwarded to ``generate_dataset``.
        text: Forwarded as ``text``; the default is the value both
            generation runs in this cell use.

    Returns:
        Whatever ``client.generate_dataset`` returns.
    """
    generated = client.generate_dataset(prompt=prompt, text=text, save_path=save_path)
    print(f"Generated dataset with {len(generated)} test cases")
    return generated


if GENERATE_DATASET:
    dataset = generate_and_report(
        chat_client,
        EVALUATION_DATASET_2_PROMPT,
        "datasets/generated_dataset_format.json",
    )
if GENERATE_DATASET_FORMAT_SPECIFICATIONS:
    dataset = generate_and_report(
        chat_client,
        EVALUATION_DATASET_3_PROMPT,
        "datasets/generated_dataset_with_format_specifications.json",
    )

## Running the Evaluation

In [3]:
# Minimal smoke-test dataset, written as (prompt, solution criteria) pairs.
test_dataset = [
    {"prompt": prompt_text, "solution_criteria": criteria}
    for prompt_text, criteria in (
        ("Write a hello world function", "A function that prints hello world"),
        ("What is 2+2?", "4"),
        ("Explain Python lists", "Explanation of Python lists"),
    )
]

# Run the cases through the plain evaluator and report the pass count.
results = evaluator.run_eval(test_dataset)
n_passed, n_total = results['passed_tests'], results['total_tests']
print(f"Success! Passed {n_passed}/{n_total} tests")

Running test case 1/3...
  ✗ Test case 1 failed
Running test case 2/3...
  ✓ Test case 2 passed
Running test case 3/3...
  ✓ Test case 3 passed

Results saved to eval_results.json

Evaluation complete: 2/3 tests passed (66.67%)
Success! Passed 2/3 tests


## Running the Evaluation with Format Specifications

In [3]:
# Fields a Lambda configuration answer must contain. Copied (not shared) into
# both the field check and the schema's "required" list below.
lambda_required_fields = ["FunctionName", "Runtime", "Handler", "Role"]

# JSON schema for the Lambda configuration case: the required fields are
# strings, the two optional sizing fields are numbers.
lambda_config_schema = {
    "type": "object",
    "properties": {
        **{field: {"type": "string"} for field in lambda_required_fields},
        "MemorySize": {"type": "number"},
        "Timeout": {"type": "number"},
    },
    "required": list(lambda_required_fields),
}

# Case 1: Python code, graded under the "code" config.
ec2_listing_case = {
    "prompt": "Write a Python function that retrieves the list of EC2 instances in a specific AWS region",
    "format": "python",
    "solution_criteria": "A Python function using boto3 to list EC2 instances",
    "grading_config": {
        "code": {
            "min_length": 100,
            "required_words": ["def", "boto3", "ec2", "return"],
            "syntax_check": True,
        },
    },
}

# Case 2: JSON output, graded under the "format" config.
lambda_config_case = {
    "prompt": "Create a JSON object that represents an AWS Lambda function configuration",
    "format": "json",
    "solution_criteria": "A valid JSON object with Lambda configuration properties",
    "grading_config": {
        "format": {
            "required_fields": list(lambda_required_fields),
            "forbidden_fields": ["AccessKeyId", "SecretAccessKey"],
            "validate_json_schema": True,
            "json_schema": lambda_config_schema,
        },
    },
}

# Case 3: a regex, graded under the "code" config.
s3_bucket_regex_case = {
    "prompt": "Write a regular expression to validate an AWS S3 bucket name",
    "format": "regex",
    "solution_criteria": "A regex pattern that validates S3 bucket naming rules",
    "grading_config": {
        "code": {
            "min_length": 20,
            "syntax_check": True,
        },
    },
}

format_aware_dataset = [ec2_listing_case, lambda_config_case, s3_bucket_regex_case]

# Run the format-aware evaluation with per-test display and save the results.
results = format_evaluator.run_format_aware_eval_with_detailed_display(
    test_dataset=format_aware_dataset,
    save_results=True,
    results_path="../evaluation_results/enhanced_eval_results.json",
    verbose=True,
    show_individual_tests=True,
    max_display=5,
)

separator = "=" * 60
print("\n" + separator)
print("✅ Enhanced evaluation completed!")
# NOTE(review): the message names only the file; results_path above writes it
# under ../evaluation_results/.
print("Check 'enhanced_eval_results.json' for detailed results.")

# Compute the statistics separately from the per-test results.
print("\n📊 Additional Statistics Analysis:")
stats = format_evaluator.calculate_format_statistics(results['results'])

overall = stats['overall']
print(f"Total tests: {overall['total']}")
print(f"Passed: {overall['passed']}")
print(f"Failed: {overall['failed']}")

print("\nFormat breakdown:")
for format_type, format_stats in stats['by_format'].items():
    total = format_stats['passed'] + format_stats['failed']
    # Guard the division for a format with no recorded tests.
    if total > 0:
        pass_rate = format_stats['passed'] / total * 100
    else:
        pass_rate = 0
    print(f"  {format_type}: {pass_rate:.1f}% pass rate")

🚀 Running Format-Aware Evaluation

Test 1/3: Write a Python function that retrieves the list of EC2 insta...
Format: python
❌ FAILED
  Code Score: 10/10
    Issue: Missing required words: def, return; Unsupported language: text
  Model Score: 9/10

Test 2/3: Create a JSON object that represents an AWS Lambda function ...
Format: json
❌ FAILED
  Code Score: 10/10
    Issue: Output too short. Minimum length: 100, got: 89; Missing required words: def, boto3, ec2, return
  Format Score: 8.0/10
    Issue: Invalid JSON format: Expecting value: line 1 column 1 (char 0)
  Model Score: 8.8/10

Test 3/3: Write a regular expression to validate an AWS S3 bucket name...
Format: regex
✅ PASSED

Results saved to ../evaluation_results/enhanced_eval_results.json

Format-Aware Evaluation Complete: 1/3 tests passed (33.33%)
🚀 Running Format-Aware Evaluation

Test 1/3: Write a Python function that retrieves the list of EC2 insta...
Format: python
❌ FAILED
  Code Score: 10/10
    Issue: Missing required wo

In [None]:
import json

# Load the dataset. The file is decoded as UTF-8 explicitly so the result does
# not depend on the platform's default text encoding.
with open("datasets/generated_dataset.json", "r", encoding="utf-8") as f:
    dataset = json.load(f)

# Run evaluation using the Evaluator class and persist the results.
results = evaluator.run_eval(
    test_dataset=dataset,
    save_results=True,
    results_path="../evaluation_results/eval_results.json"
)

# Display summary
# NOTE(review): the `.2%` format multiplies by 100, so the last line assumes
# results['pass_rate'] is a 0-1 fraction - confirm against Evaluator.run_eval.
print("\n" + "="*50)
print("EVALUATION SUMMARY")
print("="*50)
print(f"Total Tests: {results['total_tests']}")
print(f"Passed: {results['passed_tests']}")
print(f"Failed: {results['failed_tests']}")
print(f"Pass Rate: {results['pass_rate']:.2%}")

Running test case 1/3...
  ✗ Test case 1 failed
Running test case 2/3...
  ✗ Test case 2 failed
Running test case 3/3...
  ✗ Test case 3 failed

Results saved to eval_results.json

Evaluation complete: 0/3 tests passed (0.00%)

EVALUATION SUMMARY
Total Tests: 3
Passed: 0
Failed: 3
Pass Rate: 0.00%


## Detailed Results Analysis

In [6]:
# List every failed test case together with the feedback that explains it.
PROMPT_PREVIEW_CHARS = 100  # longest prompt prefix printed per failure

if results['failed_tests'] > 0:
    print("\n" + "="*50)
    print("FAILED TEST CASES")
    print("="*50)
    for result in results['results']:
        if result.get('passed', False):
            continue
        test_case = result.get('test_case', {})
        prompt_preview = test_case.get('prompt', 'N/A')
        # Append "..." only when the prompt was actually cut, so short prompts
        # are not shown as if they had been truncated.
        if len(prompt_preview) > PROMPT_PREVIEW_CHARS:
            prompt_preview = prompt_preview[:PROMPT_PREVIEW_CHARS] + "..."
        print(f"\nPrompt: {prompt_preview}")
        if 'error' in result:
            # An 'error' entry is reported instead of any grader feedback.
            print(f"Error: {result['error']}")
        else:
            grading = result.get('grading_results', {})
            # Grader entries are objects read through their `.feedback` attribute.
            if 'code_grader' in grading:
                print(f"Code Grader: {grading['code_grader'].feedback}")
            if 'model_grader' in grading:
                print(f"Model Grader: {grading['model_grader'].feedback}")



FAILED TEST CASES

Prompt: Write a Python function that retrieves the list of EC2 instances in a specific AWS region....
Code Grader: Unsupported language: text
Model Grader: The response provides an excellent Python function that retrieves the list of EC2 instances in a specific AWS region. It is well-written, comprehensive, and follows the given instructions closely. This function would be very useful for AWS administrators and developers who need to interact with EC2 instances programmatically.

Prompt: Create a JSON object that represents an AWS Lambda function configuration, including the function na...
Code Grader: Unsupported language: text
Model Grader: The response is of high quality, as it addresses the question/task well, follows the instructions, is complete, helpful, and safe. The provided JSON object is a clear and accurate representation of an AWS Lambda function configuration.

Prompt: Write a regular expression to validate an AWS S3 bucket name....
Code Grader: Unsupp