# Green Agent Evaluation Consistency Test

This notebook tests whether the green agent's evaluation produces consistent results when provided with the same answers/inputs across different instances.

**Test Setup:**
- Runs 60 evaluations with identical inputs
- Uses fixed mock responses for player actions
- Compares results for consistency (winner, final wealth, final health, etc.)


In [13]:
import sys
import os
import json
from typing import Dict, Any, List
from collections import Counter
import statistics
import types
from datetime import datetime, timedelta
from dotenv import load_dotenv
from concurrent.futures import ThreadPoolExecutor, as_completed

# Put the backend package on sys.path. The sibling-of-cwd location is probed
# first, then a `backend` folder inside the working directory. If neither
# exists, backend_path is left pointing at the second candidate.
_cwd = os.getcwd()
for backend_path in (
    os.path.join(os.path.dirname(_cwd), 'backend'),
    os.path.join(_cwd, 'backend'),
):
    if os.path.exists(backend_path):
        sys.path.insert(0, backend_path)
        break

# Environment variables must be loaded BEFORE importing anything that reads them.
env_path = os.path.join(backend_path, '.env')
if os.path.exists(env_path):
    load_dotenv(env_path, override=True)
    print(f"Loaded environment variables from {env_path}")
else:
    # No backend/.env was found: fall back to load_dotenv's default lookup.
    load_dotenv(override=True)
    print("Loaded environment variables from current directory")

# Report whether the API key is present (only its length is printed, never the value).
api_key = os.getenv("OPENAI_API_KEY")
print(
    f"‚úì OPENAI_API_KEY found (length: {len(api_key)})"
    if api_key
    else "‚ö† WARNING: OPENAI_API_KEY not found in environment variables"
)

from src.app.Game import Game
from src.app.Player import Player

print("Imports successful!")


Loaded environment variables from /Users/cyro/Documents/VSC/AgenticAI/backend/.env
‚úì OPENAI_API_KEY found (length: 164)
Imports successful!


In [14]:
# Factory for a scripted stand-in for a player's get_action method
def create_mock_get_action(fixed_response: str):
    """Build a ``get_action`` replacement that always answers ``fixed_response``.

    The returned function takes ``self`` explicitly so it can be bound to an
    object with ``types.MethodType``.
    """
    def scripted_get_action(self, context: dict) -> str:
        """Record the canned reply in ``self._responses`` and return it; ``context`` is ignored."""
        reply = fixed_response
        self._responses.append(reply)
        return reply

    return scripted_get_action

print("Mock action creator function created!")


Mock action creator function created!


In [15]:
def run_single_game(max_turns: int, world_size: int, starting_wealth: int,
                    fixed_responses: Dict[str, str],
                    dm_model: str = "gpt-4o-mini") -> Dict[str, Any]:
    """
    Run a single game instance with fixed player responses and a real AI DM.

    Player actions are scripted (identical every turn), while the DM's verdicts
    come from the real model. Comparing many runs therefore shows whether the
    DM rewards the same actions consistently.

    Args:
        max_turns: Maximum number of turns
        world_size: World size for the game
        starting_wealth: Starting wealth for players
        fixed_responses: Dict mapping player UID to their fixed action response.
            Players without an entry answer "I will explore".
        dm_model: Model name used by the DM. Must not contain "mock".

    Returns:
        Dict with keys "winner", "final_wealth", "final_health", "turns_played",
        "is_game_over", "game_over_reason" and "turn_data" (one entry per turn
        with the submitted actions and each player's money delta).

    Raises:
        ValueError: if ``dm_model`` contains "mock".
    """
    # Fail fast, before the game (and any AI request) is created.
    if "mock" in dm_model.lower():
        raise ValueError(f"ERROR: DM model is '{dm_model}' which will use MockAiService! Change to a real AI model like 'gpt-4o-mini'")

    # Create player info (same as green agent does)
    player_ids = ("player_1", "player_2")
    player_info = {
        uid: {
            "uid": uid,
            "position": [0, 0],
            "model": "mock",
            "player_class": "human",
            "values": {"money": starting_wealth, "health": 100},
            "responses": []
        }
        for uid in player_ids
    }

    # The DM is NOT mocked: it generates tiles and judges actions with the real model.
    game = Game(player_info, {"model": dm_model}, world_size)

    # Game creates the DM and then loads it, so force the model if loading changed it.
    if game.dm.model != dm_model:
        game.dm.model = dm_model

    # Give this run its own chat_id so DM verdict history is not shared across runs.
    # NOTE(review): only respond_actions is isolated here; the notebook's log output
    # shows tile generation using the shared "DungeonMaster" session - confirm whether
    # that matters for the consistency measurement.
    from src.services.aiServices.wrapper import AIWrapper
    import uuid

    unique_chat_id = f"DM_{uuid.uuid4().hex[:8]}"

    def isolated_respond_actions(self, info: dict):
        """Ask the DM model for a verdict using this run's private chat_id."""
        from src.services.Utils import format_request
        from core.settings import AIConfig
        from api.apiDtoModel import GameResponse

        structured_response = AIWrapper.ask(
            format_request(AIConfig.dm_prompt, info),
            self.model,
            unique_chat_id,  # Unique chat_id = fresh history
            structured_output=GameResponse
        )
        self._responses.append(str(structured_response))
        return structured_response

    # Replace DM's respond_actions with the version that uses the unique chat_id
    game.dm.respond_actions = types.MethodType(isolated_respond_actions, game.dm)

    game.max_turns = max_turns

    # Replace save() with a stub that returns a small JSON string instead of
    # touching the database, which may not be configured in the test environment.
    def no_op_save(self):
        return json.dumps({
            "id": "test-game",
            "status": "active",
            "current_turn_number": self.current_turn_number
        })
    game.save = types.MethodType(no_op_save, game)

    # Mock player actions to return fixed responses
    for uid, player in game.players.items():
        fixed_response = fixed_responses.get(uid, "I will explore")
        player.get_action = types.MethodType(create_mock_get_action(fixed_response), player)

    # Action/reward pairs per turn, used later for bias analysis
    turn_data = []

    # Run turns (similar to green agent's loop)
    for turn in range(max_turns):
        if game.is_game_over:
            break

        # Collect actions from players (served by the mocked get_action)
        actions = {}
        for role in player_ids:
            player = game.players[role]
            actions[role] = player.get_action({
                "Self": player.save(),
                "Players (excluding self)": {
                    other_uid: game.players[other_uid].save()
                    for other_uid in game.players if other_uid != role
                },
                "tiles": game._get_viewable_tiles_payload(player.position, 1),
                "verdict": "",
                "uid": role,
                "position": player.position
            })

        # Snapshot money before the verdict so per-turn rewards can be derived
        pre_verdict_wealth = {uid: p.values.money for uid, p in game.players.items()}

        # DM judges the actions (real AI, through the isolated chat session)
        verdict = game.dm.respond_actions({
            "Players": {uid: game.players[uid].save() for uid in game.players},
            "Responses": actions,
            "Past Verdict": "",
            "tiles": game._get_tiles_full_payload()
        })

        # Apply the verdict and advance the game state
        game.handle_verdict(verdict)
        game.current_turn_number += 1
        game._check_game_conditions()

        # Reward = change in money caused by this turn's verdict
        post_verdict_wealth = {uid: p.values.money for uid, p in game.players.items()}
        turn_rewards = {
            uid: post_verdict_wealth[uid] - pre_verdict_wealth[uid]
            for uid in game.players.keys()
        }

        turn_data.append({
            "turn": turn + 1,
            "actions": actions.copy(),
            "rewards": turn_rewards.copy()
        })

    # Determine winner (same logic as green agent): the living player with the
    # most money. Ties keep the earlier player, and a player needs money > 0
    # to win; otherwise the winner is reported as "none".
    winner = None
    max_wealth = 0
    for uid, p in game.players.items():
        if p.values.money > max_wealth and p.values.health > 0:
            max_wealth, winner = p.values.money, uid

    # Return results including turn-by-turn data for bias analysis
    return {
        "winner": winner or "none",
        "final_wealth": {uid: p.values.money for uid, p in game.players.items()},
        "final_health": {uid: p.values.health for uid, p in game.players.items()},
        "turns_played": game.current_turn_number,
        "is_game_over": game.is_game_over,
        "game_over_reason": game.game_over_reason,
        "turn_data": turn_data  # For bias analysis
    }

print("Game runner function created!")


Game runner function created!


In [16]:
# Game parameters shared by every run
TEST_CONFIG = dict(
    max_turns=2,  # Reduced from 3 to 2 for faster testing
    world_size=1,
    starting_wealth=100,
)

# Scripted actions: each player submits the same text in all runs
FIXED_RESPONSES = dict(
    player_1="I will move north and explore the area",
    player_2="I will move east and collect resources",
)

NUM_RUNS = 60

# Echo the setup so the recorded output documents what was run
print("\n".join([
    "Test configuration:",
    f"  Runs: {NUM_RUNS}",
    f"  Config: {TEST_CONFIG}",
    f"  Fixed responses: {FIXED_RESPONSES}",
    "",
    "Note: Player actions are fixed, but the AI DM will determine rewards.",
    "This tests if the AI DM gives consistent rewards for the same player actions.",
]))


Test configuration:
  Runs: 60
  Config: {'max_turns': 2, 'world_size': 1, 'starting_wealth': 100}
  Fixed responses: {'player_1': 'I will move north and explore the area', 'player_2': 'I will move east and collect resources'}

Note: Player actions are fixed, but the AI DM will determine rewards.
This tests if the AI DM gives consistent rewards for the same player actions.


In [17]:
# Placeholder cell: it defines nothing, because the evaluations call
# run_single_game directly. It is kept only to preserve the notebook's
# cell structure.
# NOTE(review): this cell has no effect and could be deleted.
pass


In [None]:
# Announce the parallel batch of NUM_RUNS evaluations
print(
    f"Running {NUM_RUNS} game evaluations in parallel...\n"
    "Max workers: 10 (adjust if needed based on API rate limits)\n"
    "This should be much faster than sequential execution!\n"
)

# Wall-clock start, used for progress and total-time reporting
start_time = datetime.now()
print("‚è∞ Started at: " + start_time.strftime('%Y-%m-%d %H:%M:%S'))

# Accumulators filled in as results arrive
all_results, error_counts = [], {}
success_count = failure_count = 0

def run_single_game_wrapper(run_number):
    """Run one evaluation and report the outcome as a dict instead of raising.

    The returned dict always has "run_number", "success" and "result". On
    failure it also carries "error" (the exception message) and "error_type"
    (the exception class name), and "result" is None.
    """
    outcome = {"run_number": run_number}
    try:
        game_kwargs = dict(
            max_turns=TEST_CONFIG["max_turns"],
            world_size=TEST_CONFIG["world_size"],
            starting_wealth=TEST_CONFIG["starting_wealth"],
            fixed_responses=FIXED_RESPONSES,
        )
        game_result = run_single_game(**game_kwargs)
    except Exception as exc:
        # Convert the failure into data so one bad run cannot abort the batch
        outcome.update(
            success=False,
            error=str(exc),
            error_type=type(exc).__name__,
            result=None,
        )
    else:
        outcome.update(success=True, result=game_result)
    return outcome

# Use ThreadPoolExecutor for parallel execution
# Using 10 workers - adjust based on your API rate limits
max_workers = 10
max_errors_shown = 3  # only the first few failures are printed in full
completed = 0
last_update_time = start_time

with ThreadPoolExecutor(max_workers=max_workers) as executor:
    # Submit all tasks; each future maps back to its 1-based run number
    future_to_run = {
        executor.submit(run_single_game_wrapper, i + 1): i + 1
        for i in range(NUM_RUNS)
    }

    print(f"üì§ Submitted {NUM_RUNS} tasks to {max_workers} workers\n")

    # Process completed tasks as they finish
    for future in as_completed(future_to_run):
        run_number = future_to_run[future]
        current_time = datetime.now()

        # Guard only future.result(): a failure in the bookkeeping below must not
        # be mistaken for a failed run (that would record the same run twice).
        try:
            result = future.result()
            already_reported = False
        except Exception as e:
            print(f"‚ùå Unexpected error processing run {run_number}: {e}")
            result = {
                "run_number": run_number,
                "success": False,
                "error": str(e),
                "error_type": type(e).__name__,
                "result": None
            }
            already_reported = True

        # Every finished future counts as completed, whether it succeeded or not
        all_results.append(result)
        completed += 1

        # Update success/failure counts
        if result['success']:
            success_count += 1
        else:
            failure_count += 1
            error_type = result.get('error_type', 'Unknown')
            error_counts[error_type] = error_counts.get(error_type, 0) + 1
            # Print the first few failures for debugging (keyed on the failure
            # count, so late failures are still shown)
            if not already_reported and failure_count <= max_errors_shown:
                error_msg = result.get('error', 'Unknown error')
                print(f"‚ùå Error in run {run_number}: {error_type}: {error_msg[:200]}")

        # Progress update every 5 completions or every 30 seconds
        time_since_last_update = (current_time - last_update_time).total_seconds()
        if completed % 5 == 0 or time_since_last_update >= 30:
            elapsed = (current_time - start_time).total_seconds()
            progress_pct = (completed / NUM_RUNS) * 100

            # ETA extrapolated from the average wall-clock time per completed run
            avg_time_per_run = elapsed / completed
            eta = timedelta(seconds=int(avg_time_per_run * (NUM_RUNS - completed)))
            eta_str = f"ETA: {str(eta)}"

            # Throughput so far; skipped if no measurable time has elapsed
            rate_str = f"({completed / elapsed:.2f} runs/sec)" if elapsed > 0 else ""

            print(f"üìä [{current_time.strftime('%H:%M:%S')}] Progress: {completed}/{NUM_RUNS} ({progress_pct:.1f}%) | "
                  f"‚úÖ {success_count} | ‚ùå {failure_count} | {eta_str} {rate_str}")

            last_update_time = current_time

# Sort results by run_number for consistency
all_results.sort(key=lambda x: x['run_number'])

# Final summary
end_time = datetime.now()
total_time = (end_time - start_time).total_seconds()
total_time_str = str(timedelta(seconds=int(total_time)))

# Percentages fall back to 0.0 when NUM_RUNS is 0, avoiding a ZeroDivisionError
success_pct = success_count / NUM_RUNS * 100 if NUM_RUNS else 0.0
failure_pct = failure_count / NUM_RUNS * 100 if NUM_RUNS else 0.0

print(f"\n{'='*60}")
print(f"‚úÖ Completed all {NUM_RUNS} evaluations!")
print(f"‚è∞ Total time: {total_time_str}")
print(f"üìä Results:")
print(f"   ‚úÖ Successful runs: {success_count}/{NUM_RUNS} ({success_pct:.1f}%)")
print(f"   ‚ùå Failed runs: {failure_count}/{NUM_RUNS} ({failure_pct:.1f}%)")
if total_time > 0:
    print(f"   ‚ö° Average rate: {NUM_RUNS/total_time:.2f} runs/sec")
print(f"{'='*60}")

if error_counts:
    print(f"\n‚ùå Error breakdown:")
    for error_type, count in sorted(error_counts.items(), key=lambda x: x[1], reverse=True):
        print(f"   {error_type}: {count}")


Running 60 game evaluations in parallel...
Max workers: 10 (adjust if needed based on API rate limits)
This should be much faster than sequential execution!

‚è∞ Started at: 2025-11-22 19:01:51

[AI] === New request ===
[AI] Model: gpt-4o-mini
[AI] Message: You are the Dungeon Master of a tile-based fantasy world. Always add 200 value or more to each secret in a tile.Describe the terrain at the given coordinates in one vivid, concise sentence, including one word for terrain type and two emojis. The emoj...
[AI] Using chat session: DungeonMaster
[AI] Initializing backend for model 'gpt-4o-mini'...
[AI] Model supports structured output: yes
[AI] Service ready.

[AI] === New request ===
[AI] Model: gpt-4o-mini
[AI] Message: You are the Dungeon Master of a tile-based fantasy world. Always add 200 value or more to each secret in a tile.Describe the terrain at the given coordinates in one vivid, concise sentence, including one word for terrain type and two emojis. The emoj...
[AI] Using chat

In [None]:
# Extract and analyze results
successful_results = [r for r in all_results if r['success'] and r['result']]

print(f"Total successful evaluations: {len(successful_results)}/{NUM_RUNS}\n")

if len(successful_results) == 0:
    print("No successful evaluations! Check errors above.")
else:
    # Parse results
    winners = []
    final_wealths = []
    final_healths = []
    turns_played = []
    all_turn_data = []  # For bias analysis
    
    for result_data in successful_results:
        result = result_data['result']
        
        # Extract winner
        winner = result.get('winner', 'unknown')
        winners.append(winner)
        
        # Extract details (directly from result, not nested in 'detail')
        final_wealth = result.get('final_wealth', {})
        final_health = result.get('final_health', {})
        turns = result.get('turns_played', 0)
        turn_data = result.get('turn_data', [])
        
        final_wealths.append(final_wealth)
        final_healths.append(final_health)
        turns_played.append(turns)
        all_turn_data.append(turn_data)
    
    # Analyze consistency
    print("=" * 60)
    print("CONSISTENCY ANALYSIS")
    print("=" * 60)
    
    # Winner consistency
    winner_counts = Counter(winners)
    print(f"\nWinner Distribution:")
    for winner, count in winner_counts.most_common():
        percentage = (count / len(winners)) * 100
        print(f"  {winner}: {count} times ({percentage:.1f}%)")
    
    # Check if winner is consistent
    most_common_winner, most_common_count = winner_counts.most_common(1)[0]
    winner_consistency = (most_common_count / len(winners)) * 100
    print(f"\nWinner Consistency: {winner_consistency:.1f}%")
    print(f"  (Same winner in {most_common_count}/{len(winners)} runs)")
    
    # Wealth consistency
    print(f"\nFinal Wealth Analysis:")
    if final_wealths:
        player_1_wealths = [w.get('player_1', 0) for w in final_wealths if 'player_1' in w]
        player_2_wealths = [w.get('player_2', 0) for w in final_wealths if 'player_2' in w]
        
        if player_1_wealths:
            print(f"  Player 1 wealth:")
            print(f"    Mean: {statistics.mean(player_1_wealths):.2f}")
            print(f"    Std Dev: {statistics.stdev(player_1_wealths) if len(player_1_wealths) > 1 else 0:.2f}")
            print(f"    Range: {min(player_1_wealths)} - {max(player_1_wealths)}")
            print(f"    Unique values: {len(set(player_1_wealths))}")
        
        if player_2_wealths:
            print(f"  Player 2 wealth:")
            print(f"    Mean: {statistics.mean(player_2_wealths):.2f}")
            print(f"    Std Dev: {statistics.stdev(player_2_wealths) if len(player_2_wealths) > 1 else 0:.2f}")
            print(f"    Range: {min(player_2_wealths)} - {max(player_2_wealths)}")
            print(f"    Unique values: {len(set(player_2_wealths))}")
    
    # ============================================================
    # DETAILED MONEY ANALYSIS
    # ============================================================
    print("\n" + "=" * 60)
    print("DETAILED FINAL MONEY ANALYSIS")
    print("=" * 60)
    
    if final_wealths:
        starting_wealth = TEST_CONFIG.get("starting_wealth", 100)
        max_turns = TEST_CONFIG.get("max_turns", 3)
        
        # Calculate theoretical maximum
        # According to DM prompt: "give money_change = +50 to +250 (taken from tile secrets)"
        # So the maximum reward per turn is 250
        # Maximum final wealth = starting_wealth + (250 * max_turns)
        max_reward_per_turn = 250  # Maximum per DM prompt
        theoretical_max = starting_wealth + (max_reward_per_turn * max_turns)
        
        print(f"\nüí∞ Theoretical Maximum Money:")
        print(f"  Starting Wealth: {starting_wealth}")
        print(f"  Max Turns: {max_turns}")
        print(f"  Maximum Reward Per Turn: {max_reward_per_turn} (according to DM prompt)")
        print(f"  Theoretical Maximum Final Wealth: {theoretical_max}")
        print(f"    Formula: {starting_wealth} + ({max_reward_per_turn} √ó {max_turns}) = {theoretical_max}")
        print(f"  Note: DM prompt allows money_change from +50 to +250 per turn")
        
        # Player 1 detailed analysis
        if player_1_wealths:
            p1_mean = statistics.mean(player_1_wealths)
            p1_std = statistics.stdev(player_1_wealths) if len(player_1_wealths) > 1 else 0
            p1_min = min(player_1_wealths)
            p1_max = max(player_1_wealths)
            p1_median = statistics.median(player_1_wealths)
            p1_unique = len(set(player_1_wealths))
            p1_total_earned = sum(w - starting_wealth for w in player_1_wealths)
            p1_avg_earned = p1_mean - starting_wealth
            
            # Wealth distribution
            p1_wealth_counts = Counter(player_1_wealths)
            p1_most_common_wealth, p1_most_common_count = p1_wealth_counts.most_common(1)[0]
            p1_most_common_pct = (p1_most_common_count / len(player_1_wealths)) * 100
            
            print(f"\nüìä Player 1 Final Money Statistics:")
            print(f"  Starting Wealth: {starting_wealth}")
            print(f"  Final Wealth:")
            print(f"    Mean: {p1_mean:.2f}")
            print(f"    Median: {p1_median:.2f}")
            print(f"    Std Dev: {p1_std:.2f}")
            print(f"    Min: {p1_min}")
            print(f"    Max: {p1_max}")
            print(f"    Range: {p1_max - p1_min}")
            print(f"    Unique values: {p1_unique}/{len(player_1_wealths)}")
            print(f"  Money Earned (from starting):")
            print(f"    Total across all runs: {p1_total_earned:.2f}")
            print(f"    Average per run: {p1_avg_earned:.2f}")
            print(f"    Min earned: {p1_min - starting_wealth}")
            print(f"    Max earned: {p1_max - starting_wealth}")
            print(f"  Distribution:")
            print(f"    Most common final wealth: {p1_most_common_wealth} (appears {p1_most_common_count}/{len(player_1_wealths)} times, {p1_most_common_pct:.1f}%)")
            if p1_unique <= 10:
                print(f"    All wealth values:")
                for wealth, count in sorted(p1_wealth_counts.items()):
                    pct = (count / len(player_1_wealths)) * 100
                    print(f"      {wealth}: {count} times ({pct:.1f}%)")
            
            # Consistency assessment
            if p1_unique == 1:
                print(f"  ‚úÖ PERFECT CONSISTENCY: All runs ended with {player_1_wealths[0]} money")
            elif p1_std < 1.0:
                print(f"  ‚úÖ HIGH CONSISTENCY: Very low variance (std: {p1_std:.2f})")
            elif p1_std < 5.0:
                print(f"  ‚ö†Ô∏è  MODERATE CONSISTENCY: Some variance (std: {p1_std:.2f})")
            else:
                print(f"  ‚ùå LOW CONSISTENCY: High variance (std: {p1_std:.2f})")
            
            # Achievement vs maximum
            achievement_pct = (p1_max / theoretical_max) * 100
            mean_achievement_pct = (p1_mean / theoretical_max) * 100
            print(f"  Achievement vs Maximum:")
            print(f"    Highest achieved: {p1_max} / {theoretical_max} ({achievement_pct:.1f}%)")
            print(f"    Mean achieved: {p1_mean:.2f} / {theoretical_max} ({mean_achievement_pct:.1f}%)")
            if p1_max == theoretical_max:
                print(f"    ‚úÖ Reached maximum possible wealth!")
            elif achievement_pct >= 95:
                print(f"    ‚úÖ Very close to maximum (within 5%)")
            elif achievement_pct >= 80:
                print(f"    ‚ö†Ô∏è  Close to maximum but not quite there")
            else:
                print(f"    ‚ùå Significantly below maximum")
        
        # Player 2 detailed analysis
        if player_2_wealths:
            p2_mean = statistics.mean(player_2_wealths)
            p2_std = statistics.stdev(player_2_wealths) if len(player_2_wealths) > 1 else 0
            p2_min = min(player_2_wealths)
            p2_max = max(player_2_wealths)
            p2_median = statistics.median(player_2_wealths)
            p2_unique = len(set(player_2_wealths))
            p2_total_earned = sum(w - starting_wealth for w in player_2_wealths)
            p2_avg_earned = p2_mean - starting_wealth
            
            # Wealth distribution
            p2_wealth_counts = Counter(player_2_wealths)
            p2_most_common_wealth, p2_most_common_count = p2_wealth_counts.most_common(1)[0]
            p2_most_common_pct = (p2_most_common_count / len(player_2_wealths)) * 100
            
            print(f"\nüìä Player 2 Final Money Statistics:")
            print(f"  Starting Wealth: {starting_wealth}")
            print(f"  Final Wealth:")
            print(f"    Mean: {p2_mean:.2f}")
            print(f"    Median: {p2_median:.2f}")
            print(f"    Std Dev: {p2_std:.2f}")
            print(f"    Min: {p2_min}")
            print(f"    Max: {p2_max}")
            print(f"    Range: {p2_max - p2_min}")
            print(f"    Unique values: {p2_unique}/{len(player_2_wealths)}")
            print(f"  Money Earned (from starting):")
            print(f"    Total across all runs: {p2_total_earned:.2f}")
            print(f"    Average per run: {p2_avg_earned:.2f}")
            print(f"    Min earned: {p2_min - starting_wealth}")
            print(f"    Max earned: {p2_max - starting_wealth}")
            print(f"  Distribution:")
            print(f"    Most common final wealth: {p2_most_common_wealth} (appears {p2_most_common_count}/{len(player_2_wealths)} times, {p2_most_common_pct:.1f}%)")
            if p2_unique <= 10:
                print(f"    All wealth values:")
                for wealth, count in sorted(p2_wealth_counts.items()):
                    pct = (count / len(player_2_wealths)) * 100
                    print(f"      {wealth}: {count} times ({pct:.1f}%)")
            
            # Consistency assessment
            if p2_unique == 1:
                print(f"  ‚úÖ PERFECT CONSISTENCY: All runs ended with {player_2_wealths[0]} money")
            elif p2_std < 1.0:
                print(f"  ‚úÖ HIGH CONSISTENCY: Very low variance (std: {p2_std:.2f})")
            elif p2_std < 5.0:
                print(f"  ‚ö†Ô∏è  MODERATE CONSISTENCY: Some variance (std: {p2_std:.2f})")
            else:
                print(f"  ‚ùå LOW CONSISTENCY: High variance (std: {p2_std:.2f})")
            
            # Achievement vs maximum
            achievement_pct = (p2_max / theoretical_max) * 100
            mean_achievement_pct = (p2_mean / theoretical_max) * 100
            print(f"  Achievement vs Maximum:")
            print(f"    Highest achieved: {p2_max} / {theoretical_max} ({achievement_pct:.1f}%)")
            print(f"    Mean achieved: {p2_mean:.2f} / {theoretical_max} ({mean_achievement_pct:.1f}%)")
            if p2_max == theoretical_max:
                print(f"    ‚úÖ Reached maximum possible wealth!")
            elif achievement_pct >= 95:
                print(f"    ‚úÖ Very close to maximum (within 5%)")
            elif achievement_pct >= 80:
                print(f"    ‚ö†Ô∏è  Close to maximum but not quite there")
            else:
                print(f"    ‚ùå Significantly below maximum")
        
        # Comparative analysis: how the two players' final wealth relates across runs
        if player_1_wealths and player_2_wealths:
            print(f"\nüìà Comparative Analysis:")
            p1_mean = statistics.mean(player_1_wealths)
            p2_mean = statistics.mean(player_2_wealths)
            p1_total = sum(player_1_wealths)
            p2_total = sum(player_2_wealths)
            
            print(f"  Average final wealth:")
            print(f"    Player 1: {p1_mean:.2f}")
            print(f"    Player 2: {p2_mean:.2f}")
            print(f"    Difference: {abs(p1_mean - p2_mean):.2f}")
            
            if p1_mean == p2_mean:
                print(f"    Both players have equal average wealth")
            else:
                # The advantage is expressed relative to the trailing player's mean
                if p1_mean > p2_mean:
                    leader, leader_mean, trailing_mean = "Player 1", p1_mean, p2_mean
                else:
                    leader, leader_mean, trailing_mean = "Player 2", p2_mean, p1_mean
                if trailing_mean > 0:
                    advantage = ((leader_mean - trailing_mean) / trailing_mean) * 100
                    print(f"    {leader} has {advantage:.1f}% more money on average")
                else:
                    # A zero base would raise ZeroDivisionError and a negative base
                    # gives a meaningless percentage, so only the ordering is reported.
                    print(f"    {leader} has more money on average "
                          f"(percentage not computed: other player's average is {trailing_mean:.2f})")
            
            print(f"  Total wealth across all runs:")
            print(f"    Player 1: {p1_total:.2f}")
            print(f"    Player 2: {p2_total:.2f}")
            
            # Count runs where both players finished with identical wealth.
            # The two lists are paired by position - assumes every run reported
            # both players, so they have equal length (TODO confirm).
            same_wealth_count = sum(1 for w1, w2 in zip(player_1_wealths, player_2_wealths) if w1 == w2)
            same_wealth_pct = (same_wealth_count / len(player_1_wealths)) * 100
            print(f"  Runs where both players ended with same wealth: {same_wealth_count}/{len(player_1_wealths)} ({same_wealth_pct:.1f}%)")
    
    # Health consistency
    print(f"\nFinal Health Analysis:")
    if final_healths:
        player_1_healths = [h.get('player_1', 0) for h in final_healths if 'player_1' in h]
        player_2_healths = [h.get('player_2', 0) for h in final_healths if 'player_2' in h]
        
        if player_1_healths:
            print(f"  Player 1 health:")
            print(f"    Mean: {statistics.mean(player_1_healths):.2f}")
            print(f"    Std Dev: {statistics.stdev(player_1_healths) if len(player_1_healths) > 1 else 0:.2f}")
            print(f"    Range: {min(player_1_healths)} - {max(player_1_healths)}")
            print(f"    Unique values: {len(set(player_1_healths))}")
        
        if player_2_healths:
            print(f"  Player 2 health:")
            print(f"    Mean: {statistics.mean(player_2_healths):.2f}")
            print(f"    Std Dev: {statistics.stdev(player_2_healths) if len(player_2_healths) > 1 else 0:.2f}")
            print(f"    Range: {min(player_2_healths)} - {max(player_2_healths)}")
            print(f"    Unique values: {len(set(player_2_healths))}")
    
    # Turns consistency
    print(f"\nTurns Played Analysis:")
    if turns_played:
        print(f"  Mean: {statistics.mean(turns_played):.2f}")
        print(f"  Std Dev: {statistics.stdev(turns_played) if len(turns_played) > 1 else 0:.2f}")
        print(f"  Range: {min(turns_played)} - {max(turns_played)}")
        print(f"  Unique values: {len(set(turns_played))}")
    
    # Overall consistency score
    print(f"\n" + "=" * 60)
    print("OVERALL CONSISTENCY ASSESSMENT")
    print("=" * 60)
    
    # Calculate consistency metrics
    wealth_consistency = 0
    if player_1_wealths and len(set(player_1_wealths)) == 1:
        wealth_consistency += 0.5
    if player_2_wealths and len(set(player_2_wealths)) == 1:
        wealth_consistency += 0.5
    
    health_consistency = 0
    if player_1_healths and len(set(player_1_healths)) == 1:
        health_consistency += 0.5
    if player_2_healths and len(set(player_2_healths)) == 1:
        health_consistency += 0.5
    
    turns_consistency = 1.0 if len(set(turns_played)) == 1 else 0.0
    
    overall_score = (winner_consistency / 100 * 0.4 + 
                    wealth_consistency * 0.3 + 
                    health_consistency * 0.2 + 
                    turns_consistency * 0.1)
    
    print(f"\nConsistency Score: {overall_score * 100:.1f}%")
    print(f"  - Winner consistency: {winner_consistency:.1f}%")
    print(f"  - Wealth consistency: {wealth_consistency * 100:.1f}%")
    print(f"  - Health consistency: {health_consistency * 100:.1f}%")
    print(f"  - Turns consistency: {turns_consistency * 100:.1f}%")
    
    if overall_score >= 0.9:
        print("\n‚úÖ HIGH CONSISTENCY: Results are very similar across runs")
    elif overall_score >= 0.7:
        print("\n‚ö†Ô∏è  MODERATE CONSISTENCY: Some variation in results")
    else:
        print("\n‚ùå LOW CONSISTENCY: Significant variation in results")
    
    # ============================================================
    # BIAS ANALYSIS: Check if same actions get same rewards
    # ============================================================
    print("\n" + "=" * 60)
    print("BIAS ANALYSIS: Action-Reward Consistency")
    print("=" * 60)
    
    # Group rewards by action text
    action_reward_map = {}  # action_text -> list of rewards
    
    for run_idx, turn_data in enumerate(all_turn_data):
        for turn_info in turn_data:
            actions = turn_info.get('actions', {})
            rewards = turn_info.get('rewards', {})
            
            for player_id, action_text in actions.items():
                reward = rewards.get(player_id, 0)
                
                if action_text not in action_reward_map:
                    action_reward_map[action_text] = []
                action_reward_map[action_text].append({
                    'reward': reward,
                    'player': player_id,
                    'run': run_idx,
                    'turn': turn_info.get('turn', 0)
                })
    
    # Analyze bias: same actions should get same rewards
    print(f"\nAnalyzing {len(action_reward_map)} unique actions across all runs...\n")
    
    biased_actions = []
    consistent_actions = []
    
    for action_text, reward_data in action_reward_map.items():
        rewards = [r['reward'] for r in reward_data]
        unique_rewards = set(rewards)
        
        if len(unique_rewards) == 1:
            # All same reward - consistent
            consistent_actions.append({
                'action': action_text,
                'reward': rewards[0],
                'count': len(rewards)
            })
        else:
            # Different rewards for same action - potential bias
            biased_actions.append({
                'action': action_text,
                'rewards': rewards,
                'unique_rewards': list(unique_rewards),
                'mean': statistics.mean(rewards),
                'std_dev': statistics.stdev(rewards) if len(rewards) > 1 else 0,
                'min': min(rewards),
                'max': max(rewards),
                'count': len(rewards)
            })
    
    print(f"‚úÖ Consistent Actions (same reward every time): {len(consistent_actions)}")
    print(f"‚ö†Ô∏è  Potentially Biased Actions (different rewards): {len(biased_actions)}\n")
    
    if consistent_actions:
        print("Consistent Actions (showing first 5):")
        for action_info in consistent_actions[:5]:
            print(f"  '{action_info['action'][:50]}...' ‚Üí {action_info['reward']} money ({action_info['count']} times)")
    
    if biased_actions:
        print(f"\n‚ö†Ô∏è  Potentially Biased Actions (showing all {len(biased_actions)}):")
        for action_info in biased_actions:
            print(f"\n  Action: '{action_info['action'][:60]}...'")
            print(f"    Occurrences: {action_info['count']}")
            print(f"    Reward Range: {action_info['min']} to {action_info['max']}")
            print(f"    Mean Reward: {action_info['mean']:.2f}")
            print(f"    Std Dev: {action_info['std_dev']:.2f}")
            print(f"    Unique Rewards: {action_info['unique_rewards']}")
    
    # Calculate average rewards per action
    print("\n" + "=" * 60)
    print("AVERAGE REWARDS BY ACTION")
    print("=" * 60)
    
    for action_text, reward_data in sorted(action_reward_map.items(), key=lambda x: -len(x[1])):
        rewards = [r['reward'] for r in reward_data]
        avg_reward = statistics.mean(rewards)
        std_reward = statistics.stdev(rewards) if len(rewards) > 1 else 0
        
        print(f"\n'{action_text[:60]}...'")
        print(f"  Count: {len(rewards)}")
        print(f"  Average Reward: {avg_reward:.2f} money")
        print(f"  Std Deviation: {std_reward:.2f}")
        print(f"  Range: {min(rewards)} - {max(rewards)}")
    
    # Overall bias summary
    print("\n" + "=" * 60)
    print("BIAS SUMMARY")
    print("=" * 60)
    
    total_actions = sum(len(rd) for rd in action_reward_map.values())
    consistent_count = sum(len(rd) for action, rd in action_reward_map.items() if len(set(r['reward'] for r in rd)) == 1)
    biased_count = total_actions - consistent_count
    
    consistency_percentage = (consistent_count / total_actions * 100) if total_actions > 0 else 0
    
    print(f"\nTotal action instances: {total_actions}")
    print(f"Consistent rewards: {consistent_count} ({consistency_percentage:.1f}%)")
    print(f"Variable rewards: {biased_count} ({100 - consistency_percentage:.1f}%)")
    
    if consistency_percentage >= 95:
        print("\n‚úÖ EXCELLENT: Judge is highly consistent - same actions get same rewards")
    elif consistency_percentage >= 80:
        print("\n‚ö†Ô∏è  MODERATE: Judge shows some variation - most actions get consistent rewards")
    else:
        print("\n‚ùå HIGH VARIANCE: Judge shows significant bias - same actions get different rewards")


Total successful evaluations: 60/60

CONSISTENCY ANALYSIS

Winner Distribution:
  player_1: 60 times (100.0%)

Winner Consistency: 100.0%
  (Same winner in 60/60 runs)

Final Wealth Analysis:
  Player 1 wealth:
    Mean: 250.00
    Std Dev: 0.00
    Range: 250 - 250
    Unique values: 1
  Player 2 wealth:
    Mean: 250.00
    Std Dev: 0.00
    Range: 250 - 250
    Unique values: 1

DETAILED FINAL MONEY ANALYSIS

üí∞ Theoretical Maximum Money:
  Starting Wealth: 100
  Max Turns: 3
  With Mock DM (50 per turn): 250
    Formula: 100 + (50 √ó 3) = 250
  With Real AI DM (up to 250 per turn): 850
    Formula: 100 + (250 √ó 3) = 850
  Note: Current setup uses Mock DM, so max is 250

üìä Player 1 Final Money Statistics:
  Starting Wealth: 100
  Final Wealth:
    Mean: 250.00
    Median: 250.00
    Std Dev: 0.00
    Min: 250
    Max: 250
    Range: 0
    Unique values: 1/60
  Money Earned (from starting):
    Total across all runs: 9000.00
    Average per run: 150.00
    Min earned: 150
    M

In [None]:
# Preview the first five successful runs so the raw outcomes can be inspected.
# Every preview line is described by a (label, result key, fallback) triple.
preview_fields = (
    ("Winner", 'winner', 'unknown'),
    ("Final Wealth", 'final_wealth', {}),
    ("Final Health", 'final_health', {}),
    ("Turns Played", 'turns_played', 0),
    ("Game Over", 'is_game_over', False),
)
separator = "-" * 60

print("Sample Results (first 5 successful runs):\n")
for run_record in successful_results[:5]:
    print(f"Run {run_record['run_number']}:")
    result = run_record['result']
    for label, key, fallback in preview_fields:
        print(f"  {label}: {result.get(key, fallback)}")
    print(separator)


Sample Results (first 5 successful runs):

Run 1:
  Winner: player_1
  Final Wealth: {'player_1': 250, 'player_2': 250}
  Final Health: {'player_1': 115, 'player_2': 115}
  Turns Played: 3
  Game Over: True
------------------------------------------------------------
Run 2:
  Winner: player_1
  Final Wealth: {'player_1': 250, 'player_2': 250}
  Final Health: {'player_1': 115, 'player_2': 115}
  Turns Played: 3
  Game Over: True
------------------------------------------------------------
Run 3:
  Winner: player_1
  Final Wealth: {'player_1': 250, 'player_2': 250}
  Final Health: {'player_1': 115, 'player_2': 115}
  Turns Played: 3
  Game Over: True
------------------------------------------------------------
Run 4:
  Winner: player_1
  Final Wealth: {'player_1': 250, 'player_2': 250}
  Final Health: {'player_1': 115, 'player_2': 115}
  Turns Played: 3
  Game Over: True
------------------------------------------------------------
Run 5:
  Winner: player_1
  Final Wealth: {'player_1': 25

In [None]:
# Show detailed error information for debugging.
#
# Prints the error type and message of the first three failed runs found in
# `all_results`. Messages longer than MAX_ERROR_CHARS are cut to that length
# so the "(Error message truncated ...)" notice below is accurate; previously
# the notice was printed while the full message was still shown.
MAX_ERROR_CHARS = 500  # longest error message shown in full

failed_runs = [r for r in all_results if not r['success']]
if failed_runs:
    print(f"\nDetailed Error Information (first 3 failures):")
    for failed in failed_runs[:3]:
        # str() guards against a non-string value stored under 'error'.
        error_message = str(failed.get('error', 'Unknown error'))
        print(f"\n--- Run {failed['run_number']} ---")
        print(f"Error Type: {failed.get('error_type', 'Unknown')}")
        print(f"Error Message: {error_message[:MAX_ERROR_CHARS]}")
        if len(error_message) > MAX_ERROR_CHARS:
            print(f"(Error message truncated, full length: {len(error_message)})")


In [None]:
# Smoke-test one game with the shared test configuration, reporting the full
# traceback if anything goes wrong.
import traceback

print("Testing a single game run with detailed error handling...\n")

try:
    game_kwargs = {
        "max_turns": TEST_CONFIG["max_turns"],
        "world_size": TEST_CONFIG["world_size"],
        "starting_wealth": TEST_CONFIG["starting_wealth"],
        "fixed_responses": FIXED_RESPONSES,
    }
    result = run_single_game(**game_kwargs)
    print("‚úÖ Single test run successful!")
    print(f"Result: {result}")
except Exception as exc:
    failure_name = type(exc).__name__
    print("‚ùå Single test run failed!")
    print(f"Error Type: {failure_name}")
    print(f"Error Message: {exc}")
    print("\nFull traceback:")
    traceback.print_exc()


Testing a single game run with detailed error handling...

[Game] Game ended: Maximum turns reached (3/3)
‚úÖ Single test run successful!
Result: {'winner': 'player_1', 'final_wealth': {'player_1': 250, 'player_2': 250}, 'final_health': {'player_1': 115, 'player_2': 115}, 'turns_played': 3, 'is_game_over': True, 'game_over_reason': 'Maximum turns reached (3/3)', 'turn_data': [{'turn': 1, 'actions': {'player_1': 'I will move north and explore the area', 'player_2': 'I will move east and collect resources'}, 'rewards': {'player_1': 50, 'player_2': 50}}, {'turn': 2, 'actions': {'player_1': 'I will move north and explore the area', 'player_2': 'I will move east and collect resources'}, 'rewards': {'player_1': 50, 'player_2': 50}}, {'turn': 3, 'actions': {'player_1': 'I will move north and explore the area', 'player_2': 'I will move east and collect resources'}, 'rewards': {'player_1': 50, 'player_2': 50}}]}


In [None]:
# Test AI-based Eval Service
#
# Calls EvalWrapper.evaluate EVAL_NUM_RUNS times with identical inputs and
# records every outcome in `eval_results` as a dict:
#   success -> {"run", "success": True, "result"}
#   failure -> {"run", "success": False, "error", "error_type"}
# Later cells read `eval_results` to measure score consistency.
from eval import EvalWrapper, quick_evaluate

# Test configuration
TEST_ENVIRONMENT = "You are a customer service agent. A customer is complaining about a delayed order and is very upset."
TEST_RESPONSE = "I sincerely apologize for the inconvenience. Let me immediately check your order status and provide you with an updated timeline. I understand your frustration and want to resolve this quickly."

EVAL_NUM_RUNS = 60
EVAL_AI_MODEL = "gpt-4o-mini"  # AI model to use for evaluation
PROGRESS_EVERY = 10            # print a progress line after this many runs
MAX_ERRORS_SHOWN = 3           # echo at most this many errors while running

print("=" * 60)
print("AI-BASED EVAL SERVICE CONSISTENCY TEST")
print("=" * 60)
print(f"Testing if the AI evaluator gives consistent scores for the same inputs")
print(f"\nConfiguration:")
print(f"  Runs: {EVAL_NUM_RUNS}")
print(f"  Service Type: custom (AI-based)")
print(f"  AI Model: {EVAL_AI_MODEL}")
print(f"  Environment: {TEST_ENVIRONMENT[:80]}...")
print(f"  Response: {TEST_RESPONSE[:80]}...")
print(f"\nNote: Using real AI model with environment variables loaded from backend/.env\n")

eval_results = []
errors_shown = 0

for run_number in range(1, EVAL_NUM_RUNS + 1):
    try:
        result = EvalWrapper.evaluate(
            environment_text=TEST_ENVIRONMENT,
            user_response_text=TEST_RESPONSE,
            service_type="custom",
            ai_model=EVAL_AI_MODEL,
            verbose=False
        )
    except Exception as e:
        # Record the failure and keep going: one failed call must not abort
        # the remaining runs of the consistency test.
        eval_results.append({
            "run": run_number,
            "success": False,
            "error": str(e),
            "error_type": type(e).__name__
        })
        # Echo the first few errors (by count, not by run index) so failures
        # that start late in the loop are still visible while it runs.
        if errors_shown < MAX_ERRORS_SHOWN:
            print(f"Error in run {run_number}: {type(e).__name__}: {str(e)[:100]}")
            errors_shown += 1
    else:
        eval_results.append({
            "run": run_number,
            "success": True,
            "result": result
        })

    # Report progress only after the run has actually finished.
    if run_number % PROGRESS_EVERY == 0:
        print(f"Completed {run_number}/{EVAL_NUM_RUNS} evaluations...")

failed_eval = [r for r in eval_results if not r['success']]
successful_count = len(eval_results) - len(failed_eval)

print(f"\nEval Service Results:")
print(f"  Successful: {successful_count}/{EVAL_NUM_RUNS}")
print(f"  Failed: {len(failed_eval)}/{EVAL_NUM_RUNS}")

if failed_eval:
    error_counts = Counter(r.get('error_type', 'Unknown') for r in failed_eval)
    print(f"\nError breakdown:")
    for error_type, count in error_counts.items():
        print(f"  {error_type}: {count}")


AI-BASED EVAL SERVICE CONSISTENCY TEST
Testing if the AI evaluator gives consistent scores for the same inputs

Configuration:
  Runs: 60
  Service Type: custom (AI-based)
  AI Model: gpt-4o-mini
  Environment: You are a customer service agent. A customer is complaining about a delayed orde...
  Response: I sincerely apologize for the inconvenience. Let me immediately check your order...

Note: Using real AI model with environment variables loaded from backend/.env

Completed 10/60 evaluations...
Completed 20/60 evaluations...
Completed 30/60 evaluations...
Completed 40/60 evaluations...
Completed 50/60 evaluations...
Completed 60/60 evaluations...

Eval Service Results:
  Successful: 60/60
  Failed: 0/60


In [None]:
# Analyze AI Eval Service Results
#
# Reports how much the evaluator's output varies across the successful runs in
# `eval_results`: the overall score, each individual metric, and the reasoning
# text. Metrics are reported in a fixed order so the output of this cell is
# identical between kernel sessions (a set of names iterates in hash order,
# which changes from one Python process to the next).
successful_eval = [r for r in eval_results if r['success']]

if len(successful_eval) > 0:
    scores = [r['result'].get('score', 0) for r in successful_eval]

    # Custom eval returns: score, appropriateness, completeness, clarity, creativity, action_validity
    # (different structure than mock). Only metrics present in at least one
    # result are reported, in the order listed here.
    EXPECTED_METRICS = ['score', 'appropriateness', 'completeness', 'clarity', 'creativity', 'action_validity']
    metric_keys = [
        key for key in EXPECTED_METRICS
        if any(key in r['result'] for r in successful_eval)
    ]
    # A run that omits a reported metric contributes 0 for that metric.
    all_metrics = {
        key: [r['result'].get(key, 0) for r in successful_eval]
        for key in metric_keys
    }

    print("=" * 60)
    print("AI EVAL SERVICE - SCORE CONSISTENCY ANALYSIS")
    print("=" * 60)

    MAX_SCORE = 1.0  # Theoretical maximum score
    MAX_METRIC = 1.0  # All metrics are 0-1 scale

    # Summary statistics are computed once and reused by every line below.
    score_mean = statistics.mean(scores)
    # statistics.stdev needs at least two data points; report 0 for a single run.
    score_std = statistics.stdev(scores) if len(scores) > 1 else 0
    score_min = min(scores)
    score_max = max(scores)
    unique_scores = len(set(scores))

    print(f"\nOverall Score Analysis:")
    print(f"  Mean: {score_mean:.4f}")
    print(f"  Std Dev: {score_std:.4f}")
    print(f"  Min: {score_min:.4f}")
    print(f"  Max: {score_max:.4f}")
    print(f"  Unique values: {unique_scores}")
    print(f"\n  Maximum Possible Score: {MAX_SCORE:.4f}")
    print(f"  Highest Score Achieved: {score_max:.4f}")
    print(f"  Mean Score: {score_mean:.4f}")
    print(f"  % of Maximum (Mean): {(score_mean / MAX_SCORE * 100):.2f}%")
    print(f"  % of Maximum (Max): {(score_max / MAX_SCORE * 100):.2f}%")
    print(f"  Distance from Max (Mean): {MAX_SCORE - score_mean:.4f}")
    print(f"  Distance from Max (Best): {MAX_SCORE - score_max:.4f}")

    # Classify the spread of the overall score by its standard deviation.
    if unique_scores == 1:
        print(f"\n  ‚úÖ PERFECT CONSISTENCY: All scores are identical ({scores[0]:.4f})")
    elif score_std < 0.01:
        print(f"\n  ‚úÖ HIGH CONSISTENCY: Very low variance ({score_std:.4f})")
    elif score_std < 0.05:
        print(f"\n  ‚ö†Ô∏è  MODERATE CONSISTENCY: Some variance ({score_std:.4f})")
    else:
        print(f"\n  ‚ùå LOW CONSISTENCY: High variance ({score_std:.4f})")

    # Analyze individual metrics
    print(f"\nIndividual Metrics Analysis:")
    for metric_name, values in all_metrics.items():
        unique_count = len(set(values))
        mean_val = statistics.mean(values)
        std_val = statistics.stdev(values) if len(values) > 1 else 0
        max_val = max(values)

        print(f"\n  {metric_name}:")
        print(f"    Mean: {mean_val:.4f}")
        print(f"    Std Dev: {std_val:.4f}")
        print(f"    Max Achieved: {max_val:.4f}")
        print(f"    Maximum Possible: {MAX_METRIC:.4f}")
        print(f"    % of Maximum (Mean): {(mean_val / MAX_METRIC * 100):.2f}%")
        print(f"    % of Maximum (Max): {(max_val / MAX_METRIC * 100):.2f}%")
        print(f"    Unique values: {unique_count}")
        if unique_count == 1:
            print(f"    ‚úÖ Perfectly consistent")
        elif std_val < 0.01:
            print(f"    ‚úÖ Highly consistent")
        elif std_val < 0.05:
            print(f"    ‚ö†Ô∏è  Moderate variance")
        else:
            print(f"    ‚ùå High variance")

    # Check reasoning consistency: count distinct reasoning strings
    # (exact text match; a missing reasoning counts as the empty string).
    reasonings = [r['result'].get('reasoning', '') for r in successful_eval]
    unique_reasonings = len(set(reasonings))
    print(f"\nReasoning Consistency:")
    print(f"  Unique reasonings: {unique_reasonings}/{len(reasonings)}")
    if unique_reasonings == 1:
        print(f"  ‚úÖ Perfectly consistent reasoning")
    elif unique_reasonings < len(reasonings) * 0.1:
        print(f"  ‚úÖ Mostly consistent reasoning (few unique variants)")
    else:
        print(f"  ‚ö†Ô∏è  Reasoning varies significantly across runs")
else:
    print("No successful evaluations to analyze!")


AI EVAL SERVICE - SCORE CONSISTENCY ANALYSIS

Overall Score Analysis:
  Mean: 0.9000
  Std Dev: 0.0000
  Min: 0.9000
  Max: 0.9000
  Unique values: 1

  Maximum Possible Score: 1.0000
  Highest Score Achieved: 0.9000
  Mean Score: 0.9000
  % of Maximum (Mean): 90.00%
  % of Maximum (Max): 90.00%
  Distance from Max (Mean): 0.1000
  Distance from Max (Best): 0.1000

  ‚úÖ PERFECT CONSISTENCY: All scores are identical (0.9000)

Individual Metrics Analysis:

  appropriateness:
    Mean: 1.0000
    Std Dev: 0.0000
    Max Achieved: 1.0000
    Maximum Possible: 1.0000
    % of Maximum (Mean): 100.00%
    % of Maximum (Max): 100.00%
    Unique values: 1
    ‚úÖ Perfectly consistent

  score:
    Mean: 0.9000
    Std Dev: 0.0000
    Max Achieved: 0.9000
    Maximum Possible: 1.0000
    % of Maximum (Mean): 90.00%
    % of Maximum (Max): 90.00%
    Unique values: 1
    ‚úÖ Perfectly consistent

  completeness:
    Mean: 0.8500
    Std Dev: 0.0092
    Max Achieved: 0.9000
    Maximum Possible: 

In [None]:
# Additional Analysis: Score Distribution and Patterns
# Tallies rounded scores, buckets them into quality bands, and reports the
# coefficient of variation of the overall score.
if successful_eval:
    # Get scores from successful evaluations
    scores = [entry['result'].get('score', 0) for entry in successful_eval]
    total_scores = len(scores)

    print("\n" + "=" * 60)
    print("ADDITIONAL ANALYSIS")
    print("=" * 60)

    # Score distribution (scores rounded to two decimals before counting)
    print("\nScore Distribution:")
    score_counts = Counter(round(value, 2) for value in scores)
    print("  Most common scores:")
    for score_val, occurrences in score_counts.most_common(10):
        share = (occurrences / total_scores) * 100
        print(f"    {score_val:.2f}: {occurrences} times ({share:.1f}%)")

    # Quality bands: each label is paired with its membership test.
    band_tests = (
        ("0.9-1.0 (Excellent)", lambda s: 0.9 <= s <= 1.0),
        ("0.8-0.9 (Very Good)", lambda s: 0.8 <= s < 0.9),
        ("0.7-0.8 (Good)", lambda s: 0.7 <= s < 0.8),
        ("0.6-0.7 (Fair)", lambda s: 0.6 <= s < 0.7),
        ("0.0-0.6 (Poor)", lambda s: 0.0 <= s < 0.6),
    )
    score_ranges = {
        label: sum(1 for value in scores if in_band(value))
        for label, in_band in band_tests
    }

    print("\nScore Range Distribution:")
    for range_name, members in score_ranges.items():
        share = (members / total_scores) * 100
        print(f"  {range_name}: {members} ({share:.1f}%)")

    # Relative spread; only defined when there are at least two scores.
    if total_scores > 1:
        score_variance = statistics.stdev(scores)
        if statistics.mean(scores) > 0:
            coefficient_of_variation = (score_variance / statistics.mean(scores)) * 100
        else:
            coefficient_of_variation = 0

        print("\nVariance Analysis:")
        print(f"  Coefficient of Variation: {coefficient_of_variation:.2f}%")
        # First threshold the coefficient falls under picks the verdict.
        cv_verdicts = (
            (5, "    ‚úÖ Very low relative variance - highly consistent"),
            (10, "    ‚ö†Ô∏è  Low relative variance - generally consistent"),
            (20, "    ‚ö†Ô∏è  Moderate relative variance - some inconsistency"),
        )
        verdict = "    ‚ùå High relative variance - significant inconsistency"
        for upper_limit, message in cv_verdicts:
            if coefficient_of_variation < upper_limit:
                verdict = message
                break
        print(verdict)



ADDITIONAL ANALYSIS

Score Distribution:
  Most common scores:
    0.90: 60 times (100.0%)

Score Range Distribution:
  0.9-1.0 (Excellent): 60 (100.0%)
  0.8-0.9 (Very Good): 0 (0.0%)
  0.7-0.8 (Good): 0 (0.0%)
  0.6-0.7 (Fair): 0 (0.0%)
  0.0-0.6 (Poor): 0 (0.0%)

Variance Analysis:
  Coefficient of Variation: 0.00%
    ‚úÖ Very low relative variance - highly consistent


In [None]:
# This cell is intentionally left empty: `pass` is a no-op, so running it
# changes nothing.
# All analysis is done in cells 14, 15, and 17
# NOTE(review): those cell numbers cannot be confirmed from this cell and may
# be stale - verify them, or delete this placeholder.
pass


In [None]:
# Final Summary
# Condenses the AI eval consistency test into headline statistics, a
# consistency verdict and recommendations; if no run succeeded, summarises
# the recorded errors instead.
banner = "=" * 60
print("\n" + banner)
print("AI EVAL SERVICE - FINAL SUMMARY")
print(banner)

if not successful_eval:
    print("\n‚ùå No successful evaluations to analyze!")
    if eval_results:
        failed_entries = [entry for entry in eval_results if not entry['success']]

        print("\nErrors encountered:")
        error_counts = Counter(entry.get('error_type', 'Unknown') for entry in failed_entries)
        for error_type, count in error_counts.most_common(5):
            print(f"  {error_type}: {count} times")

        print("\nFirst few error messages:")
        for result in failed_entries[:3]:
            print(f"  Run {result['run']}: {result.get('error', 'Unknown')[:100]}")
else:
    MAX_SCORE = 1.0
    eval_scores = [entry['result'].get('score', 0) for entry in successful_eval]

    # A sample standard deviation needs two or more scores.
    eval_std = statistics.stdev(eval_scores) if len(eval_scores) > 1 else 0
    eval_mean = statistics.mean(eval_scores)
    eval_max = max(eval_scores)
    eval_min = min(eval_scores)

    print("\nOverall Performance:")
    performance_rows = (
        ("Mean Score", eval_mean),
        ("Max Score", eval_max),
        ("Min Score", eval_min),
        ("Score Range", eval_max - eval_min),
        ("Std Deviation", eval_std),
    )
    for label, value in performance_rows:
        print(f"  {label}: {value:.4f}")

    print("\nMaximum Achievement:")
    for label, value in (
        ("Maximum Possible Score", MAX_SCORE),
        ("Highest Score Achieved", eval_max),
        ("Mean Score", eval_mean),
    ):
        print(f"  {label}: {value:.4f}")
    for label, value in (("Mean", eval_mean), ("Max", eval_max)):
        print(f"  % of Maximum ({label}): {(value / MAX_SCORE * 100):.2f}%")
    for label, value in (("Mean", eval_mean), ("Best", eval_max)):
        print(f"  Distance from Max ({label}): {MAX_SCORE - value:.4f}")

    # Verdict on run-to-run spread, keyed on the standard deviation.
    print("\nConsistency Assessment:")
    if eval_std == 0:
        assessment = [
            "  ‚úÖ PERFECT CONSISTENCY: All scores are identical",
        ]
    elif eval_std < 0.01:
        assessment = [
            f"  ‚úÖ EXCELLENT CONSISTENCY: Very low variance ({eval_std:.4f})",
            "     AI evaluator is highly consistent for same inputs",
        ]
    elif eval_std < 0.05:
        assessment = [
            f"  ‚ö†Ô∏è  MODERATE CONSISTENCY: Some variance ({eval_std:.4f})",
            "     AI evaluator shows some variation but generally consistent",
        ]
    elif eval_std < 0.1:
        assessment = [
            f"  ‚ö†Ô∏è  MODERATE VARIANCE: Noticeable variance ({eval_std:.4f})",
            "     AI evaluator shows moderate variation",
        ]
    else:
        assessment = [
            f"  ‚ùå HIGH VARIANCE: Significant variance ({eval_std:.4f})",
            "     AI evaluator shows high inconsistency for same inputs",
        ]
    for line in assessment:
        print(line)

    print("\nRecommendations:")
    if eval_std >= 0.1:
        advice = [
            "  ‚ùå The eval service shows high variance - same inputs produce different scores",
            "     Consider:",
            "     - Using temperature=0 for more deterministic outputs",
            "     - Averaging multiple evaluation runs",
            "     - Using a more deterministic evaluation approach",
        ]
    elif eval_std >= 0.05:
        advice = [
            "  ‚ö†Ô∏è  The eval service shows some variance - consider averaging multiple evaluations",
            "     This will help reduce variance and provide more reliable scores",
        ]
    else:
        advice = ["  ‚úÖ The AI eval service is sufficiently consistent for production use"]
        if eval_max >= 0.9:
            advice.append("  ‚úÖ Scores are close to maximum - evaluator is performing well")
        elif eval_max >= 0.7:
            advice.append("  ‚ö†Ô∏è  Scores are moderate - evaluator may be conservative")
        else:
            advice.append("  ‚ö†Ô∏è  Scores are low - evaluator may be too strict or response needs improvement")
    for line in advice:
        print(line)



AI EVAL SERVICE - FINAL SUMMARY

Overall Performance:
  Mean Score: 0.9000
  Max Score: 0.9000
  Min Score: 0.9000
  Score Range: 0.0000
  Std Deviation: 0.0000

Maximum Achievement:
  Maximum Possible Score: 1.0000
  Highest Score Achieved: 0.9000
  Mean Score: 0.9000
  % of Maximum (Mean): 90.00%
  % of Maximum (Max): 90.00%
  Distance from Max (Mean): 0.1000
  Distance from Max (Best): 0.1000

Consistency Assessment:
  ‚úÖ PERFECT CONSISTENCY: All scores are identical

Recommendations:
  ‚úÖ The AI eval service is sufficiently consistent for production use
  ‚úÖ Scores are close to maximum - evaluator is performing well


In [None]:
# Report game runs that did not complete, listing at most the first five.
failed_runs = [run for run in all_results if not run['success']]
if not failed_runs:
    print("\n‚úÖ All runs completed successfully!")
else:
    print(f"\nFailed Runs ({len(failed_runs)}):")
    # Show first 5 failures
    for failure in failed_runs[:5]:
        reason = failure.get('error', 'Unknown error')
        print(f"  Run {failure['run_number']}: {reason}")



‚úÖ All runs completed successfully!
