# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
# Rush Hour Model Response Evaluator
# Comprehensive evaluation system for assessing model solutions

import re
import os
import json
import pandas as pd
from collections import defaultdict
from typing import List, Tuple, Dict, Optional, Union
import matplotlib.pyplot as plt
import seaborn as sns

# Import the solver functions from the original generator
# (You'll need to ensure the original file is importable or copy the necessary functions)

# Side length of the square puzzle board (number of cells per row and per column).
GRID_SIZE = 3

class RushHourEvaluator:
    def __init__(self, dataset_path: str = "data"):
        self.dataset_path = dataset_path
        self.results = []
        
    def parse_solution_from_response(self, response: str) -> Optional[List[str]]:
        """Extract solution from model response with multiple parsing strategies"""
        
        # Strategy 1: Look for <solution> tags
        solution_match = re.search(r'<solution>(.*?)</solution>', response, re.DOTALL | re.IGNORECASE)
        if solution_match:
            solution_text = solution_match.group(1).strip()
        else:
            # Strategy 2: Look for step patterns in the entire response
            solution_text = response
        
        # Extract step lines
        step_pattern = r'Step\s+\d+:\s*([A-Z]\d*)\s*\[(\d+),(\d+)\]\s*->\s*\[(\d+),(\d+)\]'
        matches = re.findall(step_pattern, solution_text, re.IGNORECASE)
        
        if not matches:
            return None
            
        moves = []
        for match in matches:
            piece, start_row, start_col, end_row, end_col = match
            moves.append({
                'piece': piece.upper(),
                'start': (int(start_row), int(start_col)),
                'end': (int(end_row), int(end_col))
            })
        
        return moves
    
    def load_puzzle_data(self, puzzle_folder: str) -> Dict:
        """Load puzzle data including initial state and reference solution"""
        puzzle_data = {}
        
        # Load puzzle grid from solution.txt metadata
        solution_file = os.path.join(puzzle_folder, "solution.txt")
        if os.path.exists(solution_file):
            with open(solution_file, 'r') as f:
                content = f.read()
                
            # Extract exit position
            exit_match = re.search(r'Exit position: \[(\d+),(\d+)\]', content)
            if exit_match:
                puzzle_data['exit_pos'] = (int(exit_match.group(1)), int(exit_match.group(2)))
            
            # Extract reference solution moves
            solution_lines = []
            in_solution = False
            for line in content.split('\n'):
                if line.strip() == "Solution:":
                    in_solution = True
                    continue
                if in_solution and line.strip().startswith("Step"):
                    solution_lines.append(line.strip())
            
            puzzle_data['reference_moves'] = self.parse_solution_text(solution_lines)
            puzzle_data['optimal_length'] = len(puzzle_data['reference_moves'])
        
        return puzzle_data
    
    def parse_solution_text(self, solution_lines: List[str]) -> List[Dict]:
        """Parse solution text lines into move format"""
        moves = []
        for line in solution_lines:
            match = re.search(r'Step\s+\d+:\s*([A-Z]\d*)\s*\[(\d+),(\d+)\]\s*->\s*\[(\d+),(\d+)\]', line)
            if match:
                piece, start_row, start_col, end_row, end_col = match.groups()
                moves.append({
                    'piece': piece.upper(),
                    'start': (int(start_row), int(start_col)),
                    'end': (int(end_row), int(end_col))
                })
        return moves
    
    def reconstruct_initial_grid(self, puzzle_data: Dict) -> List[List[str]]:
        """Reconstruct initial grid state from reference solution by working backwards"""
        if not puzzle_data.get('reference_moves'):
            return None
            
        # Start with final state and work backwards
        grid = [['.' for _ in range(GRID_SIZE)] for _ in range(GRID_SIZE)]
        
        # Place car at exit in final state
        exit_pos = puzzle_data['exit_pos']
        exit_row, exit_col = exit_pos[0] - 1, exit_pos[1] - 1  # Convert to 0-indexed
        grid[exit_row][exit_col] = 'C'
        
        # Work backwards through moves
        moves = puzzle_data['reference_moves'][::-1]  # Reverse order
        pieces_seen = {'C'}
        
        for move in moves:
            piece = move['piece']
            pieces_seen.add(piece)
            # In reverse: end -> start becomes start -> end
            start_pos = (move['end'][0] - 1, move['end'][1] - 1)  # Convert to 0-indexed
            end_pos = (move['start'][0] - 1, move['start'][1] - 1)
            
            # Move piece from start to end position
            grid[start_pos[0]][start_pos[1]] = '.'
            grid[end_pos[0]][end_pos[1]] = piece
        
        # Fill remaining empty cells with blockers if needed
        empty_count = sum(row.count('.') for row in grid)
        if empty_count > 1:  # Should have exactly 1 empty cell
            # This suggests there are additional blockers not involved in solution
            # We'll need to infer their positions or load from image analysis
            pass
            
        return grid
    
    def simulate_moves(self, initial_grid: List[List[str]], moves: List[Dict], 
                      exit_pos: Tuple[int, int]) -> Tuple[bool, str, List[List[List[str]]]]:
        """Simulate moves and check validity"""
        if not initial_grid or not moves:
            return False, "No grid or moves provided", []
        
        grid = [row[:] for row in initial_grid]  # Deep copy
        states = [grid]
        exit_row, exit_col = exit_pos[0] - 1, exit_pos[1] - 1  # Convert to 0-indexed
        
        for i, move in enumerate(moves):
            piece = move['piece']
            start_row, start_col = move['start'][0] - 1, move['start'][1] - 1  # Convert to 0-indexed
            end_row, end_col = move['end'][0] - 1, move['end'][1] - 1
            
            # Validate move
            if not (0 <= start_row < GRID_SIZE and 0 <= start_col < GRID_SIZE):
                return False, f"Step {i+1}: Start position out of bounds", states
            
            if not (0 <= end_row < GRID_SIZE and 0 <= end_col < GRID_SIZE):
                return False, f"Step {i+1}: End position out of bounds", states
            
            if grid[start_row][start_col] != piece:
                return False, f"Step {i+1}: Piece {piece} not at start position [{move['start'][0]},{move['start'][1]}]", states
            
            if grid[end_row][end_col] != '.':
                return False, f"Step {i+1}: End position [{move['end'][0]},{move['end'][1]}] is occupied", states
            
            # Check if move is adjacent (only one square)
            if abs(start_row - end_row) + abs(start_col - end_col) != 1:
                return False, f"Step {i+1}: Move is not adjacent (must be exactly 1 square)", states
            
            # Apply move
            grid[start_row][start_col] = '.'
            grid[end_row][end_col] = piece
            states.append([row[:] for row in grid])
        
        # Check if solved
        if grid[exit_row][exit_col] == 'C':
            return True, "Solution correct!", states
        else:
            car_pos = None
            for r in range(GRID_SIZE):
                for c in range(GRID_SIZE):
                    if grid[r][c] == 'C':
                        car_pos = (r+1, c+1)  # Convert to 1-indexed
                        break
            return False, f"Car not at target. Car at [{car_pos[0]},{car_pos[1]}], target at [{exit_pos[0]},{exit_pos[1]}]", states
    
    def calculate_progress_score(self, initial_grid: List[List[str]], moves: List[Dict], 
                                exit_pos: Tuple[int, int]) -> Dict:
        """Calculate progress made towards solution even if not complete"""
        if not initial_grid:
            return {"progress_score": 0, "details": "No initial grid"}
        
        # Find initial car position
        initial_car_pos = None
        for r in range(GRID_SIZE):
            for c in range(GRID_SIZE):
                if initial_grid[r][c] == 'C':
                    initial_car_pos = (r, c)
                    break
        
        if not initial_car_pos:
            return {"progress_score": 0, "details": "No car found"}
        
        exit_row, exit_col = exit_pos[0] - 1, exit_pos[1] - 1  # Convert to 0-indexed
        
        # Calculate initial distance
        initial_distance = abs(initial_car_pos[0] - exit_row) + abs(initial_car_pos[1] - exit_col)
        
        if not moves:
            return {
                "progress_score": 0,
                "initial_distance": initial_distance,
                "final_distance": initial_distance,
                "details": "No moves provided"
            }
        
        # Simulate valid moves to find final car position
        grid = [row[:] for row in initial_grid]
        valid_moves = 0
        
        for i, move in enumerate(moves):
            piece = move['piece']
            start_row, start_col = move['start'][0] - 1, move['start'][1] - 1
            end_row, end_col = move['end'][0] - 1, move['end'][1] - 1
            
            # Check if move is valid
            if (0 <= start_row < GRID_SIZE and 0 <= start_col < GRID_SIZE and
                0 <= end_row < GRID_SIZE and 0 <= end_col < GRID_SIZE and
                grid[start_row][start_col] == piece and
                grid[end_row][end_col] == '.' and
                abs(start_row - end_row) + abs(start_col - end_col) == 1):
                
                # Apply move
                grid[start_row][start_col] = '.'
                grid[end_row][end_col] = piece
                valid_moves += 1
            else:
                break  # Stop at first invalid move
        
        # Find final car position
        final_car_pos = None
        for r in range(GRID_SIZE):
            for c in range(GRID_SIZE):
                if grid[r][c] == 'C':
                    final_car_pos = (r, c)
                    break
        
        if not final_car_pos:
            final_car_pos = initial_car_pos
        
        final_distance = abs(final_car_pos[0] - exit_row) + abs(final_car_pos[1] - exit_col)
        
        # Calculate progress score (0-1, where 1 is solved)
        if final_distance == 0:
            progress_score = 1.0
        elif initial_distance == 0:
            progress_score = 1.0
        else:
            # Score based on distance reduction
            distance_improvement = initial_distance - final_distance
            progress_score = max(0, distance_improvement / initial_distance)
            
            # Bonus for valid moves
            if valid_moves > 0:
                progress_score += 0.1 * (valid_moves / len(moves))
            
            progress_score = min(1.0, progress_score)
        
        return {
            "progress_score": progress_score,
            "initial_distance": initial_distance,
            "final_distance": final_distance,
            "valid_moves": valid_moves,
            "total_moves": len(moves),
            "details": f"Reduced distance from {initial_distance} to {final_distance}"
        }
    
    def evaluate_response(self, puzzle_folder: str, model_response: str, 
                         model_name: str = "unknown") -> Dict:
        """Comprehensive evaluation of a model response"""
        puzzle_name = os.path.basename(puzzle_folder)
        
        # Load puzzle data
        puzzle_data = self.load_puzzle_data(puzzle_folder)
        if not puzzle_data:
            return {
                "puzzle": puzzle_name,
                "model": model_name,
                "status": "ERROR",
                "error": "Could not load puzzle data"
            }
        
        # Parse model response
        moves = self.parse_solution_from_response(model_response)
        
        result = {
            "puzzle": puzzle_name,
            "model": model_name,
            "response_length": len(model_response),
            "moves_found": len(moves) if moves else 0,
            "optimal_length": puzzle_data.get('optimal_length', 0),
            "exit_position": puzzle_data.get('exit_pos', (0, 0))
        }
        
        if not moves:
            result.update({
                "status": "PARSE_ERROR",
                "correctness": "INVALID",
                "progress_score": 0,
                "error": "Could not parse any moves from response"
            })
            return result
        
        # Reconstruct initial grid
        initial_grid = self.reconstruct_initial_grid(puzzle_data)
        if not initial_grid:
            result.update({
                "status": "GRID_ERROR",
                "correctness": "UNKNOWN",
                "error": "Could not reconstruct initial grid"
            })
            return result
        
        # Simulate moves
        is_correct, message, states = self.simulate_moves(
            initial_grid, moves, puzzle_data['exit_pos']
        )
        
        # Calculate progress
        progress_info = self.calculate_progress_score(
            initial_grid, moves, puzzle_data['exit_pos']
        )
        
        # Determine correctness level
        if is_correct:
            if len(moves) == puzzle_data['optimal_length']:
                correctness = "OPTIMAL"
            else:
                correctness = "CORRECT_SUBOPTIMAL"
        else:
            if progress_info['valid_moves'] == 0:
                correctness = "INVALID"
            elif progress_info['progress_score'] > 0.5:
                correctness = "PARTIAL_GOOD"
            else:
                correctness = "PARTIAL_POOR"
        
        result.update({
            "status": "EVALUATED",
            "correctness": correctness,
            "is_solved": is_correct,
            "progress_score": progress_info['progress_score'],
            "valid_moves": progress_info['valid_moves'],
            "message": message,
            "efficiency": len(moves) / puzzle_data['optimal_length'] if puzzle_data['optimal_length'] > 0 else float('inf')
        })
        
        return result
    
    def evaluate_dataset(self, responses_file: str) -> pd.DataFrame:
        """Evaluate all responses in a dataset"""
        # Load responses (assuming JSON format with puzzle_name -> response mapping)
        with open(responses_file, 'r') as f:
            responses = json.load(f)
        
        results = []
        
        for puzzle_name, response_data in responses.items():
            puzzle_folder = os.path.join(self.dataset_path, puzzle_name)
            
            if isinstance(response_data, dict):
                # Multiple models
                for model_name, response in response_data.items():
                    result = self.evaluate_response(puzzle_folder, response, model_name)
                    results.append(result)
            else:
                # Single response
                result = self.evaluate_response(puzzle_folder, response_data)
                results.append(result)
        
        return pd.DataFrame(results)
    
    def generate_report(self, results_df: pd.DataFrame) -> Dict:
        """Generate comprehensive evaluation report"""
        report = {}
        
        # Overall statistics
        total_puzzles = len(results_df)
        report['total_puzzles'] = total_puzzles
        
        # By correctness level
        correctness_counts = results_df['correctness'].value_counts()
        report['correctness_breakdown'] = correctness_counts.to_dict()
        
        # Success rates
        optimal_rate = (correctness_counts.get('OPTIMAL', 0) / total_puzzles) * 100
        correct_rate = ((correctness_counts.get('OPTIMAL', 0) + 
                        correctness_counts.get('CORRECT_SUBOPTIMAL', 0)) / total_puzzles) * 100
        
        report['optimal_rate'] = optimal_rate
        report['correct_rate'] = correct_rate
        
        # Progress analysis
        report['average_progress'] = results_df['progress_score'].mean()
        report['median_progress'] = results_df['progress_score'].median()
        
        # Efficiency analysis (for correct solutions)
        correct_solutions = results_df[results_df['is_solved'] == True]
        if len(correct_solutions) > 0:
            report['average_efficiency'] = correct_solutions['efficiency'].mean()
            report['median_efficiency'] = correct_solutions['efficiency'].median()
        
        # By model (if multiple models)
        if 'model' in results_df.columns:
            model_performance = results_df.groupby('model').agg({
                'correctness': lambda x: (x == 'OPTIMAL').sum(),
                'is_solved': 'sum',
                'progress_score': 'mean',
                'efficiency': 'mean'
            }).round(3)
            report['model_performance'] = model_performance.to_dict()
        
        return report
    
    def plot_results(self, results_df: pd.DataFrame, save_path: Optional[str] = None):
        """Draw a 2x2 overview of the evaluation results.

        Panels: correctness counts, progress-score histogram, efficiency
        against progress, and either success rate per model (when more than
        one model is present) or the distribution of parsed move counts.

        Args:
            results_df: One row per evaluated response.
            save_path: When given, the figure is also written to this path.
        """
        fig, axes = plt.subplots(2, 2, figsize=(15, 12))

        # Correctness distribution.  One bar per level in `correctness_order`:
        # the x positions must follow that list, not the number of levels that
        # happen to occur in the data, or bar() gets mismatched lengths.
        correctness_order = ['OPTIMAL', 'CORRECT_SUBOPTIMAL', 'PARTIAL_GOOD', 'PARTIAL_POOR', 'INVALID']
        correctness_counts = results_df['correctness'].value_counts()
        positions = range(len(correctness_order))
        axes[0, 0].bar(positions,
                      [correctness_counts.get(level, 0) for level in correctness_order])
        axes[0, 0].set_xticks(positions)
        axes[0, 0].set_xticklabels(correctness_order, rotation=45)
        axes[0, 0].set_title('Solution Correctness Distribution')
        axes[0, 0].set_ylabel('Count')

        # Progress score distribution
        axes[0, 1].hist(results_df['progress_score'], bins=20, alpha=0.7)
        axes[0, 1].set_title('Progress Score Distribution')
        axes[0, 1].set_xlabel('Progress Score')
        axes[0, 1].set_ylabel('Count')

        # Efficiency vs Progress
        axes[1, 0].scatter(results_df['progress_score'], results_df['efficiency'], alpha=0.6)
        axes[1, 0].set_xlabel('Progress Score')
        axes[1, 0].set_ylabel('Efficiency (moves/optimal)')
        axes[1, 0].set_title('Efficiency vs Progress')
        axes[1, 0].set_ylim(0, 5)  # Cap efficiency for readability

        # Model comparison (if multiple models)
        if 'model' in results_df.columns and results_df['model'].nunique() > 1:
            model_success = results_df.groupby('model')['is_solved'].mean()
            axes[1, 1].bar(model_success.index, model_success.values)
            axes[1, 1].set_title('Success Rate by Model')
            axes[1, 1].set_ylabel('Success Rate')
            axes[1, 1].tick_params(axis='x', rotation=45)
        else:
            # Show move count distribution instead
            valid_results = results_df[results_df['moves_found'] > 0]
            axes[1, 1].hist(valid_results['moves_found'], bins=15, alpha=0.7)
            axes[1, 1].set_title('Move Count Distribution')
            axes[1, 1].set_xlabel('Number of Moves')
            axes[1, 1].set_ylabel('Count')

        plt.tight_layout()

        if save_path:
            plt.savefig(save_path, dpi=300, bbox_inches='tight')
        plt.show()

# Example usage functions
def create_sample_responses():
    """Write a small two-model sample file and return its name.

    The file ``sample_responses.json`` is created in the current working
    directory and maps ``"puzzle1"`` to one response each for ``"gpt4"`` and
    ``"claude"``.
    """
    pad = " " * 12

    def as_reply(lines):
        # Leading newline, every line indented by `pad`, and a final
        # indent-only tail after the last newline.
        return "\n" + "".join(f"{pad}{line}\n" for line in lines) + pad

    gpt4_reply = as_reply([
        "Looking at this puzzle, I need to move the car C to the target position.",
        "",
        "<solution>",
        "Step 1: B1 [1,2] -> [1,1]",
        "Step 2: C [2,1] -> [1,1]",
        "Step 3: C [1,1] -> [1,2]",
        "</solution>",
    ])
    claude_reply = as_reply([
        "I'll solve this step by step:",
        "",
        "<solution>",
        "Step 1: B1 [1,2] -> [2,2]",
        "Step 2: C [2,1] -> [1,1] ",
        "Step 3: C [1,1] -> [1,2]",
        "Step 4: C [1,2] -> [1,3]",
        "</solution>",
    ])

    payload = {"puzzle1": {"gpt4": gpt4_reply, "claude": claude_reply}}

    with open('sample_responses.json', 'w') as out:
        json.dump(payload, out, indent=2)

    return 'sample_responses.json'

def main_evaluation_example():
    """Run the evaluator end to end on the generated sample responses.

    Builds an evaluator over the ``data`` folder, writes the sample response
    file, evaluates it, prints a results table and the report, saves the
    plots to ``evaluation_results.png`` and returns ``(results_df, report)``.
    """
    evaluator = RushHourEvaluator(dataset_path="data")

    # The sample file doubles as the input for the evaluation run.
    results_df = evaluator.evaluate_dataset(create_sample_responses())

    summary_columns = ['puzzle', 'model', 'correctness', 'progress_score', 'efficiency']
    print("Evaluation Results:")
    print(results_df[summary_columns])

    report = evaluator.generate_report(results_df)
    print("\nEvaluation Report:")
    for key in report:
        print(f"{key}: {report[key]}")

    evaluator.plot_results(results_df, save_path="evaluation_results.png")

    return results_df, report

# Run the end-to-end example only when executed as a script, not on import.
if __name__ == "__main__":
    results_df, report = main_evaluation_example()