This notebook is for comparing our experiments with the baseline and understanding how and why our results deviate from the baseline

In [11]:
import csv
import json
from collections import defaultdict
from typing import Dict, Set, Tuple
import sys
from io import StringIO
import traceback

def load_expmem_results(csv_path: str) -> Dict[str, Dict]:
    """Load results and full code from Expmem (CSV) file."""
    results = {}
    with open(csv_path, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            results[row['ID']] = {
                'result': row['Result'].lower() == 'true',
                'full_code': row['Full_Code'],
                'code': row.get('Code', '')  # Get the main implementation if available
            }
    return results

def load_mapcoder_results(jsonl_path: str) -> Dict[str, Dict]:
    """Load results and code components from MapCoder (JSONL) file."""
    results = {}
    with open(jsonl_path, 'r') as f:
        for line in f:
            data = json.loads(line)
            # Extract one sample test case if available
            sample_test = None
            if data.get('sample_io'):
                sample_test = data['sample_io'][0]
            
            results[data['task_id']] = {
                'result': data['is_solved'],
                'source_code': data['source_codes'][0] if data['source_codes'] else None,
                'test': data['test'],
                'entry_point': data['entry_point'],
                'sample_test': sample_test
            }
    return results

def construct_full_mapcoder_code(data: Dict) -> str:
    """Construct full runnable code from MapCoder components."""
    code_parts = []
    
    # Add source code
    if data['source_code']:
        code_parts.append(data['source_code'])
    
    # Add test function
    if data['test']:
        code_parts.append("\n" + data['test'])
    
    # Add the check function call
    code_parts.append(f"\ncheck({data['entry_point']})")
    
    return "\n".join(code_parts)

def execute_code_safely(code: str, source: str) -> Dict:
    """Execute code in a safe environment and capture output/errors."""
    result = {
        'success': False,
        'output': '',
        'error': '',
        'exception_type': None
    }
    
    # Create string buffer to capture stdout
    old_stdout = sys.stdout
    sys.stdout = StringIO()
    
    try:
        # Create a new namespace to avoid global namespace pollution
        namespace = {}
        exec(code, namespace)
        result['success'] = True
        
    except Exception as e:
        result['error'] = traceback.format_exc()
        result['exception_type'] = type(e).__name__
        
    finally:
        # Capture output and restore stdout
        result['output'] = sys.stdout.getvalue()
        sys.stdout = old_stdout
        
    return result

def compare_and_execute_divergent(expmem_results: Dict[str, Dict], 
                                mapcoder_results: Dict[str, Dict]) -> Dict[str, Dict]:
    """
    Compare results and execute code for divergent cases.
    Returns detailed execution results for analysis.
    """
    execution_results = {}
    
    # Find problems with divergent results
    common_ids = set(expmem_results.keys()) & set(mapcoder_results.keys())
    for problem_id in common_ids:
        expmem_success = expmem_results[problem_id]['result']
        mapcoder_success = mapcoder_results[problem_id]['result']
        
        if expmem_success != mapcoder_success:
            execution_results[problem_id] = {
                'expmem_claimed': expmem_success,
                'mapcoder_claimed': mapcoder_success,
                'expmem_code': expmem_results[problem_id]['code'],
                'mapcoder_code': mapcoder_results[problem_id]['source_code'],
                'sample_test': mapcoder_results[problem_id]['sample_test'],
                'expmem_execution': None,
                'mapcoder_execution': None
            }
            
            # Execute Expmem code
            expmem_code = expmem_results[problem_id]['full_code']
            expmem_execution = execute_code_safely(expmem_code, 'Expmem')
            execution_results[problem_id]['expmem_execution'] = expmem_execution
            
            # Execute MapCoder code
            mapcoder_code = construct_full_mapcoder_code(mapcoder_results[problem_id])
            mapcoder_execution = execute_code_safely(mapcoder_code, 'MapCoder')
            execution_results[problem_id]['mapcoder_execution'] = mapcoder_execution
    
    return execution_results

def generate_execution_report(execution_results: Dict[str, Dict]) -> str:
    """Generate a detailed report of execution results."""
    report = []
    report.append("=" * 100)
    report.append("DIVERGENT RESULTS EXECUTION REPORT")
    report.append("=" * 100)
    
    for problem_id, results in sorted(execution_results.items()):
        report.append(f"\nProblem ID: {problem_id}")
        report.append("=" * 100)
        
        # Results summary
        report.append("\nRESULTS SUMMARY")
        report.append("-" * 50)
        report.append(f"Expmem claimed: {results['expmem_claimed']}")
        report.append(f"MapCoder claimed: {results['mapcoder_claimed']}")
        
        # Source code comparison
        report.append("\nSOURCE CODE COMPARISON")
        report.append("-" * 50)
        report.append("\nExpmem Implementation:")
        report.append("```python")
        report.append(results['expmem_code'].strip())
        report.append("```")
        
        report.append("\nMapCoder Implementation:")
        report.append("```python")
        report.append(results['mapcoder_code'].strip())
        report.append("```")
        
        # Sample test case
        report.append("\nSAMPLE TEST CASE")
        report.append("-" * 50)
        if results['sample_test']:
            report.append(results['sample_test'])
        else:
            report.append("No sample test case available")
        
        # Execution results
        report.append("\nEXECUTION RESULTS")
        report.append("-" * 50)
        
        # Expmem execution results
        expmem_exec = results['expmem_execution']
        report.append("\nExpmem Execution:")
        report.append(f"Success: {expmem_exec['success']}")
        if expmem_exec['output']:
            report.append("Output:")
            report.append(expmem_exec['output'].strip())
        if expmem_exec['error']:
            report.append("Error:")
            report.append(expmem_exec['error'].strip())
            
        # MapCoder execution results
        mapcoder_exec = results['mapcoder_execution']
        report.append("\nMapCoder Execution:")
        report.append(f"Success: {mapcoder_exec['success']}")
        if mapcoder_exec['output']:
            report.append("Output:")
            report.append(mapcoder_exec['output'].strip())
        if mapcoder_exec['error']:
            report.append("Error:")
            report.append(mapcoder_exec['error'].strip())
            
        report.append("\n" + "=" * 100)
    
    return "\n".join(report)

def main(csv_path: str, jsonl_path: str):
    """Main function to run the comparison and execution analysis."""
    # Load results
    expmem_results = load_expmem_results(csv_path)
    mapcoder_results = load_mapcoder_results(jsonl_path)
    
    # Compare and execute divergent cases
    execution_results = compare_and_execute_divergent(expmem_results, mapcoder_results)
    
    # Generate report
    report = generate_execution_report(execution_results)
    
    # Save report
    with open("execution_analysis_report.txt", "w") as f:
        f.write(report)
    
    print(report)

In [13]:
csv_path = "/Users/harshavardhank/Desktop/Code/Thesis/Code/expmem/humanEval_memory_naive2.csv"
jsonl_path = "/Users/harshavardhank/Desktop/Code/Thesis/Code/MapCoder/outputs/GPT4oMini-Direct-HumanEval-Python3-0-1.jsonl"

main(csv_path, jsonl_path)

KeyError: 'Full_Code'