In [16]:
import os
import json

# Experiment to analyse. The name is the last component of eval_output_dir and
# is also used below as the substring that selects the per-run sub-directories.
task_name = 'gpt-4.1_maxiter_100_N_v0.61.0-no-hint'
# task_name = 'o3-mini_maxiter_100_N_v0.61.0-no-hint'
# NOTE(review): absolute, user-specific path; it must be adapted before this
# cell can run on another machine.
eval_output_dir = f'/home/v-murongma/code/OpenHands_SWE-Bench-Optimized/evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/{task_name}'


def load_json(file_path):
    """Read a JSON file and return the decoded value.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to the
    # locale's preferred encoding, which can mis-decode non-ASCII JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load report.json of every entry in eval_output_dir whose name contains
# task_name. Entries are visited in sorted order so that the records built
# below do not depend on the arbitrary order returned by os.listdir().
# The loop variable is named run_dir (not dir) to avoid shadowing the builtin.
multirun_results = {}
for run_dir in sorted(os.listdir(eval_output_dir)):
    if task_name in run_dir:
        result_file = load_json(os.path.join(eval_output_dir, run_dir, 'report.json'))
        multirun_results[run_dir] = result_file

# Fail with an explicit message instead of an opaque IndexError /
# ZeroDivisionError further down when no run matched.
if not multirun_results:
    raise ValueError(f"No runs matching {task_name!r} found in {eval_output_dir}")

# Keep only the instances that appear in 'submitted_ids' of every run.
submitted_ids = set.intersection(
    *(set(result['submitted_ids']) for result in multirun_results.values())
)
print(f"Total unique submitted IDs: {len(submitted_ids)}")

# Per-instance outcome counts across runs. results_records additionally keeps
# one '<run_dir>_<instance_id>' entry per (run, instance) hit for each outcome.
outcome_keys = (
    ('resolved_ids', 'resolved_freq'),
    ('unresolved_ids', 'unresolved_freq'),
    ('empty_patch_ids', 'empty_patch_freq'),
    ('error_ids', 'error_freq'),
)
results_statistics = {}
results_records = {'resolved_ids': [], 'unresolved_ids': [], 'empty_patch_ids': [], 'error_ids': []}
for submitted_id in sorted(submitted_ids):
    stats = {"resolved_freq": 0, "unresolved_freq": 0, "empty_patch_freq": 0, "error_freq": 0}
    for run_dir, result in multirun_results.items():
        # The checks are independent: an id listed under several outcomes of
        # the same report is counted under each of them.
        for ids_key, freq_key in outcome_keys:
            if submitted_id in result[ids_key]:
                stats[freq_key] += 1
                results_records[ids_key].append('_'.join([run_dir, str(submitted_id)]))
    results_statistics[submitted_id] = stats

# error type statistics
# Scan the inference log of every (run, instance) pair for two marker strings.
# The runs are taken from multirun_results so that the counts cover exactly the
# runs that num_runs below is computed from.
error_type_statistics = {}
error_records = {'stuck_in_loop': [], 'max_iter_error': []}
for submitted_id in sorted(submitted_ids):
    stats = {'stuck_in_loop_freq': 0, 'max_iter_error_freq': 0}
    for run_dir in multirun_results:
        log_path = os.path.join(eval_output_dir, run_dir, 'infer_logs', f'instance_{submitted_id}.log')
        # Context manager so the log file handle is closed after reading.
        with open(log_path) as log_file:
            log_content = log_file.read()
        if 'maximum iteration' in log_content:
            stats['max_iter_error_freq'] += 1
            error_records['max_iter_error'].append('_'.join([run_dir, str(submitted_id)]))
        if 'AgentStuckInLoopError' in log_content:
            stats['stuck_in_loop_freq'] += 1
            error_records['stuck_in_loop'].append('_'.join([run_dir, str(submitted_id)]))
    error_type_statistics[submitted_id] = stats

# normalize frequencies
# Turn the raw counts into fractions of the number of loaded runs.
num_runs = len(multirun_results)
for submitted_id, stats in results_statistics.items():
    for key in stats:
        stats[key] /= num_runs
for submitted_id, stats in error_type_statistics.items():
    for key in stats:
        stats[key] /= num_runs

# show statistics
print("Results Statistics:")
for submitted_id, stats in results_statistics.items():
    print(f"Submitted ID: {submitted_id},\n Stats: {stats}")

print("Error Type Statistics:")
for submitted_id, stats in error_type_statistics.items():
    print(f"Submitted ID: {submitted_id},\n Error Type Stats: {stats}")

Total unique submitted IDs: 20
Results Statistics:
Submitted ID: django__django-14434,
 Stats: {'resolved_freq': 0.6, 'unresolved_freq': 0.2, 'empty_patch_freq': 0.0, 'error_freq': 0.2}
Submitted ID: sympy__sympy-24213,
 Stats: {'resolved_freq': 0.8, 'unresolved_freq': 0.2, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: django__django-12663,
 Stats: {'resolved_freq': 0.2, 'unresolved_freq': 0.8, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: sphinx-doc__sphinx-7757,
 Stats: {'resolved_freq': 0.0, 'unresolved_freq': 0.6, 'empty_patch_freq': 0.0, 'error_freq': 0.4}
Submitted ID: sphinx-doc__sphinx-8551,
 Stats: {'resolved_freq': 0.0, 'unresolved_freq': 0.6, 'empty_patch_freq': 0.0, 'error_freq': 0.4}
Submitted ID: django__django-13279,
 Stats: {'resolved_freq': 0.6, 'unresolved_freq': 0.4, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: scikit-learn__scikit-learn-14983,
 Stats: {'resolved_freq': 0.4, 'unresolved_freq': 0.6, 'empty_patch_freq': 0.0, 'error_

In [17]:
# Calculate pass@1 and pass@k metrics
def calculate_pass_at_k(results_statistics, k, num_runs=None):
    """
    Calculate pass@k over a collection of instances.

    Args:
        results_statistics: mapping of instance id -> stats dict. Only the
            'resolved_freq' entry (fraction of runs that resolved the
            instance) is read.
        k: number of attempts. It is only taken into account when
            ``num_runs`` is given.
        num_runs: number of runs each 'resolved_freq' was normalised by.
            When omitted (default) the legacy behaviour is kept: an instance
            passes if it was resolved in at least one run, which equals
            pass@k only for k == number of runs. When given, the unbiased
            estimator 1 - C(n - c, k) / C(n, k) (n runs, c successes) is
            averaged over the instances.

    Returns:
        Tuple ``(pass_at_k, solved_at_least_once, total_instances)``.
        ``pass_at_k`` is 0 when there are no instances.

    Raises:
        ValueError: if ``num_runs`` is given and ``k`` is outside
            ``1..num_runs``.
    """
    if num_runs is not None and not 1 <= k <= num_runs:
        raise ValueError(f"k must be between 1 and num_runs ({num_runs}), got {k}")

    total_instances = len(results_statistics)
    solved_at_least_once = 0
    estimate_sum = 0.0

    for stats in results_statistics.values():
        resolved_freq = stats['resolved_freq']
        # If resolved at least once across the runs
        if resolved_freq > 0:
            solved_at_least_once += 1

        if num_runs is None:
            continue

        # round(), not int(): the frequency is a float quotient and
        # multiplying back can land just below the original integer count.
        successes = round(resolved_freq * num_runs)
        failures = num_runs - successes
        if failures < k:
            # Fewer failing runs than attempts: every k-subset has a success.
            estimate_sum += 1.0
        else:
            # C(n - c, k) / C(n, k) written as a product to avoid large
            # intermediate binomial coefficients.
            miss_probability = 1.0
            for i in range(failures + 1, num_runs + 1):
                miss_probability *= 1.0 - k / i
            estimate_sum += 1.0 - miss_probability

    if total_instances == 0:
        pass_at_k = 0
    elif num_runs is None:
        pass_at_k = solved_at_least_once / total_instances
    else:
        pass_at_k = estimate_sum / total_instances
    return pass_at_k, solved_at_least_once, total_instances

# Calculate pass@1 (single run performance - average across runs):
# the mean of the per-instance resolve frequencies.
pass_at_1_sum = sum(stats['resolved_freq'] for stats in results_statistics.values())
pass_at_1 = pass_at_1_sum / len(results_statistics) if len(results_statistics) > 0 else 0

# Calculate pass@k (at least one success across all k runs)
pass_at_k, solved_count, total_count = calculate_pass_at_k(results_statistics, num_runs)

print(f"\n{'='*50}")
print(f"Pass@1 (average success rate per run): {pass_at_1:.2%}")
print(f"Pass@{num_runs} (solved at least once in {num_runs} runs): {pass_at_k:.2%}")
print(f"Instances solved at least once: {solved_count}/{total_count}")
print(f"{'='*50}\n")

# Additional analysis: distribution of solve frequencies
# (number of instances per "resolved in N runs" bucket).
solve_frequency_dist = {}
for stats in results_statistics.values():
    # resolved_freq is a float quotient; multiplying it back by num_runs can
    # give a value just below the integer count (e.g. 1/49 * 49 evaluates to
    # 0.9999999999999999), which int() would truncate to the wrong bucket.
    freq = round(stats['resolved_freq'] * num_runs)
    solve_frequency_dist[freq] = solve_frequency_dist.get(freq, 0) + 1

print("Distribution of solve frequencies:")
for freq in sorted(solve_frequency_dist.keys()):
    print(f"  Solved {freq}/{num_runs} times: {solve_frequency_dist[freq]} instances")


Pass@1 (average success rate per run): 39.00%
Pass@5 (solved at least once in 5 runs): 60.00%
Instances solved at least once: 12/20

Distribution of solve frequencies:
  Solved 0/5 times: 8 instances
  Solved 1/5 times: 2 instances
  Solved 2/5 times: 1 instances
  Solved 3/5 times: 4 instances
  Solved 4/5 times: 2 instances
  Solved 5/5 times: 3 instances


In [None]:
# Resolved rate over all (run, instance) records that ended up as resolved or
# unresolved; records under 'empty_patch_ids' / 'error_ids' are not part of
# the denominator.
resolved_count = len(results_records['resolved_ids'])
judged_count = resolved_count + len(results_records['unresolved_ids'])
# Report 0 instead of raising ZeroDivisionError when there are no such records.
print('resolved rate:', resolved_count / judged_count if judged_count > 0 else 0)
results_records





resolved rate: 0.42857142857142855


{'resolved_ids': ['gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-14434',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_2_django__django-14434',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_django__django-14434',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_2_sympy__sympy-24213',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_3_sympy__sympy-24213',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_sympy__sympy-24213',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_1_sympy__sympy-24213',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-12663',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_3_django__django-13279',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_django__django-13279',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_1_django__django-13279',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_scikit-learn__scikit-learn-14983',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_scikit-learn__scikit-learn-14983',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_2_django__dja

In [4]:
# Display the '<run_dir>_<instance_id>' entries collected per error type
# ('stuck_in_loop', 'max_iter_error') by the log scan in the aggregation cell.
error_records

{'stuck_in_loop': ['gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_2_django__django-12663',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_1_django__django-12663',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-11999',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_sympy__sympy-24213',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_3_scikit-learn__scikit-learn-25232',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-15503',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_1_django__django-15503',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_2_scikit-learn__scikit-learn-14983',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_sphinx-doc__sphinx-8551',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_3_django__django-11095',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_django__django-11095',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_4_pydata__xarray-6992',
  'gpt-4.1_maxiter_100_N_v0.61.0-no-hint-run_5_pydata__xarray-6992'],
 'max_iter_error': ['gpt-4.1_maxiter_100_N_v0.61.

In [14]:
import os
import json

# Experiment to analyse. The name is the last component of eval_output_dir and
# is also used below as the substring that selects the per-run sub-directories.
# task_name = 'gpt-4.1_maxiter_100_N_v0.61.0-no-hint'
task_name = 'o3-mini_maxiter_100_N_v0.61.0-no-hint'
# NOTE(review): absolute, user-specific path; it must be adapted before this
# cell can run on another machine.
eval_output_dir = f'/home/v-murongma/code/OpenHands_SWE-Bench-Optimized/evaluation/evaluation_outputs/outputs/princeton-nlp__SWE-bench_Verified-test/CodeActAgent/{task_name}'


def load_json(file_path):
    """Read a JSON file and return the decoded value.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file content.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    # Decode explicitly as UTF-8: without ``encoding`` open() falls back to the
    # locale's preferred encoding, which can mis-decode non-ASCII JSON text.
    with open(file_path, 'r', encoding='utf-8') as f:
        return json.load(f)

# Load report.json of every entry in eval_output_dir whose name contains
# task_name. Entries are visited in sorted order so that the records built
# below do not depend on the arbitrary order returned by os.listdir().
# The loop variable is named run_dir (not dir) to avoid shadowing the builtin.
multirun_results = {}
for run_dir in sorted(os.listdir(eval_output_dir)):
    if task_name in run_dir:
        result_file = load_json(os.path.join(eval_output_dir, run_dir, 'report.json'))
        multirun_results[run_dir] = result_file

# Fail with an explicit message instead of an opaque IndexError /
# ZeroDivisionError further down when no run matched.
if not multirun_results:
    raise ValueError(f"No runs matching {task_name!r} found in {eval_output_dir}")

# Keep only the instances that appear in 'submitted_ids' of every run.
submitted_ids = set.intersection(
    *(set(result['submitted_ids']) for result in multirun_results.values())
)
print(f"Total unique submitted IDs: {len(submitted_ids)}")

# Per-instance outcome counts across runs. results_records additionally keeps
# one '<run_dir>_<instance_id>' entry per (run, instance) hit for each outcome.
outcome_keys = (
    ('resolved_ids', 'resolved_freq'),
    ('unresolved_ids', 'unresolved_freq'),
    ('empty_patch_ids', 'empty_patch_freq'),
    ('error_ids', 'error_freq'),
)
results_statistics = {}
results_records = {'resolved_ids': [], 'unresolved_ids': [], 'empty_patch_ids': [], 'error_ids': []}
for submitted_id in sorted(submitted_ids):
    stats = {"resolved_freq": 0, "unresolved_freq": 0, "empty_patch_freq": 0, "error_freq": 0}
    for run_dir, result in multirun_results.items():
        # The checks are independent: an id listed under several outcomes of
        # the same report is counted under each of them.
        for ids_key, freq_key in outcome_keys:
            if submitted_id in result[ids_key]:
                stats[freq_key] += 1
                results_records[ids_key].append('_'.join([run_dir, str(submitted_id)]))
    results_statistics[submitted_id] = stats

# error type statistics
# Scan the inference log of every (run, instance) pair for two marker strings.
# The runs are taken from multirun_results so that the counts cover exactly the
# runs that num_runs below is computed from.
error_type_statistics = {}
error_records = {'stuck_in_loop': [], 'max_iter_error': []}
for submitted_id in sorted(submitted_ids):
    stats = {'stuck_in_loop_freq': 0, 'max_iter_error_freq': 0}
    for run_dir in multirun_results:
        log_path = os.path.join(eval_output_dir, run_dir, 'infer_logs', f'instance_{submitted_id}.log')
        # Context manager so the log file handle is closed after reading.
        with open(log_path) as log_file:
            log_content = log_file.read()
        if 'maximum iteration' in log_content:
            stats['max_iter_error_freq'] += 1
            error_records['max_iter_error'].append('_'.join([run_dir, str(submitted_id)]))
        if 'AgentStuckInLoopError' in log_content:
            stats['stuck_in_loop_freq'] += 1
            error_records['stuck_in_loop'].append('_'.join([run_dir, str(submitted_id)]))
    error_type_statistics[submitted_id] = stats

# normalize frequencies
# Turn the raw counts into fractions of the number of loaded runs.
num_runs = len(multirun_results)
for submitted_id, stats in results_statistics.items():
    for key in stats:
        stats[key] /= num_runs
for submitted_id, stats in error_type_statistics.items():
    for key in stats:
        stats[key] /= num_runs

# show statistics
print("Results Statistics:")
for submitted_id, stats in results_statistics.items():
    print(f"Submitted ID: {submitted_id},\n Stats: {stats}")

print("Error Type Statistics:")
for submitted_id, stats in error_type_statistics.items():
    print(f"Submitted ID: {submitted_id},\n Error Type Stats: {stats}")

Total unique submitted IDs: 20
Results Statistics:
Submitted ID: django__django-14434,
 Stats: {'resolved_freq': 0.8, 'unresolved_freq': 0.2, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: sympy__sympy-24213,
 Stats: {'resolved_freq': 0.8, 'unresolved_freq': 0.2, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: django__django-12663,
 Stats: {'resolved_freq': 0.2, 'unresolved_freq': 0.8, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: sphinx-doc__sphinx-7757,
 Stats: {'resolved_freq': 0.0, 'unresolved_freq': 1.0, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: sphinx-doc__sphinx-8551,
 Stats: {'resolved_freq': 0.0, 'unresolved_freq': 1.0, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: django__django-13279,
 Stats: {'resolved_freq': 0.4, 'unresolved_freq': 0.6, 'empty_patch_freq': 0.0, 'error_freq': 0.0}
Submitted ID: scikit-learn__scikit-learn-14983,
 Stats: {'resolved_freq': 0.0, 'unresolved_freq': 1.0, 'empty_patch_freq': 0.0, 'error_

In [15]:
# Calculate pass@1 and pass@k metrics
def calculate_pass_at_k(results_statistics, k, num_runs=None):
    """
    Calculate pass@k over a collection of instances.

    Args:
        results_statistics: mapping of instance id -> stats dict. Only the
            'resolved_freq' entry (fraction of runs that resolved the
            instance) is read.
        k: number of attempts. It is only taken into account when
            ``num_runs`` is given.
        num_runs: number of runs each 'resolved_freq' was normalised by.
            When omitted (default) the legacy behaviour is kept: an instance
            passes if it was resolved in at least one run, which equals
            pass@k only for k == number of runs. When given, the unbiased
            estimator 1 - C(n - c, k) / C(n, k) (n runs, c successes) is
            averaged over the instances.

    Returns:
        Tuple ``(pass_at_k, solved_at_least_once, total_instances)``.
        ``pass_at_k`` is 0 when there are no instances.

    Raises:
        ValueError: if ``num_runs`` is given and ``k`` is outside
            ``1..num_runs``.
    """
    if num_runs is not None and not 1 <= k <= num_runs:
        raise ValueError(f"k must be between 1 and num_runs ({num_runs}), got {k}")

    total_instances = len(results_statistics)
    solved_at_least_once = 0
    estimate_sum = 0.0

    for stats in results_statistics.values():
        resolved_freq = stats['resolved_freq']
        # If resolved at least once across the runs
        if resolved_freq > 0:
            solved_at_least_once += 1

        if num_runs is None:
            continue

        # round(), not int(): the frequency is a float quotient and
        # multiplying back can land just below the original integer count.
        successes = round(resolved_freq * num_runs)
        failures = num_runs - successes
        if failures < k:
            # Fewer failing runs than attempts: every k-subset has a success.
            estimate_sum += 1.0
        else:
            # C(n - c, k) / C(n, k) written as a product to avoid large
            # intermediate binomial coefficients.
            miss_probability = 1.0
            for i in range(failures + 1, num_runs + 1):
                miss_probability *= 1.0 - k / i
            estimate_sum += 1.0 - miss_probability

    if total_instances == 0:
        pass_at_k = 0
    elif num_runs is None:
        pass_at_k = solved_at_least_once / total_instances
    else:
        pass_at_k = estimate_sum / total_instances
    return pass_at_k, solved_at_least_once, total_instances

# Calculate pass@1 (single run performance - average across runs):
# the mean of the per-instance resolve frequencies.
pass_at_1_sum = sum(stats['resolved_freq'] for stats in results_statistics.values())
pass_at_1 = pass_at_1_sum / len(results_statistics) if len(results_statistics) > 0 else 0

# Calculate pass@k (at least one success across all k runs)
pass_at_k, solved_count, total_count = calculate_pass_at_k(results_statistics, num_runs)

print(f"\n{'='*50}")
print(f"Pass@1 (average success rate per run): {pass_at_1:.2%}")
print(f"Pass@{num_runs} (solved at least once in {num_runs} runs): {pass_at_k:.2%}")
print(f"Instances solved at least once: {solved_count}/{total_count}")
print(f"{'='*50}\n")

# Additional analysis: distribution of solve frequencies
# (number of instances per "resolved in N runs" bucket).
solve_frequency_dist = {}
for stats in results_statistics.values():
    # resolved_freq is a float quotient; multiplying it back by num_runs can
    # give a value just below the integer count (e.g. 1/49 * 49 evaluates to
    # 0.9999999999999999), which int() would truncate to the wrong bucket.
    freq = round(stats['resolved_freq'] * num_runs)
    solve_frequency_dist[freq] = solve_frequency_dist.get(freq, 0) + 1

print("Distribution of solve frequencies:")
for freq in sorted(solve_frequency_dist.keys()):
    print(f"  Solved {freq}/{num_runs} times: {solve_frequency_dist[freq]} instances")


Pass@1 (average success rate per run): 30.00%
Pass@5 (solved at least once in 5 runs): 55.00%
Instances solved at least once: 11/20

Distribution of solve frequencies:
  Solved 0/5 times: 9 instances
  Solved 1/5 times: 4 instances
  Solved 2/5 times: 1 instances
  Solved 3/5 times: 2 instances
  Solved 4/5 times: 2 instances
  Solved 5/5 times: 2 instances


In [7]:
# Resolved rate over all (run, instance) records that ended up as resolved or
# unresolved; records under 'empty_patch_ids' / 'error_ids' are not part of
# the denominator.
resolved_count = len(results_records['resolved_ids'])
judged_count = resolved_count + len(results_records['unresolved_ids'])
# Report 0 instead of raising ZeroDivisionError when there are no such records.
print('resolved rate:', resolved_count / judged_count if judged_count > 0 else 0)
results_records

resolved rate: 0.30927835051546393


{'resolved_ids': ['o3-mini_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-14434',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_3_django__django-14434',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_5_django__django-14434',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_1_django__django-14434',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_4_sympy__sympy-24213',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_3_sympy__sympy-24213',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_5_sympy__sympy-24213',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_1_sympy__sympy-24213',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_2_django__django-12663',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_3_django__django-13279',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_1_django__django-13279',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_2_django__django-12155',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_4_django__django-12155',
  'o3-mini_maxiter_100_N_v0.61.0-no-hint-run_3_django__django-12155',
  'o3-mini_m