In [22]:
import pandas as pd
import os
import json
import glob

In [23]:
def load_status_files(base_dir="../data/runs_stored/", pattern="*o1*"):
    """
    Load current_status_of_all_instances.json files from directories matching the pattern.
    
    Args:
        base_dir: Base directory to search in
        pattern: Pattern to match directory names
    
    Returns:
        Dictionary with directory names as keys and loaded JSON data as values
    """
    results = {}
    
    # Find all directories matching the pattern
    matching_dirs = glob.glob(f"{base_dir}{pattern}")
    
    for dir_path in matching_dirs:
        dir_name = os.path.basename(dir_path)
        status_file = os.path.join(dir_path, "current_status_of_all_instances.json")
        
        if os.path.exists(status_file):
            with open(status_file, 'r') as f:
                results[dir_name] = json.load(f)
    
    # Also load the specific raw_acr_test_split_swe_verified directory
    specific_dir = os.path.join(base_dir, "raw_acr_test_split_swe_verified")
    specific_file = os.path.join(specific_dir, "current_status_of_all_instances.json")
    
    if os.path.exists(specific_file):
        with open(specific_file, 'r') as f:
            results["raw_acr_test_split_swe_verified"] = json.load(f)
    
    return results

# Load all status files with "o1" in the directory name
status_data = load_status_files()

# Print the loaded directories
print(f"Loaded status data from {len(status_data)} directories:")
for dirname in status_data.keys():
    print(f"  - {dirname}")


Loaded status data from 6 directories:
  - benchmark_on_test_ALL_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data
  - benchmark_on_test_ALL_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data
  - benchmark_on_test_GENERAL_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data
  - benchmark_on_test_NO_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data
  - benchmark_on_test_REPO_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data
  - raw_acr_test_split_swe_verified


In [24]:
# Create a base dataframe with instance_ids
eval_data_df = pd.DataFrame(status_data['benchmark_on_test_ALL_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data'].items(), 
                      columns=['instance_id', 'ALL_rules_INCL_exemplars'])

# Join all other dictionaries in status_data
for config_name, config_data in status_data.items():
    # Skip the one we already used as base
    if config_name == 'benchmark_on_test_ALL_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data':
        continue
    
    # Skip if empty
    if not config_data:
        continue
    
    # Create a simplified column name from the config_name
    column_name = config_name.replace('benchmark_on_test_', '').replace('_from_o1_with_gpt_4o_and_combined_lite_and_verified_data', '')
    
    # Create a temporary dataframe for this config
    temp_df = pd.DataFrame(config_data.items(), columns=['instance_id', column_name])
    
    # Merge with the base dataframe
    eval_data_df = eval_data_df.merge(temp_df, on='instance_id', how='outer')

In [25]:
eval_data_df['repository'] = eval_data_df['instance_id'].str.split('__').str[0]


In [26]:
eval_data_df.head()

Unnamed: 0,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
0,django__django-10880,True,True,True,True,True,True,django
1,django__django-10999,False,False,False,False,False,False,django
2,django__django-11211,False,False,False,True,False,False,django
3,django__django-11451,True,True,True,True,True,True,django
4,django__django-11477,False,False,False,False,False,False,django


In [27]:
# Calculate per-repository success rates
repo_success_rates = eval_data_df.groupby('repository').agg({
    'ALL_rules_INCL_exemplars': ['mean', 'sum', 'size'], 
    'ALL_rules_NO_exemplars': ['mean', 'sum', 'size'], 
    'GENERAL_rules_NO_exemplars': ['mean', 'sum', 'size'], 
    'REPO_rules_NO_exemplars': ['mean', 'sum', 'size'], 
    'NO_rules_INCL_exemplars': ['mean', 'sum', 'size'], 
    'raw_acr_test_split_swe_verified': ['mean', 'sum', 'size']
})

# Display per-repository success rates
print("Success rates by repository:")
display(repo_success_rates)

# Calculate overall success rates across all repositories
overall_success_rates = eval_data_df.agg({
    'ALL_rules_INCL_exemplars': ['mean', 'sum', 'count'],
    'ALL_rules_NO_exemplars': ['mean', 'sum', 'count'],
    'GENERAL_rules_NO_exemplars': ['mean', 'sum', 'count'],
    'REPO_rules_NO_exemplars': ['mean', 'sum', 'count'],
    'NO_rules_INCL_exemplars': ['mean', 'sum', 'count'],
    'raw_acr_test_split_swe_verified': ['mean', 'sum', 'count']
})

# Return the repository success rates for display in the notebook
overall_success_rates

Success rates by repository:


Unnamed: 0_level_0,ALL_rules_INCL_exemplars,ALL_rules_INCL_exemplars,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,ALL_rules_NO_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,REPO_rules_NO_exemplars,REPO_rules_NO_exemplars,REPO_rules_NO_exemplars,NO_rules_INCL_exemplars,NO_rules_INCL_exemplars,NO_rules_INCL_exemplars,raw_acr_test_split_swe_verified,raw_acr_test_split_swe_verified,raw_acr_test_split_swe_verified
Unnamed: 0_level_1,mean,sum,size,mean,sum,size,mean,sum,size,mean,sum,size,mean,sum,size,mean,sum,size
repository,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2
django,0.5,11,22,0.363636,8,22,0.545455,12,22,0.409091,9,22,0.5,11,22,0.5,11,22
matplotlib,0.5,2,4,0.25,1,4,0.0,0,4,0.0,0,4,0.25,1,4,0.25,1,4
mwaskom,0.0,0,1,0.0,0,1,0.0,0,1,0.0,0,1,0.0,0,1,0.0,0,1
pytest-dev,0.333333,1,3,0.333333,1,3,0.333333,1,3,0.333333,1,3,0.666667,2,3,0.333333,1,3
scikit-learn,1.0,1,1,1.0,1,1,1.0,1,1,1.0,1,1,1.0,1,1,1.0,1,1
sphinx-doc,0.0,0,5,0.0,0,5,0.0,0,5,0.0,0,5,0.0,0,5,0.0,0,5
sympy,0.333333,3,9,0.333333,3,9,0.222222,2,9,0.333333,3,9,0.333333,3,9,0.555556,5,9


Unnamed: 0,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,REPO_rules_NO_exemplars,NO_rules_INCL_exemplars,raw_acr_test_split_swe_verified
mean,0.4,0.311111,0.355556,0.311111,0.4,0.422222
sum,18.0,14.0,16.0,14.0,18.0,19.0
count,45.0,45.0,45.0,45.0,45.0,45.0


In [28]:
# Find instances solved by any configuration but not by raw_acr_test_split_swe_verified
solved_by_any = eval_data_df[
    (eval_data_df['ALL_rules_INCL_exemplars'] == True) | 
    (eval_data_df['ALL_rules_NO_exemplars'] == True) | 
    (eval_data_df['GENERAL_rules_NO_exemplars'] == True) | 
    (eval_data_df['REPO_rules_NO_exemplars'] == True) | 
    (eval_data_df['NO_rules_INCL_exemplars'] == True)
]

# Filter to only those not solved by raw_acr_test_split_swe_verified
solved_by_any_except_raw = solved_by_any[solved_by_any['raw_acr_test_split_swe_verified'] == False]

print(f"Found {len(solved_by_any_except_raw)} instances solved by at least one configuration but not by raw_acr_test_split_swe_verified:")
display(solved_by_any_except_raw)

# Show which configurations solved each instance
config_columns = [
    'ALL_rules_INCL_exemplars', 
    'ALL_rules_NO_exemplars', 
    'GENERAL_rules_NO_exemplars', 
    'REPO_rules_NO_exemplars', 
    'NO_rules_INCL_exemplars'
]

# Count how many configurations solved each instance
solved_by_any_except_raw['num_configs_solved'] = solved_by_any_except_raw[config_columns].sum(axis=1)

# Sort by number of configurations that solved it (most interesting cases first)
solved_by_any_except_raw_sorted = solved_by_any_except_raw.sort_values('num_configs_solved', ascending=False)

print("\nSorted by number of configurations that solved each instance:")
display(solved_by_any_except_raw_sorted[['instance_id', 'repository'] + config_columns + ['num_configs_solved']])


Found 6 instances solved by at least one configuration but not by raw_acr_test_split_swe_verified:


Unnamed: 0,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
2,django__django-11211,False,False,False,True,False,False,django
6,django__django-13279,False,False,False,True,False,False,django
9,django__django-13807,False,False,True,False,False,False,django
21,django__django-16938,True,False,False,False,False,False,django
23,matplotlib__matplotlib-14623,True,True,False,False,False,False,matplotlib
29,pytest-dev__pytest-7490,False,False,False,True,False,False,pytest-dev



Sorted by number of configurations that solved each instance:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  solved_by_any_except_raw['num_configs_solved'] = solved_by_any_except_raw[config_columns].sum(axis=1)


Unnamed: 0,instance_id,repository,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,REPO_rules_NO_exemplars,NO_rules_INCL_exemplars,num_configs_solved
23,matplotlib__matplotlib-14623,matplotlib,True,True,False,False,False,2
2,django__django-11211,django,False,False,False,False,True,1
6,django__django-13279,django,False,False,False,False,True,1
9,django__django-13807,django,False,False,True,False,False,1
21,django__django-16938,django,True,False,False,False,False,1
29,pytest-dev__pytest-7490,pytest-dev,False,False,False,False,True,1


In [29]:
# Find instances solved by raw_acr_test_split_swe_verified but not by any other configuration
solved_by_raw = eval_data_df[eval_data_df['raw_acr_test_split_swe_verified'] == True]

# Filter to only those not solved by any other configuration
solved_by_raw_only = solved_by_raw[
    (solved_by_raw['ALL_rules_INCL_exemplars'] == False) & 
    (solved_by_raw['ALL_rules_NO_exemplars'] == False) & 
    (solved_by_raw['GENERAL_rules_NO_exemplars'] == False) & 
    (solved_by_raw['REPO_rules_NO_exemplars'] == False) & 
    (solved_by_raw['NO_rules_INCL_exemplars'] == False)
]

print(f"Found {len(solved_by_raw_only)} instances solved by raw_acr_test_split_swe_verified but not by any other configuration:")
display(solved_by_raw_only)

# Show details for each instance
config_columns = [
    'ALL_rules_INCL_exemplars', 
    'ALL_rules_NO_exemplars', 
    'GENERAL_rules_NO_exemplars', 
    'REPO_rules_NO_exemplars', 
    'NO_rules_INCL_exemplars',
    'raw_acr_test_split_swe_verified'
]

# Display the results with repository information
print("\nDetails of instances solved only by raw_acr_test_split_swe_verified:")
display(solved_by_raw_only[['instance_id', 'repository'] + config_columns])


Found 2 instances solved by raw_acr_test_split_swe_verified but not by any other configuration:


Unnamed: 0,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
42,sympy__sympy-20801,False,False,False,False,False,True,sympy
44,sympy__sympy-24661,False,False,False,False,False,True,sympy



Details of instances solved only by raw_acr_test_split_swe_verified:


Unnamed: 0,instance_id,repository,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,REPO_rules_NO_exemplars,NO_rules_INCL_exemplars,raw_acr_test_split_swe_verified
42,sympy__sympy-20801,sympy,False,False,False,False,False,True
44,sympy__sympy-24661,sympy,False,False,False,False,False,True


In [30]:
# Filter Django rows where the boolean values differ across configurations
django_df = eval_data_df[eval_data_df['repository'] == 'django']
django_with_differences = django_df[
    (django_df['ALL_rules_NO_exemplars'] != django_df['GENERAL_rules_NO_exemplars']) |
    (django_df['ALL_rules_NO_exemplars'] != django_df['REPO_rules_NO_exemplars']) |
    (django_df['ALL_rules_NO_exemplars'] != django_df['raw_acr_test_split_swe_verified']) |
    (django_df['GENERAL_rules_NO_exemplars'] != django_df['REPO_rules_NO_exemplars']) |
    (django_df['GENERAL_rules_NO_exemplars'] != django_df['raw_acr_test_split_swe_verified']) |
    (django_df['REPO_rules_NO_exemplars'] != django_df['raw_acr_test_split_swe_verified'])
]
django_with_differences[['instance_id', 'ALL_rules_NO_exemplars', 'GENERAL_rules_NO_exemplars', 'REPO_rules_NO_exemplars', 'NO_rules_INCL_exemplars', 'raw_acr_test_split_swe_verified']]

Unnamed: 0,instance_id,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,REPO_rules_NO_exemplars,NO_rules_INCL_exemplars,raw_acr_test_split_swe_verified
7,django__django-13417,False,True,True,True,True
9,django__django-13807,False,True,False,False,False
10,django__django-13933,False,True,False,True,True
16,django__django-15814,True,True,False,True,True
17,django__django-15987,False,True,True,False,True


In [31]:
test_df = pd.read_json('../data/test.json')

In [32]:
# Join test_df with eval_data_df on instance_id
merged_df = pd.merge(test_df[['empiric_difficulty', 'instance_id']], eval_data_df, on='instance_id', how='inner')

# Compute median empiric_difficulty for each configuration
config_columns = [
    'ALL_rules_INCL_exemplars', 
    'ALL_rules_NO_exemplars', 
    'GENERAL_rules_NO_exemplars', 
    'REPO_rules_NO_exemplars', 
    'NO_rules_INCL_exemplars',
    'raw_acr_test_split_swe_verified'
]

# Create a results dataframe to store median difficulties
results = []

for config in config_columns:
    # Calculate median difficulty for instances where this config succeeded (True)
    success_median = merged_df[merged_df[config] == True]['empiric_difficulty'].median()
    
    # Calculate median difficulty for instances where this config failed (False)
    failure_median = merged_df[merged_df[config] == False]['empiric_difficulty'].median()
    
    # Count instances for each case
    success_count = merged_df[merged_df[config] == True].shape[0]
    failure_count = merged_df[merged_df[config] == False].shape[0]
    
    results.append({
        'Configuration': config,
        'Success_Median_Difficulty': success_median,
        'Failure_Median_Difficulty': failure_median,
        'Success_Count': success_count,
        'Failure_Count': failure_count
    })

# Convert to dataframe and display
difficulty_by_config = pd.DataFrame(results)
display(difficulty_by_config)

Unnamed: 0,Configuration,Success_Median_Difficulty,Failure_Median_Difficulty,Success_Count,Failure_Count
0,ALL_rules_INCL_exemplars,0.455,0.83,18,27
1,ALL_rules_NO_exemplars,0.43,0.78,14,31
2,GENERAL_rules_NO_exemplars,0.405,0.83,16,29
3,REPO_rules_NO_exemplars,0.405,0.83,14,31
4,NO_rules_INCL_exemplars,0.47,0.89,18,27
5,raw_acr_test_split_swe_verified,0.44,0.89,19,26


In [33]:
# Find instances solved by any configuration but not by raw_acr_test_split_swe_verified
solved_by_any = merged_df[
    (merged_df['ALL_rules_INCL_exemplars'] == True) | 
    (merged_df['ALL_rules_NO_exemplars'] == True) | 
    (merged_df['GENERAL_rules_NO_exemplars'] == True) | 
    (merged_df['REPO_rules_NO_exemplars'] == True) | 
    (merged_df['NO_rules_INCL_exemplars'] == True)
]

# Filter to only those not solved by raw_acr_test_split_swe_verified
solved_by_any_except_raw = solved_by_any[solved_by_any['raw_acr_test_split_swe_verified'] == False]

In [34]:
solved_by_any_except_raw

Unnamed: 0,empiric_difficulty,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
0,0.89,pytest-dev__pytest-7490,False,False,False,True,False,False,pytest-dev
6,0.89,django__django-13807,False,False,True,False,False,False,django
7,0.97,matplotlib__matplotlib-14623,True,True,False,False,False,False,matplotlib
21,0.67,django__django-13279,False,False,False,True,False,False,django
28,0.97,django__django-16938,True,False,False,False,False,False,django
39,0.69,django__django-11211,False,False,False,True,False,False,django


In [35]:
solved_by_baseline = merged_df[
    (merged_df['raw_acr_test_split_swe_verified'] == True) 
]

# Filter to only those not solved by raw_acr_test_split_swe_verified
solved_by_baseline_vs_repo_django = solved_by_baseline[(solved_by_baseline['REPO_rules_NO_exemplars'] == False) & (solved_by_baseline['repository'] == 'django')]

In [36]:
solved_by_baseline_vs_repo_django['empiric_difficulty'].median()

np.float64(0.515)

In [37]:
merged_df[merged_df['repository'] == 'django']['empiric_difficulty'].median()

np.float64(0.655)

In [38]:
solved_by_baseline_vs_repo_django

Unnamed: 0,empiric_difficulty,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
19,0.64,django__django-15814,True,True,True,True,False,True,django
33,0.39,django__django-13933,True,False,True,True,False,True,django


In [39]:
solved_by_baseline[solved_by_baseline['REPO_rules_NO_exemplars'] == False]

Unnamed: 0,empiric_difficulty,instance_id,ALL_rules_INCL_exemplars,ALL_rules_NO_exemplars,GENERAL_rules_NO_exemplars,NO_rules_INCL_exemplars,REPO_rules_NO_exemplars,raw_acr_test_split_swe_verified,repository
19,0.64,django__django-15814,True,True,True,True,False,True,django
20,0.78,sympy__sympy-20801,False,False,False,False,False,True,sympy
23,0.72,sympy__sympy-24661,False,False,False,False,False,True,sympy
27,0.64,matplotlib__matplotlib-13989,True,False,False,True,False,True,matplotlib
33,0.39,django__django-13933,True,False,True,True,False,True,django


In [15]:
import os
import json
import pandas as pd
from pathlib import Path
import tiktoken

# Define the base directory and configurations to analyze
base_dir = Path("../data/runs_stored")
configs_to_analyze = [
    "raw_acr_test_split_swe_verified",
    "benchmark_on_test_ALL_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data",
    "benchmark_on_test_ALL_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data",
    "benchmark_on_test_GENERAL_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data"
    "benchmark_on_test_REPO_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data",
    "benchmark_on_test_NO_rules_INCL_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data"
]

# Initialize the tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")  # OpenAI's tokenizer for GPT models

# Function to count trajectory turns for a given instance directory
def count_trajectory_turns(instance_dir):
    conversation_files = list(instance_dir.glob("conversation_round_*.json"))
    debug_files = list(instance_dir.glob("debug_agent_write_patch_*.json"))
    return len(conversation_files) + len(debug_files)

# Function to get token count from the highest indexed debug file
def get_token_count(instance_dir):
    debug_files = list(instance_dir.glob("debug_agent_write_patch_*.json"))
    if not debug_files:
        return 0
    
    # Get the highest indexed debug file
    highest_index = max([int(f.stem.split('_')[-1]) for f in debug_files])
    highest_debug_file = instance_dir / f"debug_agent_write_patch_{highest_index}.json"
    
    try:
        with open(highest_debug_file, 'r') as f:
            debug_data = json.load(f)
        
        # Count tokens in each message
        total_tokens = 0
        for message in debug_data:
            if 'content' in message and message['content']:
                total_tokens += len(tokenizer.encode(message['content']))
        
        return total_tokens
    except Exception as e:
        print(f"Error processing {highest_debug_file}: {e}")
        return 0

# Dictionary to store results for each configuration
trajectory_data = {config: [] for config in configs_to_analyze}

# Process each configuration
for config in configs_to_analyze:
    config_dir = base_dir / config
    
    # Skip if directory doesn't exist
    if not config_dir.exists():
        print(f"Directory not found: {config_dir}")
        continue
    
    # Walk through all subdirectories
    for root, dirs, files in os.walk(config_dir):
        root_path = Path(root)
        
        # Check if this directory contains trajectory files
        if any(f.startswith("conversation_round_") or f.startswith("debug_agent_write_patch_") for f in files):
            # Extract instance_id from the directory path
            dir_name = root_path.name
            if "_2025" in dir_name:
                instance_id = dir_name.split("_2025")[0]
                
                # Count trajectory turns
                turn_length = count_trajectory_turns(root_path)
                
                # Get token count from debug file
                token_count = get_token_count(root_path)
                
                trajectory_data[config].append({
                    "instance_id": instance_id,
                    "turn_length": turn_length,
                    "trajectory_length": token_count
                })

# Convert to DataFrames
dfs = {}
for config in configs_to_analyze:
    if trajectory_data[config]:
        dfs[config] = pd.DataFrame(trajectory_data[config])
    else:
        print(f"No data found for {config}")

# Calculate median trajectory length for each configuration
for config, df in dfs.items():
    if not df.empty:
        print(f"Median turn length for {config}: {df['turn_length'].median()}")
        print(f"Median token count for {config}: {df['trajectory_length'].median()}")
        print(f"Number of instances analyzed: {len(df)}")

# Display the distribution of trajectory lengths
for config, df in dfs.items():
    if not df.empty:
        print(f"\nTrajectory statistics for {config}:")
        print("Turn length:")
        print(df['turn_length'].describe())
        print("\nToken count:")
        print(df['trajectory_length'].describe())

# Create a summary DataFrame
summary_data = []
for config, df in dfs.items():
    if not df.empty:
        summary_data.append({
            "Configuration": config,
            "Median_Turn_Length": df['turn_length'].median(),
            "Mean_Turn_Length": df['turn_length'].mean(),
            "Min_Turn_Length": df['turn_length'].min(),
            "Max_Turn_Length": df['turn_length'].max(),
            "Median_Trajectory_Length": df['trajectory_length'].median(),
            "Mean_Trajectory_Length": df['trajectory_length'].mean(),
            "Min_Trajectory_Length": df['trajectory_length'].min(),
            "Max_Trajectory_Length": df['trajectory_length'].max(),
            "Instance_Count": len(df)
        })

if summary_data:
    summary_df = pd.DataFrame(summary_data)
    display(summary_df)


Median turn length for raw_acr_test_split_swe_verified: 9.0
Median token count for raw_acr_test_split_swe_verified: 10027.0
Number of instances analyzed: 45
Median turn length for benchmark_on_test_REPO_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data: 8.0
Median token count for benchmark_on_test_REPO_rules_NO_exemplars_from_o1_with_gpt_4o_and_combined_lite_and_verified_data: 10130.0
Number of instances analyzed: 45

Trajectory statistics for raw_acr_test_split_swe_verified:
Turn length:
count    45.000000
mean     11.155556
std       5.538935
min       4.000000
25%       7.000000
50%       9.000000
75%      18.000000
max      20.000000
Name: turn_length, dtype: float64

Token count:
count       45.000000
mean     11414.022222
std       6631.648196
min       3654.000000
25%       6760.000000
50%      10027.000000
75%      12503.000000
max      31408.000000
Name: trajectory_length, dtype: float64

Trajectory statistics for benchmark_on_test_REPO_rules_NO_exempl

Unnamed: 0,Configuration,Median_Turn_Length,Mean_Turn_Length,Min_Turn_Length,Max_Turn_Length,Median_Trajectory_Length,Mean_Trajectory_Length,Min_Trajectory_Length,Max_Trajectory_Length,Instance_Count
0,raw_acr_test_split_swe_verified,9.0,11.155556,4,20,10027.0,11414.022222,3654,31408,45
1,benchmark_on_test_REPO_rules_NO_exemplars_from...,8.0,9.666667,4,20,10130.0,12139.866667,3953,38730,45
