In [None]:
import pandas as pd
import json
import os

# Report locations for each run. The four run directories differ only in the
# trailing run index, so every path is derived from one template instead of
# being spelled out (and kept in sync) by hand.
REPORT_PATH_TEMPLATE = (
    './sonet_openhands/'
    'claude-3-7-sonnet-20250219_maxiter_100_N_v0.31.0-no-hint-juan-inst-t1-run_{run_index}'
    '/report.json'
)
NUM_RUNS = 4

# Maps 'run1'..'run<NUM_RUNS>' to that run's report.json path; insertion order
# follows the run index.
report_paths = {
    f'run{run_index}': REPORT_PATH_TEMPLATE.format(run_index=run_index)
    for run_index in range(1, NUM_RUNS + 1)
}

# Read the aggregated table that the accuracy columns are later attached to
csv_path = './dataset/swe_bench_token_cost_aggregated_total.csv'
df = pd.read_csv(csv_path)

# Sanity report: dimensions, column names, and the leading instance ids
print(f"Original CSV shape: {df.shape}")
column_names = df.columns.tolist()
print(f"Columns: {column_names}")
first_instance_ids = df['instance_id'].iloc[:5].tolist()
print(f"First few instance_ids: {first_instance_ids}")


Original CSV shape: (500, 65)
Columns: ['repo', 'instance_id', 'base_commit', 'patch', 'test_patch', 'problem_statement', 'hints_text', 'created_at', 'version', 'FAIL_TO_PASS', 'PASS_TO_PASS', 'environment_setup_commit', 'difficulty', 'total_prompt_tokens_run1', 'total_completion_tokens_run1', 'total_total_tokens_run1', 'total_cache_creation_input_tokens_run1', 'total_cache_read_input_tokens_run1', 'total_cached_tokens_run1', 'total_tool_usages_run1', 'total_tool_usage_str_replace_editor_run1', 'total_tool_usage_execute_bash_run1', 'total_tool_usage_think_run1', 'total_tool_usage_finish_run1', 'total_interaction_rounds_run1', 'total_cost_run1', 'total_prompt_tokens_run2', 'total_completion_tokens_run2', 'total_total_tokens_run2', 'total_cache_creation_input_tokens_run2', 'total_cache_read_input_tokens_run2', 'total_cached_tokens_run2', 'total_tool_usages_run2', 'total_tool_usage_str_replace_editor_run2', 'total_tool_usage_execute_bash_run2', 'total_tool_usage_think_run2', 'total_tool_u

In [2]:
# Function to load resolved_ids from each report file
def load_resolved_ids(report_path):
    """Load the set of resolved instance ids from a report.json file.

    Args:
        report_path: Path to a JSON file whose top-level object is expected
            to hold a 'resolved_ids' list.

    Returns:
        set: The entries of 'resolved_ids'. An empty set is returned when the
        key is absent or null. An empty set is also returned, after printing
        a warning, when the file is missing, is not valid JSON, or does not
        have the expected structure (top level not an object, or
        'resolved_ids' not a list).
    """
    try:
        # JSON interchange files are UTF-8; do not depend on the locale default.
        with open(report_path, 'r', encoding='utf-8') as f:
            report_data = json.load(f)
    except FileNotFoundError:
        print(f"Warning: Report file not found: {report_path}")
        return set()
    except json.JSONDecodeError:
        print(f"Warning: Could not parse JSON from: {report_path}")
        return set()

    # A top-level array/scalar has no .get(); treat it like any other bad report.
    if not isinstance(report_data, dict):
        print(f"Warning: Unexpected report structure in: {report_path}")
        return set()

    resolved = report_data.get('resolved_ids')
    if resolved is None:
        # Key absent or explicitly null: nothing was resolved.
        return set()
    if not isinstance(resolved, list):
        # set() on a string or dict would silently yield characters / keys.
        print(f"Warning: 'resolved_ids' is not a list in: {report_path}")
        return set()
    return set(resolved)

# Collect the resolved-id set of every run, reporting the size as each one loads
resolved_ids_by_run = {}
for name, path in report_paths.items():
    resolved_ids_by_run[name] = load_resolved_ids(path)
    print(f"{name}: {len(resolved_ids_by_run[name])} resolved instances")

# Second pass: flag runs that came back empty, otherwise preview a few ids
for name, ids in resolved_ids_by_run.items():
    if ids:
        print(f"{name}: Sample resolved_ids: {list(ids)[:5]}")
    else:
        print(f"Warning: No resolved_ids found for {name}")


run1: 297 resolved instances
run2: 303 resolved instances
run3: 300 resolved instances
run4: 301 resolved instances
run1: Sample resolved_ids: ['django__django-13933', 'sympy__sympy-19637', 'django__django-11265', 'django__django-13315', 'django__django-15103']
run2: Sample resolved_ids: ['django__django-13933', 'sympy__sympy-19637', 'django__django-11265', 'django__django-13315', 'django__django-15103']
run3: Sample resolved_ids: ['django__django-13933', 'sympy__sympy-19637', 'django__django-11265', 'django__django-15103', 'matplotlib__matplotlib-25122']
run4: Sample resolved_ids: ['django__django-13933', 'sympy__sympy-19637', 'django__django-15103', 'matplotlib__matplotlib-25122', 'django__django-13568']


In [3]:
# Attach one 0/1 accuracy column per run to the dataframe
for run_name in ['run1', 'run2', 'run3', 'run4']:
    acc_column_name = f'acc_{run_name}'

    if run_name not in resolved_ids_by_run:
        # Nothing was loaded for this run: fill the column with zeros
        df[acc_column_name] = 0
        print(f"Added {acc_column_name}: No data available, set all to 0")
        continue

    # 1 when the row's instance_id is in the run's resolved set, else 0
    is_resolved = df['instance_id'].isin(resolved_ids_by_run[run_name])
    df[acc_column_name] = is_resolved.astype(int)
    print(f"Added {acc_column_name}: {df[acc_column_name].sum()} instances marked as resolved (accuracy=1)")

# Per-run summary: resolved count, row count, and percentage
print("\nAccuracy Summary:")
for run_name in ['run1', 'run2', 'run3', 'run4']:
    acc_column_name = f'acc_{run_name}'
    resolved_count = df[acc_column_name].sum()
    total_instances = len(df)
    accuracy_rate = 100 * (resolved_count / total_instances)
    print(f"{acc_column_name}: {resolved_count}/{total_instances} ({accuracy_rate:.2f}%)")


Added acc_run1: 297 instances marked as resolved (accuracy=1)
Added acc_run2: 303 instances marked as resolved (accuracy=1)
Added acc_run3: 300 instances marked as resolved (accuracy=1)
Added acc_run4: 301 instances marked as resolved (accuracy=1)

Accuracy Summary:
acc_run1: 297/500 (59.40%)
acc_run2: 303/500 (60.60%)
acc_run3: 300/500 (60.00%)
acc_run4: 301/500 (60.20%)


In [4]:
# Confirm which accuracy columns are present on the dataframe
print("New columns added:")
new_columns = ['acc_run1', 'acc_run2', 'acc_run3', 'acc_run4']
present_columns = [col for col in new_columns if col in df.columns]
for col in present_columns:
    print(f"✓ {col}")

print(f"\nUpdated CSV shape: {df.shape}")
print(f"New columns: {new_columns}")

# Preview the first ten rows: instance id next to the four accuracy flags
print("\nSample data with accuracy columns:")
sample_cols = ['instance_id', 'acc_run1', 'acc_run2', 'acc_run3', 'acc_run4']
preview = df[sample_cols].head(10)
print(preview)


New columns added:
✓ acc_run1
✓ acc_run2
✓ acc_run3
✓ acc_run4

Updated CSV shape: (500, 69)
New columns: ['acc_run1', 'acc_run2', 'acc_run3', 'acc_run4']

Sample data with accuracy columns:
              instance_id  acc_run1  acc_run2  acc_run3  acc_run4
0  astropy__astropy-12907         1         1         1         1
1  astropy__astropy-13033         0         0         0         0
2  astropy__astropy-13236         0         0         0         0
3  astropy__astropy-13398         0         0         0         0
4  astropy__astropy-13453         1         0         1         0
5  astropy__astropy-13579         1         1         1         1
6  astropy__astropy-13977         0         0         0         0
7  astropy__astropy-14096         0         1         0         1
8  astropy__astropy-14182         0         0         0         0
9  astropy__astropy-14309         1         1         1         1


In [None]:
# Write the dataframe, now carrying the accuracy columns, to a new CSV
output_path = './dataset/swe_bench_token_cost_aggregated_total_with_accuracy.csv'
df.to_csv(output_path, index=False)

print(f"Updated CSV saved to: {output_path}")
print(f"Final shape: {df.shape}")

# Read the file back to confirm what was actually written
print("\nVerification - checking saved file:")
df_verify = pd.read_csv(output_path)
print(f"Saved file shape: {df_verify.shape}")
saved_acc_columns = [col for col in df_verify.columns if col.startswith('acc_')]
print(f"Accuracy columns in saved file: {saved_acc_columns}")

# Recompute the per-run summary from the reloaded file
print("\nFinal Accuracy Summary:")
for run_name in ['run1', 'run2', 'run3', 'run4']:
    acc_column_name = f'acc_{run_name}'
    if acc_column_name not in df_verify.columns:
        continue
    resolved_count = df_verify[acc_column_name].sum()
    total_instances = len(df_verify)
    accuracy_rate = 100 * (resolved_count / total_instances)
    print(f"{acc_column_name}: {resolved_count}/{total_instances} ({accuracy_rate:.2f}%)")


Updated CSV saved to: /nfs/turbo/coe-mihalcea/longju/token_consumption/organized_codebase/dataset/swe_bench_token_cost_aggregated_total_with_accuracy.csv
Final shape: (500, 69)

Verification - checking saved file:
Saved file shape: (500, 69)
Accuracy columns in saved file: ['acc_run1', 'acc_run2', 'acc_run3', 'acc_run4']

Final Accuracy Summary:
acc_run1: 297/500 (59.40%)
acc_run2: 303/500 (60.60%)
acc_run3: 300/500 (60.00%)
acc_run4: 301/500 (60.20%)
