In [1]:
import pandas as pd

# Load the aggregated SWE-bench token/cost table from the working directory.
df = pd.read_csv("swe_bench_token_cost_aggregated_total.csv")

# Keep the column names as a plain list and echo them as the cell output.
cols = list(df.columns)
cols

['repo',
 'instance_id',
 'base_commit',
 'patch',
 'test_patch',
 'problem_statement',
 'hints_text',
 'created_at',
 'version',
 'FAIL_TO_PASS',
 'PASS_TO_PASS',
 'environment_setup_commit',
 'difficulty',
 'total_prompt_tokens_run1',
 'total_completion_tokens_run1',
 'total_total_tokens_run1',
 'total_cache_creation_input_tokens_run1',
 'total_cache_read_input_tokens_run1',
 'total_cached_tokens_run1',
 'total_tool_usages_run1',
 'total_tool_usage_str_replace_editor_run1',
 'total_tool_usage_execute_bash_run1',
 'total_tool_usage_think_run1',
 'total_tool_usage_finish_run1',
 'total_interaction_rounds_run1',
 'total_cost_run1',
 'total_prompt_tokens_run2',
 'total_completion_tokens_run2',
 'total_total_tokens_run2',
 'total_cache_creation_input_tokens_run2',
 'total_cache_read_input_tokens_run2',
 'total_cached_tokens_run2',
 'total_tool_usages_run2',
 'total_tool_usage_str_replace_editor_run2',
 'total_tool_usage_execute_bash_run2',
 'total_tool_usage_think_run2',
 'total_tool_usag

In [None]:
import json
from pathlib import Path

# ---------------------------------------------------------------------
# CONFIG – adjust only if your folder names have changed
# ---------------------------------------------------------------------
BASE = Path("./sonet_openhands")
RUN_DIR_TMPL = (
    "claude-3-7-sonnet-20250219_maxiter_100_N_v0.31.0-no-hint-juan-inst-t1-run_{i}"
)
EXTRACT_DIR_TMPL = RUN_DIR_TMPL + "_all_interaction_extract"

TOOLS = ["str_replace_editor", "execute_bash", "think"]  # we ignore "finish"
RUN_IDS = range(1, 5)                                    # runs 1–4
# ---------------------------------------------------------------------


def _sum_tool_output_tokens(rounds, tools):
    """Sum ``tool_output_tokens`` per tool over the rounds of one instance.

    Parameters
    ----------
    rounds : dict
        The ``"rounds"`` object of a ``summary_rounds.json`` file. Its values
        are read with ``.get("tool_executed_name")`` and
        ``.get("tool_output_tokens")``.
    tools : iterable of str
        Tool names to accumulate; rounds naming any other tool are ignored.

    Returns
    -------
    dict
        ``{tool: summed tool_output_tokens}`` with an entry (possibly 0) for
        every name in ``tools``.
    """
    totals = {tool: 0 for tool in tools}
    for round_data in rounds.values():
        tool_name = round_data.get("tool_executed_name")
        if tool_name in totals:
            # ``or 0`` also covers a key that is present but null, which
            # would otherwise raise TypeError on ``+=``.
            totals[tool_name] += round_data.get("tool_output_tokens") or 0
    return totals


# ---------------------------------------------------------------------
# 1. Create the new columns (filled with 0 by default)
# ---------------------------------------------------------------------
for run_id in RUN_IDS:
    for tool in TOOLS:
        col = f"total_tool_usage_{tool}_token_cost_run{run_id}"
        if col not in df.columns:
            df[col] = 0   # initialise

# ---------------------------------------------------------------------
# 2. For every run & every instance, sum token cost per tool
# ---------------------------------------------------------------------
# Only the instance ids are needed, so avoid df.iterrows(), which would build
# a full Series (including the large text columns) for every row.
instance_ids = df["instance_id"].tolist()

for run_id in RUN_IDS:
    extract_root = BASE / EXTRACT_DIR_TMPL.format(i=run_id)

    if not extract_root.exists():
        print(f"Extract dir for run {run_id} not found → skipping")
        continue

    # Positional buffers, one per tool. Starting from zeros on every execution
    # keeps the cell idempotent: an instance whose file is missing or unreadable
    # ends up as 0 rather than keeping a value from an earlier execution.
    run_totals = {tool: [0] * len(instance_ids) for tool in TOOLS}
    n_loaded = 0

    for pos, inst_id in enumerate(instance_ids):
        summary_rounds = extract_root / inst_id / "summary_rounds.json"
        if not summary_rounds.exists():
            continue  # leave zeros if this run/instance missing

        try:
            with open(summary_rounds, encoding="utf-8") as fh:
                rounds = json.load(fh)["rounds"]
            # accumulate per-tool token totals, keyed on **tool_executed_name**
            token_totals = _sum_tool_output_tokens(rounds, TOOLS)
        except (OSError, ValueError, KeyError, TypeError, AttributeError) as exc:
            # Unreadable file, invalid JSON, or unexpected structure:
            # report it and keep going with the remaining instances.
            print(f"✗ cannot read {summary_rounds}: {exc}")
            continue

        for tool in TOOLS:
            run_totals[tool][pos] = token_totals[tool]
        n_loaded += 1

    # write into dataframe: one whole-column assignment per tool (positional,
    # so it does not depend on the index labels being unique)
    for tool in TOOLS:
        df[f"total_tool_usage_{tool}_token_cost_run{run_id}"] = run_totals[tool]

    # Instances without data stay at 0 and therefore lower any later average;
    # report the coverage so that effect is visible.
    print(f"run {run_id}: token costs loaded for {n_loaded}/{len(instance_ids)} instances")


print("✓ Added per-tool token-cost columns for all runs")


✓ Added per-tool token-cost columns for all runs


In [5]:
# Inspect the frame now that the per-tool token-cost columns have been added.
display(df)

Unnamed: 0,repo,instance_id,base_commit,patch,test_patch,problem_statement,hints_text,created_at,version,FAIL_TO_PASS,...,total_tool_usage_think_token_cost_run1,total_tool_usage_str_replace_editor_token_cost_run2,total_tool_usage_execute_bash_token_cost_run2,total_tool_usage_think_token_cost_run2,total_tool_usage_str_replace_editor_token_cost_run3,total_tool_usage_execute_bash_token_cost_run3,total_tool_usage_think_token_cost_run3,total_tool_usage_str_replace_editor_token_cost_run4,total_tool_usage_execute_bash_token_cost_run4,total_tool_usage_think_token_cost_run4
0,astropy/astropy,astropy__astropy-12907,d16bfe05a744909de4b27f5875fe0d4ed41ce607,diff --git a/astropy/modeling/separable.py b/a...,diff --git a/astropy/modeling/tests/test_separ...,Modeling's `separability_matrix` does not comp...,,2022-03-03T15:14:54Z,4.30,"[""astropy/modeling/tests/test_separable.py::te...",...,8,17244,5004,8,9711,8497,24,8902,31978,16
1,astropy/astropy,astropy__astropy-13033,298ccb478e6bf092953bca67a3d29dc6c35f6752,diff --git a/astropy/timeseries/core.py b/astr...,diff --git a/astropy/timeseries/tests/test_sam...,TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,2022-03-31T23:28:27Z,4.30,"[""astropy/timeseries/tests/test_sampled.py::te...",...,0,7328,3596,8,11224,4478,0,7013,3180,0
2,astropy/astropy,astropy__astropy-13236,6ed769d58d89380ebaa1ef52b300691eefda8928,diff --git a/astropy/table/table.py b/astropy/...,diff --git a/astropy/table/tests/test_mixin.py...,Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,2022-05-09T14:16:30Z,5.00,"[""astropy/table/tests/test_mixin.py::test_ndar...",...,0,16278,8919,0,6861,16341,0,10316,4024,8
3,astropy/astropy,astropy__astropy-13398,6500928dc0e57be8f06d1162eacc3ba5e2eff692,diff --git a/astropy/coordinates/builtin_frame...,diff --git a/astropy/coordinates/tests/test_in...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...",2022-06-24T15:22:11Z,5.00,"[""astropy/coordinates/tests/test_intermediate_...",...,24,17654,5801,0,2955,14577,0,24911,17985,0
4,astropy/astropy,astropy__astropy-13453,19cc80471739bcb67b7e8099246b391c355023ee,diff --git a/astropy/io/ascii/html.py b/astrop...,diff --git a/astropy/io/ascii/tests/test_html....,ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,2022-07-14T10:04:40Z,5.00,"[""astropy/io/ascii/tests/test_html.py::test_wr...",...,8,25236,17924,0,10073,8934,0,38611,19745,16
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,sympy/sympy,sympy__sympy-24213,e8c22f6eac7314be8d92590bfff92ced79ee03e2,diff --git a/sympy/physics/units/unitsystem.py...,diff --git a/sympy/physics/units/tests/test_qu...,collect_factor_and_dimension does not detect e...,,2022-11-03T14:00:09Z,1.12,"[""test_issue_24211""]",...,16,6353,14847,0,7960,7794,0,12112,27788,0
496,sympy/sympy,sympy__sympy-24443,809c53c077485ca48a206cee78340389cb83b7f1,diff --git a/sympy/combinatorics/homomorphisms...,diff --git a/sympy/combinatorics/tests/test_ho...,`_check_homomorphism` is broken on Permutation...,,2022-12-30T14:43:19Z,1.12,"[""test_homomorphism""]",...,8,76613,25974,32,9823,10412,16,12035,8887,8
497,sympy/sympy,sympy__sympy-24539,193e3825645d93c73e31cdceb6d742cc6919624d,diff --git a/sympy/polys/rings.py b/sympy/poly...,diff --git a/sympy/polys/tests/test_rings.py b...,`PolyElement.as_expr()` not accepting symbols\...,,2023-01-17T17:26:42Z,1.12,"[""test_PolyElement_as_expr""]",...,0,9346,5094,0,4900,6761,0,4915,5032,0
498,sympy/sympy,sympy__sympy-24562,b1cb676cf92dd1a48365b731979833375b188bf2,diff --git a/sympy/core/numbers.py b/sympy/cor...,diff --git a/sympy/core/tests/test_numbers.py ...,"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,2023-01-21T12:06:36Z,1.12,"[""test_issue_24543""]",...,0,13432,5271,8,10840,12017,16,14569,12197,0


In [None]:
# ---------------------------------------------------------------------
#   Row-wise average of token cost per tool over the runs in RUN_IDS
# ---------------------------------------------------------------------
# TOOLS and RUN_IDS come from the configuration cell; they are deliberately
# not redefined here so the averaged columns always match the ones created.
# NOTE: a run/instance with no extracted data holds 0 in its token-cost
# column, so such entries pull the row mean down.
for tool in TOOLS:
    run_cols = [f"total_tool_usage_{tool}_token_cost_run{r}" for r in RUN_IDS]
    df[f"avg_token_cost_{tool}"] = df[run_cols].mean(axis=1)

# ---------------------------------------------------------------------
#   Save updated CSV
# ---------------------------------------------------------------------
out_path = "./swe_bench_token_cost_aggregated_total_with_avgs.csv"
df.to_csv(out_path, index=False)
print(f"  Saved augmented CSV → {out_path}")

# ---------------------------------------------------------------------
#   Print overall mean of those averages (across all problems)
# ---------------------------------------------------------------------
# Report the actual number of runs and rows instead of hard-coded counts,
# so the headline stays true if RUN_IDS or the input CSV changes.
n_runs = len(RUN_IDS)
n_problems = len(df)
print(
    f"\n=== Average token cost per tool (averaged over {n_runs} runs, "
    f"then averaged over {n_problems} problems) ==="
)
for tool in TOOLS:
    overall_mean = df[f"avg_token_cost_{tool}"].mean()
    print(f"{tool:22s}: {overall_mean:,.1f} tokens")

✅  Saved augmented CSV → /home/longju/llm_token_predictor/oh_analysis/swe_bench_token_cost_aggregated_total_with_avgs.csv

=== Average token cost per tool (averaged over 4 runs, then averaged over 500 problems) ===
str_replace_editor    : 15,210.6 tokens
execute_bash          : 15,128.8 tokens
think                 : 7.0 tokens


In [7]:
# Inspect the frame, including the newly added avg_token_cost_* columns.
df

Unnamed: 0,repo,instance_id,base_commit,patch,test_patch,problem_statement,hints_text,created_at,version,FAIL_TO_PASS,...,total_tool_usage_think_token_cost_run2,total_tool_usage_str_replace_editor_token_cost_run3,total_tool_usage_execute_bash_token_cost_run3,total_tool_usage_think_token_cost_run3,total_tool_usage_str_replace_editor_token_cost_run4,total_tool_usage_execute_bash_token_cost_run4,total_tool_usage_think_token_cost_run4,avg_token_cost_str_replace_editor,avg_token_cost_execute_bash,avg_token_cost_think
0,astropy/astropy,astropy__astropy-12907,d16bfe05a744909de4b27f5875fe0d4ed41ce607,diff --git a/astropy/modeling/separable.py b/a...,diff --git a/astropy/modeling/tests/test_separ...,Modeling's `separability_matrix` does not comp...,,2022-03-03T15:14:54Z,4.30,"[""astropy/modeling/tests/test_separable.py::te...",...,8,9711,8497,24,8902,31978,16,12135.25,12893.75,14.0
1,astropy/astropy,astropy__astropy-13033,298ccb478e6bf092953bca67a3d29dc6c35f6752,diff --git a/astropy/timeseries/core.py b/astr...,diff --git a/astropy/timeseries/tests/test_sam...,TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,2022-03-31T23:28:27Z,4.30,"[""astropy/timeseries/tests/test_sampled.py::te...",...,8,11224,4478,0,7013,3180,0,8219.25,6572.50,2.0
2,astropy/astropy,astropy__astropy-13236,6ed769d58d89380ebaa1ef52b300691eefda8928,diff --git a/astropy/table/table.py b/astropy/...,diff --git a/astropy/table/tests/test_mixin.py...,Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,2022-05-09T14:16:30Z,5.00,"[""astropy/table/tests/test_mixin.py::test_ndar...",...,0,6861,16341,0,10316,4024,8,11459.50,11383.50,2.0
3,astropy/astropy,astropy__astropy-13398,6500928dc0e57be8f06d1162eacc3ba5e2eff692,diff --git a/astropy/coordinates/builtin_frame...,diff --git a/astropy/coordinates/tests/test_in...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...",2022-06-24T15:22:11Z,5.00,"[""astropy/coordinates/tests/test_intermediate_...",...,0,2955,14577,0,24911,17985,0,15930.50,10217.00,6.0
4,astropy/astropy,astropy__astropy-13453,19cc80471739bcb67b7e8099246b391c355023ee,diff --git a/astropy/io/ascii/html.py b/astrop...,diff --git a/astropy/io/ascii/tests/test_html....,ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,2022-07-14T10:04:40Z,5.00,"[""astropy/io/ascii/tests/test_html.py::test_wr...",...,0,10073,8934,0,38611,19745,16,20908.75,19186.50,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,sympy/sympy,sympy__sympy-24213,e8c22f6eac7314be8d92590bfff92ced79ee03e2,diff --git a/sympy/physics/units/unitsystem.py...,diff --git a/sympy/physics/units/tests/test_qu...,collect_factor_and_dimension does not detect e...,,2022-11-03T14:00:09Z,1.12,"[""test_issue_24211""]",...,0,7960,7794,0,12112,27788,0,10724.25,17422.25,4.0
496,sympy/sympy,sympy__sympy-24443,809c53c077485ca48a206cee78340389cb83b7f1,diff --git a/sympy/combinatorics/homomorphisms...,diff --git a/sympy/combinatorics/tests/test_ho...,`_check_homomorphism` is broken on Permutation...,,2022-12-30T14:43:19Z,1.12,"[""test_homomorphism""]",...,32,9823,10412,16,12035,8887,8,28968.00,14446.75,16.0
497,sympy/sympy,sympy__sympy-24539,193e3825645d93c73e31cdceb6d742cc6919624d,diff --git a/sympy/polys/rings.py b/sympy/poly...,diff --git a/sympy/polys/tests/test_rings.py b...,`PolyElement.as_expr()` not accepting symbols\...,,2023-01-17T17:26:42Z,1.12,"[""test_PolyElement_as_expr""]",...,0,4900,6761,0,4915,5032,0,6068.00,8747.75,0.0
498,sympy/sympy,sympy__sympy-24562,b1cb676cf92dd1a48365b731979833375b188bf2,diff --git a/sympy/core/numbers.py b/sympy/cor...,diff --git a/sympy/core/tests/test_numbers.py ...,"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,2023-01-21T12:06:36Z,1.12,"[""test_issue_24543""]",...,8,10840,12017,16,14569,12197,0,11768.50,10305.50,6.0


In [8]:
# Per-tool call statistics: the average number of calls per run, and the
# average token cost per call pooled over all runs of each problem.
for tool in TOOLS:
    # ----------  average #calls across the runs --------------------
    call_cols = [f"total_tool_usage_{tool}_run{r}" for r in RUN_IDS]
    token_cols = [f"total_tool_usage_{tool}_token_cost_run{r}" for r in RUN_IDS]

    df[f"avg_tool_usage_{tool}"] = df[call_cols].mean(axis=1)

    # ----------   average tokens per *call* -----------------------
    total_calls = df[call_cols].sum(axis=1)
    total_tokens = df[token_cols].sum(axis=1)

    # Avoid division-by-zero: mask zero denominators *before* dividing.
    # fillna(0) alone only catches 0/0 (NaN); a row with tokens > 0 but
    # 0 recorded calls would produce inf and end up in the saved column.
    safe_calls = total_calls.where(total_calls > 0)
    df[f"avg_token_cost_{tool}_per_call"] = total_tokens.div(safe_calls).fillna(0)

# ---------------------------------------------------------------------
#    Print global mean token-per-call for each tool
# ---------------------------------------------------------------------
# The counts in the headline are derived from the data rather than hard-coded.
print(
    f"=== Global average token cost *per call* "
    f"(across {len(df)} problems & {len(RUN_IDS)} runs) ==="
)
for tool in TOOLS:
    # Only problems where the tool was called at least once contribute;
    # rows with zero calls carry a placeholder 0, not a real per-call cost.
    tool_was_used = df[f"avg_tool_usage_{tool}"] > 0
    per_call_series = df.loc[tool_was_used, f"avg_token_cost_{tool}_per_call"]
    print(f"{tool:22s}: {per_call_series.mean():,.1f} tokens")

df

=== Global average token cost *per call* (across 500 problems & 4 runs) ===
str_replace_editor    : 818.6 tokens
execute_bash          : 626.0 tokens
think                 : 11.0 tokens


Unnamed: 0,repo,instance_id,base_commit,patch,test_patch,problem_statement,hints_text,created_at,version,FAIL_TO_PASS,...,total_tool_usage_think_token_cost_run4,avg_token_cost_str_replace_editor,avg_token_cost_execute_bash,avg_token_cost_think,avg_tool_usage_str_replace_editor,avg_token_cost_str_replace_editor_per_call,avg_tool_usage_execute_bash,avg_token_cost_execute_bash_per_call,avg_tool_usage_think,avg_token_cost_think_per_call
0,astropy/astropy,astropy__astropy-12907,d16bfe05a744909de4b27f5875fe0d4ed41ce607,diff --git a/astropy/modeling/separable.py b/a...,diff --git a/astropy/modeling/tests/test_separ...,Modeling's `separability_matrix` does not comp...,,2022-03-03T15:14:54Z,4.30,"[""astropy/modeling/tests/test_separable.py::te...",...,16,12135.25,12893.75,14.0,20.50,591.963415,27.50,468.863636,1.75,8.0
1,astropy/astropy,astropy__astropy-13033,298ccb478e6bf092953bca67a3d29dc6c35f6752,diff --git a/astropy/timeseries/core.py b/astr...,diff --git a/astropy/timeseries/tests/test_sam...,TimeSeries: misleading exception when required...,The relevant code that produces the misleading...,2022-03-31T23:28:27Z,4.30,"[""astropy/timeseries/tests/test_sampled.py::te...",...,0,8219.25,6572.50,2.0,11.25,730.600000,16.75,392.388060,0.25,8.0
2,astropy/astropy,astropy__astropy-13236,6ed769d58d89380ebaa1ef52b300691eefda8928,diff --git a/astropy/table/table.py b/astropy/...,diff --git a/astropy/table/tests/test_mixin.py...,Consider removing auto-transform of structured...,@mhvk - I'm happy to do this PR if you think i...,2022-05-09T14:16:30Z,5.00,"[""astropy/table/tests/test_mixin.py::test_ndar...",...,8,11459.50,11383.50,2.0,18.25,627.917808,23.25,489.612903,0.25,8.0
3,astropy/astropy,astropy__astropy-13398,6500928dc0e57be8f06d1162eacc3ba5e2eff692,diff --git a/astropy/coordinates/builtin_frame...,diff --git a/astropy/coordinates/tests/test_in...,A direct approach to ITRS to Observed transfor...,"cc @StuartLittlefair, @adrn, @eteq, @eerovaher...",2022-06-24T15:22:11Z,5.00,"[""astropy/coordinates/tests/test_intermediate_...",...,0,15930.50,10217.00,6.0,15.75,1011.460317,19.00,537.736842,0.75,8.0
4,astropy/astropy,astropy__astropy-13453,19cc80471739bcb67b7e8099246b391c355023ee,diff --git a/astropy/io/ascii/html.py b/astrop...,diff --git a/astropy/io/ascii/tests/test_html....,ASCII table output to HTML does not support su...,Welcome to Astropy 👋 and thank you for your fi...,2022-07-14T10:04:40Z,5.00,"[""astropy/io/ascii/tests/test_html.py::test_wr...",...,16,20908.75,19186.50,6.0,35.00,597.392857,39.75,482.679245,0.50,12.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,sympy/sympy,sympy__sympy-24213,e8c22f6eac7314be8d92590bfff92ced79ee03e2,diff --git a/sympy/physics/units/unitsystem.py...,diff --git a/sympy/physics/units/tests/test_qu...,collect_factor_and_dimension does not detect e...,,2022-11-03T14:00:09Z,1.12,"[""test_issue_24211""]",...,0,10724.25,17422.25,4.0,14.25,752.578947,22.75,765.813187,0.50,8.0
496,sympy/sympy,sympy__sympy-24443,809c53c077485ca48a206cee78340389cb83b7f1,diff --git a/sympy/combinatorics/homomorphisms...,diff --git a/sympy/combinatorics/tests/test_ho...,`_check_homomorphism` is broken on Permutation...,,2022-12-30T14:43:19Z,1.12,"[""test_homomorphism""]",...,8,28968.00,14446.75,16.0,35.50,816.000000,39.00,370.429487,1.00,16.0
497,sympy/sympy,sympy__sympy-24539,193e3825645d93c73e31cdceb6d742cc6919624d,diff --git a/sympy/polys/rings.py b/sympy/poly...,diff --git a/sympy/polys/tests/test_rings.py b...,`PolyElement.as_expr()` not accepting symbols\...,,2023-01-17T17:26:42Z,1.12,"[""test_PolyElement_as_expr""]",...,0,6068.00,8747.75,0.0,11.00,551.636364,16.00,546.734375,0.00,0.0
498,sympy/sympy,sympy__sympy-24562,b1cb676cf92dd1a48365b731979833375b188bf2,diff --git a/sympy/core/numbers.py b/sympy/cor...,diff --git a/sympy/core/tests/test_numbers.py ...,"Rational calc value error\npython 3.11, sympy ...",This should probably raise an error. The expec...,2023-01-21T12:06:36Z,1.12,"[""test_issue_24543""]",...,0,11768.50,10305.50,6.0,15.00,784.566667,20.25,508.913580,0.75,8.0


In [None]:
# Optionally persist the augmented dataframe once more. This writes to a
# separate "..._with_tool_avgs.csv" file, so the earlier "..._with_avgs.csv"
# export is not overwritten. The index is not written (index=False).
df.to_csv("./swe_bench_token_cost_aggregated_total_with_tool_avgs.csv", index=False)
print("  Saved augmented dataframe with per-call stats.")

  Saved augmented dataframe with per-call stats.
