# Estimate cost of running a model on the KindsOfReasoning dataset


In [1]:
import pandas as pd

from set_up_datasets import DATASETS_DICT
from src.utils import compute_n_tokens, compute_cost

# enable reloading of modules
%load_ext autoreload
%autoreload 2

In [2]:
from transformers import GPT2Tokenizer

tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

  _torch_pytree._register_pytree_node(


In [3]:
DATASETS_DICT

{'space_nli': set_up_datasets.SpaceNLI,
 'anli': set_up_datasets.ANLI,
 'copa': set_up_datasets.COPA,
 'alpha_nli': set_up_datasets.AlphaNLI,
 'wanli': set_up_datasets.WANLI,
 'babi_task_16': set_up_datasets.bABItask16,
 'ropes': set_up_datasets.ROPES,
 'cosmos_qa': set_up_datasets.CosmosQA,
 'formal_fallacies_syllogisms_negation': set_up_datasets.FormalFallaciesSyllogismsNegation,
 'logical_args': set_up_datasets.LogicalArgs,
 'crass_ai': set_up_datasets.CrassAI,
 'geometric_shapes': set_up_datasets.GeometricShapes,
 'emoji_movie': set_up_datasets.EmojiMovie,
 'odd_one_out': set_up_datasets.OddOneOut,
 'metaphor_boolean': set_up_datasets.MetaphorBoolean,
 'fantasy_reasoning': set_up_datasets.FantasyReasoning,
 'abstract_narrative_understanding': set_up_datasets.AbstractNarrativeUnderstanding,
 'cause_and_effect': set_up_datasets.CauseAndEffect,
 'goal_step_wikihow': set_up_datasets.GoalStepWikihow,
 'arithmetic': set_up_datasets.Arithmetic}

In [4]:
len(DATASETS_DICT)

20

# Extract the number of tokens for each dataset and the cost per model

## Compute the number of tokens

In [5]:
registry_path = "1_registry"

n_tokens_dict = {}
for dataset_id, dataset_class in DATASETS_DICT.items():
        
    # print(dataset_id)
    dataset = dataset_class()
    
    if hasattr(dataset, "subtasks") and dataset.subtasks is not None:
        subtasks = dataset.subtasks
        if not isinstance(subtasks, list):
            subtasks = [subtasks]
        
        filepath_list  = [dataset._get_samples_path(registry_path,subtask=subtask) for subtask in subtasks]
        # print(filepath_list)
    else:
        filepath_list = [dataset._get_samples_path(registry_path)]
        # print(filepath_list)
        
    # load the dataframe
    for filepath in filepath_list:
        print(filepath)
        original_df = pd.read_json(filepath, lines=True)
        original_df["n_tokens"] = original_df["input"].apply(
    lambda x: compute_n_tokens("System: " + x[0]["content"] + "\nUser: " + x[1]["content"] + "\nAnswer: ", tokenizer))
        print(filepath, len(original_df))
        n_tokens_dict[filepath] = original_df["n_tokens"].sum()
total_n_tokens = sum(n_tokens_dict.values())

1_registry/data/space_nli/samples.jsonl
1_registry/data/space_nli/samples.jsonl 1600
1_registry/data/anli/samples.jsonl
1_registry/data/anli/samples.jsonl 3196
1_registry/data/copa/samples.jsonl
1_registry/data/copa/samples.jsonl 496
1_registry/data/alpha_nli/samples.jsonl
1_registry/data/alpha_nli/samples.jsonl 1528
1_registry/data/wanli/samples.jsonl
1_registry/data/wanli/samples.jsonl 4996
1_registry/data/babi_task_16/samples.jsonl
1_registry/data/babi_task_16/samples.jsonl 4996
1_registry/data/ropes/samples.jsonl
1_registry/data/ropes/samples.jsonl 1684
1_registry/data/cosmos_qa/samples.jsonl
1_registry/data/cosmos_qa/samples.jsonl 2981
1_registry/data/formal_fallacies_syllogisms_negation/samples.jsonl
1_registry/data/formal_fallacies_syllogisms_negation/samples.jsonl 14196
1_registry/data/logical_args/samples.jsonl
1_registry/data/logical_args/samples.jsonl 28
1_registry/data/crass_ai/samples.jsonl
1_registry/data/crass_ai/samples.jsonl 40
1_registry/data/geometric_shapes/samples.

Token indices sequence length is longer than the specified maximum sequence length for this model (1131 > 1024). Running this sequence through the model will result in indexing errors


1_registry/data/abstract_narrative_understanding/9_distractors/samples.jsonl 996
1_registry/data/abstract_narrative_understanding/99_distractors/samples.jsonl
1_registry/data/abstract_narrative_understanding/99_distractors/samples.jsonl 996
1_registry/data/cause_and_effect/one_sentence/samples.jsonl
1_registry/data/cause_and_effect/one_sentence/samples.jsonl 47
1_registry/data/cause_and_effect/one_sentence_no_prompt/samples.jsonl
1_registry/data/cause_and_effect/one_sentence_no_prompt/samples.jsonl 47
1_registry/data/cause_and_effect/two_sentences/samples.jsonl
1_registry/data/cause_and_effect/two_sentences/samples.jsonl 47
1_registry/data/goal_step_wikihow/goal_inference/samples.jsonl
1_registry/data/goal_step_wikihow/goal_inference/samples.jsonl 1699
1_registry/data/goal_step_wikihow/step_inference/samples.jsonl
1_registry/data/goal_step_wikihow/step_inference/samples.jsonl 2246
1_registry/data/goal_step_wikihow/step_ordering/samples.jsonl
1_registry/data/goal_step_wikihow/step_order

## Compute the cost per model

In [6]:
models_to_evaluate = ["gpt-4-0125-preview", "gpt-4-1106-preview", "gpt-4-0613", "gpt-4-0314",
"gpt-3.5-turbo-0125", "gpt-3.5-turbo-1106", "gpt-3.5-turbo-0613", "gpt-3.5-turbo-0301",]
models_to_evaluate =  ["gpt-4o-2024-08-06",
    "gpt-4o-2024-05-13",
    "gpt-4o-mini-2024-07-18",
    "gpt-4-turbo-2024-04-09",]

In [7]:
total_cost = 0
for model_id in models_to_evaluate:
    print(f"Cost for {model_id}: {compute_cost(model_id, total_n_tokens):.2f}")
    total_cost += compute_cost(model_id, total_n_tokens)
print(f"Total cost: {total_cost:.2f}")

Cost for gpt-4o-2024-08-06: 19.07
Cost for gpt-4o-2024-05-13: 38.14
Cost for gpt-4o-mini-2024-07-18: 1.14
Cost for gpt-4-turbo-2024-04-09: 76.28
Total cost: 134.63


In [8]:
# convert n_tokens_dict to a dataframe
n_tokens_df = pd.DataFrame.from_dict(n_tokens_dict, orient="index", columns=["n_tokens"])

In [9]:
# add the cost
for model_id in models_to_evaluate:
    n_tokens_df[f"cost_{model_id}"] = n_tokens_df["n_tokens"].apply(lambda x: compute_cost(model_id, x))
n_tokens_df["total_cost"] = n_tokens_df[[f"cost_{model_id}" for model_id in models_to_evaluate]].sum(axis=1)

In [10]:
# add the number of samples in each row
for filepath in n_tokens_df.index:
    original_df = pd.read_json(filepath, lines=True)
    n_tokens_df.loc[filepath, "n_samples"] = len(original_df)

In [11]:
n_tokens_df = n_tokens_df.sort_values(by="total_cost", ascending=True)

In [12]:
n_tokens_df.head()

Unnamed: 0,n_tokens,cost_gpt-4o-2024-08-06,cost_gpt-4o-2024-05-13,cost_gpt-4o-mini-2024-07-18,cost_gpt-4-turbo-2024-04-09,total_cost,n_samples
1_registry/data/arithmetic/1_digit_division/samples.jsonl,703,0.001758,0.003515,0.000105,0.00703,0.012408,19.0
1_registry/data/cause_and_effect/one_sentence_no_prompt/samples.jsonl,1457,0.003643,0.007285,0.000219,0.01457,0.025716,47.0
1_registry/data/odd_one_out/samples.jsonl,2527,0.006318,0.012635,0.000379,0.02527,0.044602,82.0
1_registry/data/cause_and_effect/two_sentences/samples.jsonl,3015,0.007538,0.015075,0.000452,0.03015,0.053215,47.0
1_registry/data/cause_and_effect/one_sentence/samples.jsonl,3349,0.008373,0.016745,0.000502,0.03349,0.05911,47.0


In [13]:
n_tokens_df[["total_cost"]]

Unnamed: 0,total_cost
1_registry/data/arithmetic/1_digit_division/samples.jsonl,0.012408
1_registry/data/cause_and_effect/one_sentence_no_prompt/samples.jsonl,0.025716
1_registry/data/odd_one_out/samples.jsonl,0.044602
1_registry/data/cause_and_effect/two_sentences/samples.jsonl,0.053215
1_registry/data/cause_and_effect/one_sentence/samples.jsonl,0.05911
1_registry/data/arithmetic/1_digit_subtraction/samples.jsonl,0.060998
1_registry/data/arithmetic/1_digit_addition/samples.jsonl,0.060998
1_registry/data/arithmetic/1_digit_multiplication/samples.jsonl,0.060998
1_registry/data/crass_ai/samples.jsonl,0.069982
1_registry/data/logical_args/samples.jsonl,0.102935


cumsum

In [14]:
n_tokens_df["cumsum_cost_instruct"] = n_tokens_df["total_cost"].cumsum()
n_tokens_df["cumsum_n_samples"] = n_tokens_df["n_samples"].cumsum()

In [15]:
n_tokens_df[["cumsum_cost_instruct", "total_cost", "cumsum_n_samples", "n_samples"]]

Unnamed: 0,cumsum_cost_instruct,total_cost,cumsum_n_samples,n_samples
1_registry/data/arithmetic/1_digit_division/samples.jsonl,0.012408,0.012408,19.0,19.0
1_registry/data/cause_and_effect/one_sentence_no_prompt/samples.jsonl,0.038124,0.025716,66.0,47.0
1_registry/data/odd_one_out/samples.jsonl,0.082726,0.044602,148.0,82.0
1_registry/data/cause_and_effect/two_sentences/samples.jsonl,0.13594,0.053215,195.0,47.0
1_registry/data/cause_and_effect/one_sentence/samples.jsonl,0.19505,0.05911,242.0,47.0
1_registry/data/arithmetic/1_digit_subtraction/samples.jsonl,0.256049,0.060998,338.0,96.0
1_registry/data/arithmetic/1_digit_addition/samples.jsonl,0.317047,0.060998,434.0,96.0
1_registry/data/arithmetic/1_digit_multiplication/samples.jsonl,0.378045,0.060998,530.0,96.0
1_registry/data/crass_ai/samples.jsonl,0.448028,0.069982,570.0,40.0
1_registry/data/logical_args/samples.jsonl,0.550962,0.102935,598.0,28.0


# Compute the cost by keeping only a subset of samples for some dataset

For some datasets, if I run on all samples, the cost is too high -> reduce the cost by keeping only 1000 samples for some of them


In [16]:
evals_cheap = ['odd_one_out',
 'crass_ai',
 'logical_args',
 'emoji_movie',
 'fantasy_reasoning',
 'metaphor_boolean',
 'geometric_shapes',
 'space_nli',
 'copa',
 'anli',
 'cosmos_qa',
 'ropes',
 'arithmetic']

In [17]:
len(evals_cheap)

13

In [18]:
# extract all other evals that are not in evals_cheap
evals_expensive = list(set(DATASETS_DICT.keys()) - set(evals_cheap))

In [19]:
evals_expensive

['babi_task_16',
 'wanli',
 'goal_step_wikihow',
 'formal_fallacies_syllogisms_negation',
 'abstract_narrative_understanding',
 'alpha_nli',
 'cause_and_effect']

## Cost computation

In [20]:
# copy n_tokens_df and redefine the index with .split("data/")[1].split("/samples")[0].replace("/", "_") 
n_tokens_df_new_index = n_tokens_df.copy()
n_tokens_df_new_index.index = [i.split("data/")[1].split("/samples")[0].replace("/", "_") for i in n_tokens_df.index]

In [21]:
n_tokens_df_new_index

Unnamed: 0,n_tokens,cost_gpt-4o-2024-08-06,cost_gpt-4o-2024-05-13,cost_gpt-4o-mini-2024-07-18,cost_gpt-4-turbo-2024-04-09,total_cost,n_samples,cumsum_cost_instruct,cumsum_n_samples
arithmetic_1_digit_division,703,0.001758,0.003515,0.000105,0.00703,0.012408,19.0,0.012408,19.0
cause_and_effect_one_sentence_no_prompt,1457,0.003643,0.007285,0.000219,0.01457,0.025716,47.0,0.038124,66.0
odd_one_out,2527,0.006318,0.012635,0.000379,0.02527,0.044602,82.0,0.082726,148.0
cause_and_effect_two_sentences,3015,0.007538,0.015075,0.000452,0.03015,0.053215,47.0,0.13594,195.0
cause_and_effect_one_sentence,3349,0.008373,0.016745,0.000502,0.03349,0.05911,47.0,0.19505,242.0
arithmetic_1_digit_subtraction,3456,0.00864,0.01728,0.000518,0.03456,0.060998,96.0,0.256049,338.0
arithmetic_1_digit_addition,3456,0.00864,0.01728,0.000518,0.03456,0.060998,96.0,0.317047,434.0
arithmetic_1_digit_multiplication,3456,0.00864,0.01728,0.000518,0.03456,0.060998,96.0,0.378045,530.0
crass_ai,3965,0.009913,0.019825,0.000595,0.03965,0.069982,40.0,0.448028,570.0
logical_args,5832,0.01458,0.02916,0.000875,0.05832,0.102935,28.0,0.550962,598.0


In [22]:
# compute the total cost with 1000 samples
n_tokens_df_new_index["total_cost_1000"] = n_tokens_df_new_index.apply(lambda x: x["total_cost"]/x["n_samples"] * 1000, axis=1)

In [23]:
# total cost of the cheap evals on instruct and base models
# loop over rows and check if index startswith a cheapeval
# total cost of the cheap evals on instruct and base models
total_cost_cheap = 0
for index, row in n_tokens_df_new_index.iterrows():
    if any(index.startswith(eval) for eval in evals_cheap):
        total_cost_cheap += row["total_cost"]

print(f"Total cost of cheap evals on instruct and base models: {total_cost_cheap:.2f}")

Total cost of cheap evals on instruct and base models: 39.20


In [24]:
# total cost of the expensive evals on instruct and base models
# loop over rows and check if index startswith a expensiveeval
# total cost of the expensive evals on instruct and base models
total_cost_expensive = 0
for index, row in n_tokens_df_new_index.iterrows():
    if any(index.startswith(eval) for eval in evals_expensive):
        total_cost_expensive += row["total_cost_1000"]

print(f"Total cost of expensive evals on instruct and base models: {total_cost_expensive:.2f}")

Total cost of expensive evals on instruct and base models: 43.60


In [25]:
grand_total = total_cost_cheap  + total_cost_expensive
print(f"Grand total: {grand_total:.2f}")

Grand total: 82.80
