# Cost Analysis: API Usage for Modismos Dataset

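This notebook calculates the total cost in coins for processing the modismos dataset using all available models across three different prompts. Note that the number of unique modismos is currently fixed at 20 in the configuration cell, so the figures below describe a 20-modismo sample rather than the complete dataset.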

## Configuration and Data Loading

In [9]:
import json
import csv
import pandas as pd

# File paths
MODELS_FILE = 'Straico/text_only_models.json'
DATASET_FILE = 'modismos_Dataset_Final.csv'

# Number of modismos used for the cost estimate. The estimate is currently
# pinned to a 20-modismo sample; set this to None to use the number of unique
# modismos counted in the dataset instead.
N_MODISMOS_OVERRIDE = 20

# Load models data
with open(MODELS_FILE, 'r', encoding='utf-8') as f:
    models_data = json.load(f)

# Load dataset and count unique modismos (whitespace-trimmed, case-insensitive)
modismos_unicos = set()
# newline='' is what the csv module expects, so quoted fields containing line
# breaks are parsed correctly.
with open(DATASET_FILE, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter=';')
    for row in reader:
        # DictReader fills the missing columns of a short row with None, so
        # the value must be guarded before calling .strip().
        modismo = (row.get('modismo') or '').strip()
        if modismo:
            modismos_unicos.add(modismo.casefold())

# Keep the dataset count available even when the override is active.
n_modismos_dataset = len(modismos_unicos)
n_modismos_unicos = (
    n_modismos_dataset if N_MODISMOS_OVERRIDE is None else N_MODISMOS_OVERRIDE
)
n_models = len(models_data['text_models'])

print(f"Total unique modismos: {n_modismos_unicos:,}")
print(f"Total models available: {n_models}")

Total unique modismos: 20
Total models available: 67


## Prompt Specifications

Definition of input and output word counts for each prompt type.

In [10]:
# Word budgets for each prompt type, together with the number of API calls it
# requires. Every prompt is issued once per unique modismo.
PROMPTS = {
    'Prompt 1': dict(
        description='Modismo → Definición',
        input_words=138,
        output_words=22,
        n_calls=n_modismos_unicos,
    ),
    'Prompt 2': dict(
        description='Modismo → Es Modismo (Sí/No)',
        input_words=113,
        output_words=21,
        n_calls=n_modismos_unicos,
    ),
    'Prompt 3': dict(
        description='Modismo + Ejemplo → Literal + Definición',
        input_words=227,
        output_words=52,
        n_calls=n_modismos_unicos,
    ),
}

# Flatten the specifications into one table row per prompt for display.
prompt_rows = []
for prompt_label, prompt_spec in PROMPTS.items():
    words_in = prompt_spec['input_words']
    words_out = prompt_spec['output_words']
    prompt_rows.append({
        'Prompt': prompt_label,
        'Description': prompt_spec['description'],
        'Input (words)': words_in,
        'Output (words)': words_out,
        'Total words': words_in + words_out,
        # Pre-formatted with a thousands separator, so this column holds strings.
        'API Calls': f"{prompt_spec['n_calls']:,}",
    })
prompt_df = pd.DataFrame(prompt_rows)

print("\nPrompt Specifications:")
print(prompt_df.to_string(index=False))


Prompt Specifications:
  Prompt                              Description  Input (words)  Output (words)  Total words API Calls
Prompt 1                     Modismo → Definición            138              22          160        20
Prompt 2             Modismo → Es Modismo (Sí/No)            113              21          134        20
Prompt 3 Modismo + Ejemplo → Literal + Definición            227              52          279        20


## Cost Calculation Function

In [11]:
def calculate_cost(model, input_words, output_words):
    """
    Return the coin cost of one API call for the given model.

    The model's ``pricing['coins']`` value is treated as a rate per 100 words
    and is applied to the combined input and output word count.

    Args:
        model: Model dictionary; must contain a 'pricing' dictionary with a
            'coins' entry.
        input_words: Number of words sent to the model.
        output_words: Number of words in the model's response.

    Returns:
        Cost of the call in coins.
    """
    rate_per_100_words = model['pricing']['coins']
    hundreds_of_words = (input_words + output_words) / 100.0
    return hundreds_of_words * rate_per_100_words

## Total Cost Calculation per Model and Prompt

In [12]:
# Build one summary row per model: its rate, the total for each prompt, and
# the total across all prompts.
cost_analysis = []

for model in models_data['text_models']:
    summary_row = {
        'Model': model['name'],
        'Model ID': model['model'],
        'Coins/100 words': model['pricing']['coins'],
    }

    # Accumulated in prompt order so the floating-point result is reproducible.
    running_total = 0
    for prompt_name, prompt_info in PROMPTS.items():
        per_call = calculate_cost(
            model, prompt_info['input_words'], prompt_info['output_words']
        )
        prompt_total = per_call * prompt_info['n_calls']
        summary_row[f"{prompt_name} (total)"] = prompt_total
        running_total += prompt_total

    summary_row['Total Cost'] = running_total
    cost_analysis.append(summary_row)

# One row per model, cheapest first.
cost_df = pd.DataFrame(cost_analysis).sort_values('Total Cost', ascending=True)

separator = "=" * 80
print("Cost Analysis Summary:")
print(separator)
print(cost_df.to_string(index=False, float_format="{:,.2f}".format))
print(separator)

Cost Analysis Summary:
                                                Model                                   Model ID  Coins/100 words  Prompt 1 (total)  Prompt 2 (total)  Prompt 3 (total)  Total Cost
                               Amazon: Nova Micro 1.0                       amazon/nova-micro-v1             0.10              3.20              2.68              5.58       11.46
                                     Microsoft: Phi 4                            microsoft/phi-4             0.10              3.20              2.68              5.58       11.46
                                Amazon: Nova Lite 1.0                        amazon/nova-lite-v1             0.20              6.40              5.36             11.16       22.92
                          Cohere: Command R (08-2024)                   cohere/command-r-08-2024             0.20              6.40              5.36             11.16       22.92
                           Google: Gemini Flash 2.08B                google/g

In [13]:
# Add up the per-model totals: the cost of running every model on every prompt.
total_cost = cost_df['Total Cost'].sum()
print("\nGrand Total Cost:", f"{total_cost:,.2f} coins", sep="\n")


Grand Total Cost:
28,262.65 coins


In [None]:
# Accumulate model costs in cost_df row order until the running total exceeds
# the threshold. The threshold is defined once so that the loop condition and
# the printed message cannot disagree.
COST_THRESHOLD = 1_000  # coins

incremental_sum = 0
models_until_threshold = []

# Only the two columns that are needed are iterated; this avoids building a
# Series for every row as iterrows() does.
for threshold_model, threshold_cost in zip(cost_df['Model'], cost_df['Total Cost']):
    incremental_sum += threshold_cost
    models_until_threshold.append(threshold_model)
    # The model that pushes the total over the threshold is included in both
    # the list and the cumulative total.
    if incremental_sum > COST_THRESHOLD:
        break

print(f"\nModels contributing to Total Cost > {COST_THRESHOLD:,} coins:")
print(models_until_threshold)
print(f"\nCumulative Total: {incremental_sum:,.2f} coins")


Models contributing to Total Cost > 1,000,000 coins:
['Amazon: Nova Micro 1.0', 'Microsoft: Phi 4', 'Amazon: Nova Lite 1.0', 'Cohere: Command R (08-2024)', 'Google: Gemini Flash 2.08B', 'Mistral: Codestral Mamba', 'Qwen2-VL 72B Instruct', 'Qwen2.5 72B Instruct', 'Google: Gemma 2 27B', 'Qwen 2 72B Instruct', 'OpenAI: GPT-5 Nano', 'NVIDIA: Llama 3.3 Nemotron Super 49B v1', 'OpenAI: GPT-4.1 Nano', 'MiniMax: MiniMax M2', 'Meta: Llama 3.3 70B Instruct', 'Google: Gemini 2.5 Flash Lite', 'NVIDIA: Llama 3.3 Nemotron Super 49B V1.5 (Reasoning)', 'Qwen: Qwen3 235B A22B Reasoning', 'WizardLM-2 8x22B', 'Meta: Llama 3.1 70B Instruct', 'Meta: Llama 4 Maverick']

Cumulative Total: 962.64 coins
