# Cost Analysis: API Usage for Modismos Dataset

This notebook calculates the total cost in coins for processing the complete modismos dataset using all available models across three different prompts.

## Configuration and Data Loading

In [1]:
import json
import csv
import pandas as pd

# File paths
MODELS_FILE = 'Straico/text_only_models.json'
DATASET_FILE = 'modismos_Dataset_Final.csv'

# Load models data
with open(MODELS_FILE, 'r', encoding='utf-8') as f:
    models_data = json.load(f)

# Load dataset and count unique modismos (compared case-insensitively).
# newline='' is what the csv module expects from the file object; without it,
# line breaks embedded inside quoted fields can be mis-parsed.
modismos_unicos = set()
with open(DATASET_FILE, 'r', encoding='utf-8', newline='') as f:
    reader = csv.DictReader(f, delimiter=';')
    for row in reader:
        # DictReader fills columns missing from a short row with None, so
        # `or ''` is needed in addition to the lookup itself.
        modismo = (row.get('modismo') or '').strip()
        if modismo:
            modismos_unicos.add(modismo.casefold())

n_modismos_unicos = len(modismos_unicos)
n_models = len(models_data['text_models'])

print(f"Total unique modismos: {n_modismos_unicos:,}")
print(f"Total models available: {n_models}")

Total unique modismos: 6,531
Total models available: 67


## Prompt Specifications

Definition of input and output word counts for each prompt type.

In [2]:
# Word budgets for each prompt type. Every prompt is issued once per unique
# modismo, so all three share the same call count.
PROMPTS = {
    'Prompt 1': dict(
        description='Modismo → Definición',
        input_words=138,
        output_words=22,
        n_calls=n_modismos_unicos,
    ),
    'Prompt 2': dict(
        description='Modismo → Es Modismo (Sí/No)',
        input_words=113,
        output_words=21,
        n_calls=n_modismos_unicos,
    ),
    'Prompt 3': dict(
        description='Modismo + Ejemplo → Literal + Definición',
        input_words=227,
        output_words=52,
        n_calls=n_modismos_unicos,
    ),
}


def _prompt_row(name, info):
    """Return the display record (one table row) for a single prompt spec."""
    words_in = info['input_words']
    words_out = info['output_words']
    return {
        'Prompt': name,
        'Description': info['description'],
        'Input (words)': words_in,
        'Output (words)': words_out,
        'Total words': words_in + words_out,
        # Pre-formatted with thousands separators, so this column holds text.
        'API Calls': f"{info['n_calls']:,}",
    }


# Tabulate the specifications for a quick visual check.
prompt_df = pd.DataFrame([_prompt_row(name, info) for name, info in PROMPTS.items()])

print("\nPrompt Specifications:")
print(prompt_df.to_string(index=False))


Prompt Specifications:
  Prompt                              Description  Input (words)  Output (words)  Total words API Calls
Prompt 1                     Modismo → Definición            138              22          160     6,531
Prompt 2             Modismo → Es Modismo (Sí/No)            113              21          134     6,531
Prompt 3 Modismo + Ejemplo → Literal + Definición            227              52          279     6,531


## Cost Calculation Function

In [3]:
def calculate_cost(model, input_words, output_words):
    """
    Calculate the cost in coins for a single API call.

    The price is read from ``model['pricing']``: ``coins`` is charged for
    every block of ``words`` words. When the pricing entry has no ``words``
    field (or it is zero/empty), a block size of 100 words is assumed, which
    is the rate the rest of this notebook reports ("Coins/100 words").

    Args:
        model: Model dictionary with a 'pricing' mapping that contains
            'coins' and, optionally, 'words'
        input_words: Number of input words
        output_words: Number of output words

    Returns:
        Total cost in coins for the call (input and output words are billed
        at the same rate)

    Raises:
        KeyError: If 'pricing' or 'coins' is missing from the model entry
    """
    pricing = model['pricing']
    coins_per_block = pricing['coins']
    # NOTE(review): the block size is taken from the pricing entry when it is
    # present instead of being hard-coded; confirm the models JSON uses the
    # key 'words' for it.
    words_per_block = pricing.get('words') or 100

    total_words = input_words + output_words
    return (total_words / words_per_block) * coins_per_block

## Total Cost Calculation per Model and Prompt

In [4]:
# Build one record per model: its rate, the total for each prompt
# (cost per call x number of calls) and the sum over all prompts.
cost_analysis = []

for model in models_data['text_models']:
    record = {
        'Model': model['name'],
        'Model ID': model['model'],
        'Coins/100 words': model['pricing']['coins'],
    }

    running_total = 0
    for prompt_name, spec in PROMPTS.items():
        per_call = calculate_cost(model, spec['input_words'], spec['output_words'])
        prompt_total = per_call * spec['n_calls']
        record[f"{prompt_name} (total)"] = prompt_total
        running_total += prompt_total

    record['Total Cost'] = running_total
    cost_analysis.append(record)

# Tabulate, cheapest model first.
cost_df = pd.DataFrame(cost_analysis).sort_values('Total Cost', ascending=True)

rule = "=" * 80
print("Cost Analysis Summary:")
print(rule)
print(cost_df.to_string(index=False, float_format="{:,.2f}".format))
print(rule)

# Keep a spreadsheet copy of the table next to the notebook.
cost_df.to_excel('Straico_Cost_Analysis.xlsx', index=False)

Cost Analysis Summary:
                                                Model                                   Model ID  Coins/100 words  Prompt 1 (total)  Prompt 2 (total)  Prompt 3 (total)   Total Cost
                               Amazon: Nova Micro 1.0                       amazon/nova-micro-v1             0.10          1,044.96            875.15          1,822.15     3,742.26
                                     Microsoft: Phi 4                            microsoft/phi-4             0.10          1,044.96            875.15          1,822.15     3,742.26
                                Amazon: Nova Lite 1.0                        amazon/nova-lite-v1             0.20          2,089.92          1,750.31          3,644.30     7,484.53
                          Cohere: Command R (08-2024)                   cohere/command-r-08-2024             0.20          2,089.92          1,750.31          3,644.30     7,484.53
                           Google: Gemini Flash 2.08B                goo

In [5]:
# Sum the per-model totals: the price of running every prompt on every model.
total_cost = cost_df['Total Cost'].sum()
print("\nGrand Total Cost:", f"{total_cost:,.2f} coins", sep="\n")


Grand Total Cost:
9,229,169.01 coins


## Dataset Filtering: Remove Duplicates and Empty Examples (`ejemplo`)

In [6]:
# Read the raw, semicolon-separated dataset into a DataFrame.
df = pd.read_csv(DATASET_FILE, sep=';', encoding='utf-8')

# Report its size and schema, then preview the first records.
print(
    f"Original dataset: {len(df):,} rows",
    f"\nColumns: {list(df.columns)}",
    f"\nFirst few rows:",
    df.head(),
    sep="\n",
)

Original dataset: 10,195 rows

Columns: ['modismo', 'significado', 'ejemplo', 'fuente']

First few rows:
    modismo                                        significado  \
0   abajeño  Propio o nativo de las costas o de las tierras...   
1   abalear  Disparar a alguien o a algo de manera repetida...   
2   abalear  Herir o matar a alguien con disparos de un arm...   
3    abaleo  Situación en la que hay disparos repetidos que...   
4  abanicar  En el beisbol, fallar el bateador al no tocar ...   

                                             ejemplo fuente  
0  Empezaron cultivos de tabaco y a elaborar un d...  DICOL  
1  El funcionario resultó ileso a pesar de que el...  DICOL  
2  Limpiaba la hojarasca cuando hombres armados l...  DICOL  
3  Por la calle doce se oía un tremendo abaleo y ...  DICOL  
4  El bateador se ponchó abanicando el tercer lan...  DICOL  


In [7]:
# Step 1: Remove duplicates based on the 'modismo' column (case-insensitive).
# The comparison key is kept in its own Series instead of being written into
# `df`, so the loaded frame is not modified by this cell. casefold() is the
# same normalisation used when the unique modismos were counted from the CSV.
modismo_key = df['modismo'].str.strip().str.casefold()

# Keep the first occurrence of every key.
# NOTE(review): "first" is chosen by row order only, so the surviving row may
# have an empty 'ejemplo' even when a later duplicate has one; the next step
# then drops that modismo entirely. Confirm this is intended.
df_no_duplicates = df[~modismo_key.duplicated(keep='first')]

print(f"\nAfter removing duplicates: {len(df_no_duplicates):,} rows")
print(f"Duplicates removed: {len(df) - len(df_no_duplicates):,} rows")


After removing duplicates: 6,531 rows
Duplicates removed: 3,664 rows


In [8]:
# Step 2: Remove rows without an example sentence ('ejemplo').
# A row is kept only when 'ejemplo' is present and is not blank after
# stripping surrounding whitespace.
ejemplo_col = df_no_duplicates['ejemplo']
has_ejemplo = ejemplo_col.notna() & ejemplo_col.str.strip().ne('')
df_filtered = df_no_duplicates.loc[has_ejemplo].copy()

# Summarise what this step and the whole filtering pipeline removed.
print(f"\nAfter removing empty ejemplo: {len(df_filtered):,} rows")
print(f"Empty ejemplo removed: {len(df_no_duplicates) - len(df_filtered):,} rows")
print(f"\nTotal rows removed: {len(df) - len(df_filtered):,}")
print(f"Final dataset: {len(df_filtered):,} rows ({len(df_filtered)/len(df)*100:.1f}% of original)")


After removing empty ejemplo: 4,616 rows
Empty ejemplo removed: 1,915 rows

Total rows removed: 5,579
Final dataset: 4,616 rows (45.3% of original)


In [9]:
# Write the filtered rows to disk, semicolon-separated and UTF-8 encoded,
# without the DataFrame index.
df_filtered.to_csv(
    'modismos_Dataset_Cleaned.csv',
    sep=';',
    encoding='utf-8',
    index=False,
)