In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import json, re
import math
import pandas as pd

In [2]:
# Check CSAS documents to get an idea of average doc length

parsed_documents_folder = "../Data/ParsedPublications/"

def create_word_counts(folder):
    files = list(Path(folder).rglob('*.json'))
    rows = []
    for file_path in tqdm(files, desc="Processing documents"):
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        name = data['name']
        number_words = len(re.findall(r'\b\w+\b', data.get('text', '')))
        rows.append((name, number_words))
    return pd.DataFrame(rows, columns=['name', 'number_words'])

df = create_word_counts(parsed_documents_folder)

Processing documents:   0%|          | 0/12752 [00:00<?, ?it/s]

In [3]:
percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
percentile_names = ["mean", "1%", "10%", "50%", "90%", "99%"]

print(percentile_names)
print(
    [int(round(x, -3)) for x in df.describe(percentiles=percentiles).loc[percentile_names, 'number_words'].to_list()]
)

['mean', '1%', '10%', '50%', '90%', '99%']
[12000, 1000, 3000, 8000, 25000, 67000]


In [4]:
percentiles = [0.1, 0.9]
percentile_names = ["10%", "mean", "90%"]
word_count_list = [int(round(x, -3)) for x in df.describe(percentiles=percentiles).loc[percentile_names, 'number_words'].to_list()] 

print(percentile_names)
print(word_count_list)

['10%', 'mean', '90%']
[3000, 12000, 25000]


In [5]:
# Token <-> word: 1 token ≈ 3/4 of a word → tokens ≈ words / 0.75  [OpenAI help]
# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
TOKENS_PER_WORD = 1/0.75

# Device power under load (RTX 5000 Ada max power 250 W) [NVIDIA product page]
# https://www.nvidia.com/en-us/products/workstations/rtx-5000/
DEVICE_POWER_WATTS = 250.0

# Model families (for context/size; smallest is Opus-MT among listed)
# mBART-50 ~610M params [HF docs]: https://huggingface.co/transformers/v4.11.3/pretrained_models.html
# M2M100_418M = 418M params [HF card]: https://huggingface.co/facebook/m2m100_418M
# Opus-MT is MarianMT (6e/6d "base" style; size << mBART-50/M2M100) [HF Marian docs]
# https://huggingface.co/docs/transformers/en/model_doc/marian

# Throughput (tokens/sec) — engineering assumptions; replace with your measurements.
# No canonical source; depends on batch size, decoding params, kernels, driver, etc.
TPS = {
    # Opus-MT is smallest here; start with a conservative high throughput.
    "opus": 1500.0,      # assumption: adjust if you measure differently
    "m2m100_418m": 600.0,  # assumption
    "mbart50_mmt": 400.0,  # assumption
}

# Overhead for dict lookups + embedding similarity + comparison in "all models" scenario
# (tiny vs GPU decode; add 5–10% buffer). Choose midpoint = 7.5%. [engineering assumption]
OVERHEAD_FRACTION = 0.075

# Scaling factor based on actual vs estimated performance (including retry error loops, ensembling, etc)
REAL_WORLD_SCALING_FACTOR = 1125 / 166

In [6]:
def words_to_tokens(words):
    return math.ceil(words * TOKENS_PER_WORD)

def energy_time_for_model(tokens, tps, power_watts=DEVICE_POWER_WATTS):
    t_sec = tokens / tps
    e_Wh = power_watts * (t_sec / 3600.0)
    return e_Wh, t_sec

def scenario_smallest_only(words, tps_opus=TPS["opus"]):
    tokens = words_to_tokens(words)
    e_Wh, t_sec = energy_time_for_model(tokens, tps_opus)
    t_sec *= REAL_WORLD_SCALING_FACTOR
    e_Wh *= REAL_WORLD_SCALING_FACTOR
    return e_Wh, t_sec

def scenario_all_models(words, tps=TPS, overhead=OVERHEAD_FRACTION):
    tokens = words_to_tokens(words)
    groups = {
        "opus": 2,
        "m2m100_418m": 2,
        "mbart50_mmt": 2,
    }
    total_e_Wh = 0.0
    total_t_sec = 0.0
    for k, n in groups.items():
        e, t = energy_time_for_model(tokens, tps[k])
        total_e_Wh += n * e
        total_t_sec += n * t
    total_e_Wh *= (1.0 + overhead)
    total_t_sec *= (1.0 + overhead)
    total_t_sec *= REAL_WORLD_SCALING_FACTOR
    total_e_Wh *= REAL_WORLD_SCALING_FACTOR
    return total_e_Wh, total_t_sec

def summarize(words_list=(1000, 10000)):
    cost_per_kWh = 0.10
    cost_per_Wh = cost_per_kWh / 1000.0

    rows = []
    for words in words_list:
        # scenario 1: smallest only (Opus)
        e1, t1 = scenario_smallest_only(words)
        rows.append({
            "words": words,
            "scenario": "smallest_only (Opus-MT)",
            "time_sec": round(t1, 1),
            "energy_Wh": round(e1, 2),
            "cost_$": round(e1 * cost_per_Wh, 4),
        })

        # scenario 2: all models
        e2, t2 = scenario_all_models(words)
        rows.append({
            "words": words,
            "scenario": "all_models + best_similarity (6 models)",
            "time_sec": round(t2, 1),
            "energy_Wh": round(e2, 2),
            "cost_$": round(e2 * cost_per_Wh, 4),
        })

    return pd.DataFrame(rows)

In [7]:
df = summarize(word_count_list)

n_docs = 200  # higher than previously published in one year

df['yr_energy_kWh'] = df['energy_Wh'] * n_docs / 1000
df['yr_cost_$'] = round(df['cost_$'] * n_docs, 2)

display(df)

Unnamed: 0,words,scenario,time_sec,energy_Wh,cost_$,yr_energy_kWh,yr_cost_$
0,3000,smallest_only (Opus-MT),18.1,1.26,0.0001,0.252,0.02
1,3000,all_models + best_similarity (6 models),281.7,19.56,0.002,3.912,0.4
2,12000,smallest_only (Opus-MT),72.3,5.02,0.0005,1.004,0.1
3,12000,all_models + best_similarity (6 models),1126.8,78.25,0.0078,15.65,1.56
4,25000,smallest_only (Opus-MT),150.6,10.46,0.001,2.092,0.2
5,25000,all_models + best_similarity (6 models),2347.6,163.03,0.0163,32.606,3.26


In [9]:
# how much does it cost just to leave a computer on? 
# very roughly maybe 60–80 W * 24 h/day * 365 ≈ 500–700 kWh/year
#  $50-70/yr
# 
#  this is the dominant cost, not translations

In [10]:
# how much would it cost to use a few different cloud providers instead of calculating this on our servers?

def energy_cost_provider(n_word_list, n_docs=200):
    rows = []
    
    rates = {
        "aws": 0.12,
        "azure": 0.11,
        "gcp": 0.10,
        "oracle": 0.09,
        "ibm": 0.115,
    }
    
    for n in n_word_list:
        kWh_small, _ = scenario_smallest_only(n)
        kWh_all, _ = scenario_all_models(n)
        kWh_small /= 1000.0
        kWh_all /= 1000.0
        
        for provider, rate in rates.items():
            rows.append({
                "provider": provider,
                "rate_$": rate,
                "n_documents": n_docs,
                "words_per_doc": n,
                "small_model_cost_$": round(n_docs * rate * kWh_small, 2),
                "all_models_cost_$": round(n_docs * rate * kWh_all, 2),
            })
        
    return pd.DataFrame(rows)

In [11]:
energy_cost_provider(word_count_list)

Unnamed: 0,provider,rate_$,n_documents,words_per_doc,small_model_cost_$,all_models_cost_$
0,aws,0.12,200,3000,0.03,0.47
1,azure,0.11,200,3000,0.03,0.43
2,gcp,0.1,200,3000,0.03,0.39
3,oracle,0.09,200,3000,0.02,0.35
4,ibm,0.115,200,3000,0.03,0.45
5,aws,0.12,200,12000,0.12,1.88
6,azure,0.11,200,12000,0.11,1.72
7,gcp,0.1,200,12000,0.1,1.57
8,oracle,0.09,200,12000,0.09,1.41
9,ibm,0.115,200,12000,0.12,1.8
