In [1]:
from pathlib import Path
from tqdm.notebook import tqdm
import json, re
import math
import pandas as pd

In [2]:
# Check CSAS documents to get an idea of average doc length

parsed_documents_folder = "../Data/ParsedPublications/"

def create_word_counts(folder):
    files = list(Path(folder).rglob('*.json'))
    rows = []
    for file_path in tqdm(files, desc="Processing documents"):
        with open(file_path, encoding='utf-8') as f:
            data = json.load(f)
        name = data['name']
        number_words = len(re.findall(r'\b\w+\b', data.get('text', '')))
        rows.append((name, number_words))
    return pd.DataFrame(rows, columns=['name', 'number_words'])

df = create_word_counts(parsed_documents_folder)

Processing documents:   0%|          | 0/12752 [00:00<?, ?it/s]

In [25]:
percentiles = [0.01, 0.1, 0.5, 0.9, 0.99]
percentile_names = ["mean", "1%", "10%", "50%", "90%", "99%"]

print(percentile_names)
print(
    [int(round(x, -3)) for x in df.describe(percentiles=percentiles).loc[percentile_names, 'number_words'].to_list()]
)

['mean', '1%', '10%', '50%', '90%', '99%']
[12000, 1000, 3000, 8000, 25000, 67000]


In [27]:
percentiles = [0.1, 0.9]
percentile_names = ["10%", "mean", "90%"]
word_count_list = [int(round(x, -3)) for x in df.describe(percentiles=percentiles).loc[percentile_names, 'number_words'].to_list()] 

print(percentile_names)
print(word_count_list)

['10%', 'mean', '90%']
[3000, 12000, 25000]


In [28]:
# Token <-> word: 1 token ≈ 3/4 of a word → tokens ≈ words / 0.75  [OpenAI help]
# https://help.openai.com/en/articles/4936856-what-are-tokens-and-how-to-count-them
TOKENS_PER_WORD = 1/0.75

# Device power under load (RTX 5000 Ada max power 250 W) [NVIDIA product page]
# https://www.nvidia.com/en-us/products/workstations/rtx-5000/
DEVICE_POWER_WATTS = 250.0

# Model families (for context/size; smallest is Opus-MT among listed)
# mBART-50 ~610M params [HF docs]: https://huggingface.co/transformers/v4.11.3/pretrained_models.html
# M2M100_418M = 418M params [HF card]: https://huggingface.co/facebook/m2m100_418M
# Opus-MT is MarianMT (6e/6d “base” style; size << mBART-50/M2M100) [HF Marian docs]
# https://huggingface.co/docs/transformers/en/model_doc/marian

# Throughput (tokens/sec) — engineering assumptions; replace with your measurements.
# No canonical source; depends on batch size, decoding params, kernels, driver, etc.
TPS = {
    # Opus-MT is smallest here; start with a conservative high throughput.
    "opus": 1500.0,      # assumption: adjust if you measure differently
    "m2m100_418m": 600.0,  # assumption
    "mbart50_mmt": 400.0,  # assumption
}

# Overhead for dict lookups + embedding similarity + comparison in “all models” scenario
# (tiny vs GPU decode; add 5–10% buffer). Choose midpoint = 7.5%. [engineering assumption]
OVERHEAD_FRACTION = 0.075


In [44]:
def words_to_tokens(words):
    return math.ceil(words * TOKENS_PER_WORD)

def energy_time_for_model(tokens, tps, power_watts=DEVICE_POWER_WATTS):
    t_sec = tokens / tps
    e_Wh = power_watts * (t_sec / 3600.0)
    return e_Wh, t_sec

def scenario_smallest_only(words, tps_opus=TPS["opus"]):
    tokens = words_to_tokens(words)
    return energy_time_for_model(tokens, tps_opus)

def scenario_all_models(words, tps=TPS, overhead=OVERHEAD_FRACTION):
    tokens = words_to_tokens(words)
    # 12 models per direction: 4 Opus variants, 4 M2M100 variants, 4 mBART-50 variants
    groups = {
        "opus": 4,
        "m2m100_418m": 4,
        "mbart50_mmt": 4,
    }
    total_e_Wh = 0.0
    total_t_sec = 0.0
    for k, n in groups.items():
        e, t = energy_time_for_model(tokens, tps[k])
        total_e_Wh += n * e
        total_t_sec += n * t
    total_e_Wh *= (1.0 + overhead)
    total_t_sec *= (1.0 + overhead)
    return total_e_Wh, total_t_sec

def summarize(words_list=(1000, 10000)):
    cost_per_kWh = 0.10
    cost_per_Wh = cost_per_kWh / 1000.0

    rows = []
    for words in words_list:
        # scenario 1: smallest only (Opus)
        e1, t1 = scenario_smallest_only(words)
        rows.append({
            "words": words,
            "scenario": "smallest_only (Opus-MT)",
            "time_sec": round(t1, 1),
            "energy_Wh": round(e1, 2),
            "cost_$": round(e1 * cost_per_Wh, 5),
        })

        # scenario 2: all models
        e2, t2 = scenario_all_models(words)
        rows.append({
            "words": words,
            "scenario": "all_models + best_similarity (12 models)",
            "time_sec": round(t2, 1),
            "energy_Wh": round(e2, 2),
            "cost_$": round(e2 * cost_per_Wh, 5),
        })

    return pd.DataFrame(rows)

In [45]:
df = summarize(word_count_list)
display(df)

Unnamed: 0,words,scenario,time_sec,energy_Wh,cost_$
0,3000,smallest_only (Opus-MT),2.7,0.19,2e-05
1,3000,all_models + best_similarity (12 models),83.1,5.77,0.00058
2,12000,smallest_only (Opus-MT),10.7,0.74,7e-05
3,12000,all_models + best_similarity (12 models),332.5,23.09,0.00231
4,25000,smallest_only (Opus-MT),22.2,1.54,0.00015
5,25000,all_models + best_similarity (12 models),692.8,48.11,0.00481


In [58]:
# # If we have terrible hardware or an incorrectly configured pipeline, what is the worst-case energy consumption?
# 
# Worst-case multipliers
# 
# Hardware
#   Switch to CPU-only (very old Xeon, no AVX2): 100–200x slower vs GPU.
#   Or use an ancient GPU (Kepler K20, ~3–4 TFLOPS, 225 W): ~50x slower vs Ada. 
#   (CPU-only is actually worse here.)
# 
# Precision
#  Force FP32 instead of fp16/int8: ~2–3x slower.
# 
# Decoding
#   Large beam search (width 20+): ~10x slowdown vs greedy.
#   Enable nucleus sampling with small cutoff, long tail: ~2x.
#   Disable batching (one sentence at a time): ~2x.
# 
# Facility
#   Poor datacenter PUE (Power Usage Effectiveness) = 2.0 (vs efficient 1.2): ~1.7× overhead.
# 
# Total worst-case multiplier ≈ 150 * 2.5 * 20 * 2 * 2 * 2 ≈ 60,000x
# 
# Most realistic worst-case is using a CPU only ≈ 150x

n_csas_documents_low = 50
n_csas_documents_normal = 130 
n_csas_documents_high = 200

print(f"Yearly CSAS Translation Costs for Translating between {n_csas_documents_low} and {n_csas_documents_high} documents")

print("\nAssuming Properly Configured GPU and Pipeline")
print(f"Best case costs = ${n_csas_documents_low * 0.00007:.2f}")
print(f"Worst case costs = ${n_csas_documents_high * 0.00231:.2f}")
print(f"Best guess costs = ${n_csas_documents_low * (0.00007 + 0.00231) / 2:.2f}")

print("\nAssuming Poorly Configured Computer (CPU only)")
print(f"Best case costs = ${150 * n_csas_documents_low * 0.00007:.2f}")
print(f"Worst case costs = ${150 * n_csas_documents_high * 0.00231:.2f}")
print(f"Best guess costs = ${150 * n_csas_documents_low * (0.00007 + 0.00231) / 2:.2f}")

Yearly CSAS Translation Costs for Translating between 50 and 200 documents

Assuming Properly Configured GPU and Pipeline
Best case costs = $0.00
Worst case costs = $0.46
Best guess costs = $0.06

Assuming Poorly Configured Computer (CPU only)
Best case costs = $0.52
Worst case costs = $69.30
Best guess costs = $8.93


In [None]:
# how much does it cost just to leave a computer on? 
# 60–80 W * 24 h/day * 365 ≈ 500–700 kWh/year
#  $50-70/yr
# 
#  this is the dominant cost