In [1]:
import itertools
import math
import numpy as np
import pandas as pd

# -----------------------------
# 1) Parameter spaces (inputs)
# -----------------------------
# Categorical axes of the sweep.
GPU_TYPES = [
    "A100",
    "H100",
    "GB200",
]
MODEL_SIZES = [
    "7B",
    "70B",
    "405B",
]

# Numeric axes of the sweep. Batch sizes and sequence lengths are
# consecutive powers of two.
BATCH_SIZES = [2 ** exponent for exponent in range(8, 14)]   # 256 .. 8192
LEARNING_RATES = [1e-5, 5e-5, 1e-4, 5e-4]
SEQ_LENGTHS = [2 ** exponent for exponent in range(9, 13)]   # 512 .. 4096

# Repeat index for each configuration (three runs per combination).
RUN_IDS = list(range(1, 4))

# ----------------------------------
# 2) Constants (speeds, power, etc.)
# ----------------------------------
# Reference throughput in tokens/sec for the anchor configuration:
# A100 GPU, 7B model, batch = 1024, sequence length = 1024.
BASE_TOKENS_PER_SEC_A100_7B = 8.0e5

# Relative speed multiplier per GPU type (larger value = faster).
SPEED_FACTOR = dict(A100=1.0, H100=1.8, GB200=3.0)

# Per-GPU power ceiling in watts (approximate TDP upper bound).
BASE_POWER_W = dict(A100=400.0, H100=700.0, GB200=1000.0)

# Relative compute cost per token; throughput is divided by this factor,
# so a larger model is slower.
MODEL_COMPUTE_FACTOR = {"7B": 1.0, "70B": 5.0, "405B": 20.0}

# Synthetic convergence goal: number of tokens to process per model size.
TARGET_TOKENS = {"7B": 2e9, "70B": 1e10, "405B": 5e10}

# Reference batch size / sequence length against which scaling is measured.
BATCH_REF = SEQ_REF = 1024.0

# Scaling exponents for throughput.
ALPHA = 0.85  # throughput ~ batch^ALPHA (diminishing returns)
GAMMA = 0.50  # throughput ~ seq^-GAMMA  (longer sequences are slower)

# Learning-rate penalty parameters.
LR_OPT = 1e-4  # optimal learning rate for this synthetic setup
LR_C = 0.25    # penalty strength
LR_P = 1.5     # penalty curvature

# Run-to-run noise levels, expressed as the standard deviation of a
# zero-mean normal draw (5% for time, 3% for power).
TIME_NOISE_STD = 0.05
POWER_NOISE_STD = 0.03

# Fixed seed so repeated executions generate the same noise sequence.
rng = np.random.default_rng(seed=42)

# -----------------------------------------
# 3) Helper functions (equations/rules)
# -----------------------------------------
def lr_penalty(lr: float) -> float:
    """Return the token-budget multiplier for training at learning rate ``lr``.

    The multiplier is ``1 + LR_C * d ** LR_P``, where ``d`` is the absolute
    difference between ``log10(lr)`` and ``log10(LR_OPT)``. The result is
    limited to the interval [1.0, 2.0], so the optimal learning rate maps
    to exactly 1.0.
    """
    decades_off = abs(math.log10(lr) - math.log10(LR_OPT))
    multiplier = 1.0 + LR_C * decades_off ** LR_P
    # Clamp into [1.0, 2.0].
    bounded = min(max(multiplier, 1.0), 2.0)
    return float(bounded)

def tokens_per_sec(gpu: str, model: str, batch: int, seq: int) -> float:
    """Return the modelled training throughput in tokens per second.

    Starts from the A100/7B baseline, scales it by the GPU speed factor,
    divides by the model compute factor, then applies the batch-size
    (``ALPHA``) and sequence-length (``-GAMMA``) power laws relative to
    ``BATCH_REF`` and ``SEQ_REF``. The result never drops below 1e3.

    Raises:
        KeyError: if ``gpu`` or ``model`` is not a key of the lookup tables.
    """
    batch_scale = (batch / BATCH_REF) ** ALPHA
    seq_scale = (seq / SEQ_REF) ** (-GAMMA)

    # Apply the factors one at a time, in a fixed order, so the
    # floating-point result does not depend on expression grouping.
    rate = BASE_TOKENS_PER_SEC_A100_7B * SPEED_FACTOR[gpu]
    rate = rate / MODEL_COMPUTE_FACTOR[model]
    rate = rate * batch_scale
    rate = rate * seq_scale

    # Safety floor.
    return max(rate, 1e3)

def avg_power_watts(gpu: str, batch: int, seq: int) -> float:
    """Return the modelled average power draw in watts.

    A utilization value is formed as the equal-weight mix of a batch term
    (square root of ``batch / BATCH_REF``) and a sequence term
    (``seq / SEQ_REF`` to the power 0.3), capped at 1.0. Power then ranges
    from 60% of ``BASE_POWER_W[gpu]`` at zero utilization up to 100% at
    full utilization.

    Raises:
        KeyError: if ``gpu`` is not a key of ``BASE_POWER_W``.
    """
    batch_term = (batch / BATCH_REF) ** 0.5
    seq_term = (seq / SEQ_REF) ** 0.3
    blended = 0.5 * batch_term + 0.5 * seq_term
    utilization = min(1.0, blended)  # cap at full utilization
    load_fraction = 0.60 + 0.40 * utilization
    return BASE_POWER_W[gpu] * load_fraction

# -----------------------------------------
# 4) Generate rows
# -----------------------------------------
# One output row per point of the full Cartesian grid of settings.
rows = []
grid = itertools.product(
    GPU_TYPES, MODEL_SIZES, BATCH_SIZES, LEARNING_RATES, SEQ_LENGTHS, RUN_IDS
)
for gpu, model, batch, lr, seq, run_id in grid:
    # Noise-free quantities for this configuration.
    throughput = tokens_per_sec(gpu, model, batch, seq)      # tokens/sec
    tokens_needed = TARGET_TOKENS[model] * lr_penalty(lr)    # LR-adjusted goal
    ideal_hours = (tokens_needed / throughput) / 3600.0      # seconds -> hours
    ideal_watts = avg_power_watts(gpu, batch, seq)

    # Multiplicative run-to-run noise. The time draw must come before the
    # power draw so the seeded random sequence is consumed in a fixed order.
    # The scale factors are floored at zero so results cannot go negative.
    time_scale = max(0.0, 1.0 + rng.normal(0.0, TIME_NOISE_STD))
    power_scale = max(0.0, 1.0 + rng.normal(0.0, POWER_NOISE_STD))
    run_hours = ideal_hours * time_scale
    run_watts = ideal_watts * power_scale

    # Derived metrics: W * h / 1000 gives kWh; efficiency uses the
    # noise-free throughput over the noisy power.
    run_kwh = (run_watts * run_hours) / 1000.0
    if run_watts > 0:
        tok_per_watt = throughput / run_watts
    else:
        tok_per_watt = np.nan

    record = {
        "gpu_type": gpu,
        "model_size": model,
        "batch_size": batch,
        "learning_rate": lr,
        "seq_length": seq,
        "run_id": run_id,
        "training_time_hrs": round(run_hours, 4),
        "energy_kwh": round(run_kwh, 4),
        "efficiency_tok_per_watt": round(tok_per_watt, 4),
    }
    rows.append(record)

# -----------------------------------------
# 5) Build DataFrame & save
# -----------------------------------------
# Collect the generated records into a table; the dict keys become columns.
df = pd.DataFrame(data=rows)

# Write the table to disk without the positional index column.
df.to_csv(path_or_buf="synthetic_gpu_training.csv", index=False)

# Short report: table shape followed by the first eight rows.
print("✅ Generated:", df.shape, "rows")
print(df.head(n=8))


✅ Generated: (2592, 9) rows
  gpu_type model_size  batch_size  learning_rate  seq_length  run_id  \
0     A100         7B         256        0.00001         512       1   
1     A100         7B         256        0.00001         512       2   
2     A100         7B         256        0.00001         512       3   
3     A100         7B         256        0.00001        1024       1   
4     A100         7B         256        0.00001        1024       2   
5     A100         7B         256        0.00001        1024       3   
6     A100         7B         256        0.00001        2048       1   
7     A100         7B         256        0.00001        2048       2   

   training_time_hrs  energy_kwh  efficiency_tok_per_watt  
0             2.0247      0.6767                1041.8987  
1             2.0691      0.7339                 981.6916  
2             1.7997      0.5966                1050.4273  
3             2.8383      1.0121                 690.5203  
4             2.8180   