<a href="https://colab.research.google.com/github/Ino54/MA_GreenAI-Practical-Experiments/blob/main/bloom_framew.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 # ---------- Requirements ----------
%%writefile requirements.txt
transformers>=4.41,<5
accelerate>=0.30
datasets>=2.19
evaluate
sacrebleu
codecarbon>=2.5,<3
pynvml>=11.5.0
psutil
numpy
pandas
huggingface_hub
optimum>=1.18.0
onnx>=1.15.0

Writing requirements.txt


In [None]:
!pip -q install -r requirements.txt
# ONNX Runtime zuerst versuchen, ggf. GPU, sonst CPU
try:
    import onnxruntime as _ort_test
except Exception:
    try:
        !pip -q install onnxruntime-gpu
    except Exception:
        !pip -q install onnxruntime
!pip uninstall -y -q google-genai firebase-admin || true

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.6/517.6 kB[0m [31m32.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m425.8/425.8 kB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m18.2/18.2 MB[0m [31m73.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m113.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.4/76.4 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ---------- Drive & Ordner ----------
import os, shutil, time, platform, gc, math, warnings, inspect
warnings.filterwarnings("ignore")

from google.colab import drive
MOUNTPOINT="/content/drive"
already=os.path.isdir(os.path.join(MOUNTPOINT,"MyDrive"))
if not already and os.path.isdir(MOUNTPOINT) and os.listdir(MOUNTPOINT):
    backup=f"/content/drive_stale_{int(time.time())}"
    shutil.move(MOUNTPOINT, backup)
    os.makedirs(MOUNTPOINT, exist_ok=True)
drive.mount(MOUNTPOINT, force_remount=(not already))

work_dir="/content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks"
if not os.path.isdir(work_dir):
    raise FileNotFoundError(f"Zielordner fehlt: {work_dir}")
os.chdir(work_dir)
project_dir=work_dir
print("Arbeitsordner:", os.getcwd())

Mounted at /content/drive
Arbeitsordner: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks


In [None]:
# ---------- HF Login ----------
from google.colab import userdata
from huggingface_hub import login
hf_token = userdata.get("HF_TOKEN")
if hf_token:
    login(hf_token); print("Hugging Face Login erfolgreich!")
else:
    print("WARNUNG: Kein HF_TOKEN gefunden.")

Hugging Face Login erfolgreich!


In [None]:
# ---------- Imports & Device ----------
import numpy as np, pandas as pd
import torch, psutil
from contextlib import nullcontext
from types import SimpleNamespace
from datasets import load_dataset
import evaluate

from transformers import AutoTokenizer, AutoModelForCausalLM, set_seed
from codecarbon import EmissionsTracker

# ORT/Optimum
ort_available=False; ort_cuda=False
try:
    import onnxruntime as ort
    from optimum.onnxruntime import ORTModelForCausalLM
    ort_available=True
    prov=ort.get_available_providers()
    ort_cuda=("CUDAExecutionProvider" in prov)
    print(f"[ORT] verfügbar. Provider: {prov}")
    # kleine Compat-Absicherung
    if not hasattr(ORTModelForCausalLM, "_is_stateful"):
        ORTModelForCausalLM._is_stateful=False
except Exception as e:
    print("[ORT] Import/Verfügbarkeit fehlgeschlagen → nur HF-Backend.", repr(e))

# PyTorch Tuning
os.environ["PYTORCH_CUDA_ALLOC_CONF"]="expandable_segments:True"
set_seed(42)
device="cuda" if torch.cuda.is_available() else "cpu"
if device=="cuda":
    gpu_name=torch.cuda.get_device_name(0)
    vram_total_gb=torch.cuda.get_device_properties(0).total_memory/(1024**3)
    torch.backends.cuda.matmul.allow_tf32=True
    torch.backends.cudnn.benchmark=True
    try: torch.set_float32_matmul_precision("high")
    except: pass
else:
    gpu_name="CPU"; vram_total_gb=0.0
print(f"Device: {device} | GPU: {gpu_name} | VRAM={vram_total_gb:.1f} GB | Torch {torch.__version__} | Py {platform.python_version()}")

RESULT_BASENAME="bloom_frameworks"
PINNED_MEM=True; NON_BLOCKING=True; MAX_LEN_CAP=128

[ORT] verfügbar. Provider: ['TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider']
Device: cuda | GPU: NVIDIA A100-SXM4-80GB | VRAM=79.3 GB | Torch 2.8.0+cu126 | Py 3.12.11


In [None]:
# ---------- CodeCarbon ----------
import pandas as _pd
def _cleanup_cc_locks():
    for p in ["/tmp/.codecarbon.lock",
              os.path.expanduser("~/.codecarbon/codecarbon.lock"),
              "/content/.codecarbon/codecarbon.lock"]:
        try:
            if os.path.exists(p): os.remove(p)
        except: pass
os.environ["CODECARBON_CACHE_DIR"]=f"/content/.cc_cache_fw_{int(time.time())}"

def _cc_supported_kwargs():
    base=dict(log_level="error", output_dir=".", measure_power_secs=1, tracking_mode="process")
    try:
        params=inspect.signature(EmissionsTracker.__init__).parameters
        if "allow_multiple_runs" in params: base["allow_multiple_runs"]=True
        if "cloud_provider" in params: base["cloud_provider"]="google"
        if "cloud_region" in params: base["cloud_region"]="europe-west10"
        if "country_iso_code" in params: base["country_iso_code"]="DEU"
    except: pass
    return base
def make_trk(name,out):
    _cleanup_cc_locks()
    return EmissionsTracker(project_name=name, output_file=out, **_cc_supported_kwargs())
def start(tr):
    try: tr.start(); return True
    except:
        _cleanup_cc_locks()
        try: tr.start(); return True
        except: return False
def stop(tr, st):
    if not st: return SimpleNamespace(energy_consumed=0.0, emissions=0.0)
    try: return tr.stop()
    except: return SimpleNamespace(energy_consumed=0.0, emissions=0.0)
def unpack(em):
    if hasattr(em,"energy_consumed") and hasattr(em,"emissions"):
        try: return float(em.energy_consumed), float(em.emissions)
        except: return 0.0,0.0
    if isinstance(em, dict):
        e=em.get("energy_consumed",0.0); c=em.get("emissions", em.get("emissions_kg",0.0))
        try: return float(e), float(c)
        except: return 0.0,0.0
    try: return 0.0, float(em)
    except: return 0.0,0.0
def read_energy(path):
    try:
        if not os.path.exists(path): return 0.0
        df=pd.read_csv(path)
        for c in ["energy_consumed","energy_consumed_kwh","energy_consumed (kWh)","energy (kWh)"]:
            if c in df.columns: return float(df[c].iloc[-1])
        for c in df.columns:
            if "energy" in c.lower() and "kwh" in c.lower(): return float(df[c].iloc[-1])
    except: pass
    return 0.0
def measure(phase, fn, prefix):
    logfile=os.path.join(project_dir, f"{prefix}_{phase}.csv")
    tr=make_trk(f"{prefix}_{phase}", logfile)
    import time as _t
    st=start(tr); t0=_t.time(); res=fn(); t1=_t.time()
    em=stop(tr, st); ekwh,co2=unpack(em)
    if ekwh==0.0:
        ek=read_energy(logfile)
        if ek: ekwh=ek
    return {"phase":phase,"time_s":t1-t0,"energy_kwh":ekwh,"co2_kg":co2}, res

# ---------- Eval-Config ----------
MODELS=[("bigscience/bloom-560m","bloom560m","b560"),
        ("bigscience/bloom-3b","bloom3b","b3b")]
EVAL={"max_new_tokens":32,
      "ppl":{"name":"wikitext","config":"wikitext-2-raw-v1","split":"test[:1%]"},
      "bleu":{"name":"wmt14","config":"de-en","split":"test[:32]"}}
PROMPTS=["Schreibe einen kurzen Absatz über nachhaltige KI.",
         "Erkläre in einfachen Worten, was Quantisierung in neuronalen Netzen ist.",
         "Nenne drei Vorteile von Mixture-of-Experts-Modellen."]
bleu_metric=evaluate.load("sacrebleu")

def autocast_ctx():
    return torch.autocast(device_type="cuda", dtype=torch.float16) if torch.cuda.is_available() else nullcontext()
def safe_max_len(tok, model, fallback=2048, upper=100000):
    cand=getattr(tok,"model_max_length",None)
    if isinstance(cand,int) and 0<cand<upper: return cand
    cand=getattr(getattr(model,"config",None),"max_position_embeddings",None)
    if isinstance(cand,int) and 0<cand<upper: return cand
    return fallback
def capture_memory():
    ram=psutil.Process().memory_info().rss
    valloc=torch.cuda.memory_allocated() if torch.cuda.is_available() else 0
    vres =torch.cuda.memory_reserved()  if torch.cuda.is_available() else 0
    return ram, valloc, vres
def bytes_to_gb(b): return float(b)/(1024**3)

def _pin_and_move(batch, device):
    out={}
    for k,v in batch.items():
        if isinstance(v, torch.Tensor):
            try:
                if v.device.type=="cpu": v=v.pin_memory()
            except: pass
            out[k]=v.to(device, non_blocking=True)
        else:
            out[k]=v
    return out
def get_model_device(model):
    try: return next(model.parameters()).device
    except: return torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ---------- Loader: HF ----------
from transformers import AutoModelForCausalLM
def load_hf_eager(model_id:str):
    tok=AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok.padding_side="left"; tok.pad_token=tok.eos_token
    if torch.cuda.is_available(): torch.cuda.empty_cache(); gc.collect()
    model=AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto",
        dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
        low_cpu_mem_usage=True,
        attn_implementation="eager",
    )
    try: model.config.use_cache=True
    except: pass
    model.eval()
    return tok, model, "hf_eager"

# ---------- Loader: ORT (ONNX-Cache, labels-freies forward) ----------
def load_ort(model_id:str):
    if not ort_available: raise RuntimeError("ORT/Optimum nicht verfügbar.")
    tok=AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok.padding_side="left"; tok.pad_token=tok.eos_token
    providers=["CUDAExecutionProvider","CPUExecutionProvider"] if ort_cuda else ["CPUExecutionProvider"]
    onnx_dir=os.path.join(project_dir, f"onnx_{model_id.split('/')[-1]}")
    if os.path.isdir(onnx_dir) and len(os.listdir(onnx_dir))>0:
        model=ORTModelForCausalLM.from_pretrained(
            onnx_dir, provider=providers[0], providers=providers,
            use_cache=False, use_io_binding=False  # wichtig
        )
        note=f"ort_{'cuda' if ort_cuda else 'cpu'}(cached)"
    else:
        model=ORTModelForCausalLM.from_pretrained(
            model_id, export=True, provider=providers[0], providers=providers,
            use_cache=False, use_io_binding=False
        )
        try:
            os.makedirs(onnx_dir, exist_ok=True)
            model.save_pretrained(onnx_dir)
        except: pass
        note=f"ort_{'cuda' if ort_cuda else 'cpu'}(exported)"
    return tok, model, note

def _is_ort_model(m):
    try:
        return ort_available and isinstance(m, ORTModelForCausalLM)
    except:
        return False

# ---------- Generation ----------
def simple_generate(model, tok, prompts, max_new_tokens=32):
    dev=get_model_device(model)
    ml=min(MAX_LEN_CAP, safe_max_len(tok, model))
    enc=tok(prompts, return_tensors="pt", truncation=True, max_length=ml, padding=True)
    if dev.type=="cuda": enc=_pin_and_move(enc, dev)
    room=ml-enc["input_ids"].shape[1]
    cur_new=max(1, min(max_new_tokens, int(room)))
    gen_kwargs=dict(max_new_tokens=cur_new, do_sample=False, pad_token_id=tok.eos_token_id)
    if _is_ort_model(model): gen_kwargs["use_cache"]=False  # wichtig
    with torch.inference_mode(), autocast_ctx():
        out=model.generate(**enc, **gen_kwargs)
    texts, total= [], 0
    max_in=int(enc["input_ids"].shape[1]); pad_id=tok.pad_token_id; eos_id=tok.eos_token_id
    for i in range(out.size(0)):
        seq=out[i]; gen_slice=seq[max_in:]; gen_i=0
        for t in gen_slice.tolist():
            if t==eos_id: gen_i+=1; break
            if pad_id is not None and t==pad_id: break
            gen_i+=1
        total+=gen_i; texts.append(tok.decode(seq, skip_special_tokens=True))
    return texts, total

# ---------- Perplexity ----------
import torch.nn.functional as F
def eval_perplexity(model, tok, ds_cfg):
    dev=get_model_device(model)
    ds=load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml=min(MAX_LEN_CAP, safe_max_len(tok, model))
    losses=[]
    with torch.inference_mode(), autocast_ctx():
        for t in ds["text"]:
            if not isinstance(t,str) or len(t.strip())<4: continue
            enc=tok(t, return_tensors="pt", truncation=True, max_length=ml)
            if dev.type=="cuda": enc=_pin_and_move(enc, dev)
            if _is_ort_model(model):
                # ORT: kein labels-Forward → Loss manuell
                out=model(input_ids=enc["input_ids"],
                          attention_mask=enc.get("attention_mask"),
                          use_cache=False)
                logits=out.logits[..., :-1, :]
                labels=enc["input_ids"][..., 1:].clone()
                if "attention_mask" in enc:
                    mask=enc["attention_mask"][..., 1:]
                    labels[mask==0]=-100
                loss=F.cross_entropy(logits.transpose(1,2), labels, ignore_index=-100)
            else:
                out=model(enc["input_ids"], labels=enc["input_ids"])
                loss=out.loss
            losses.append(float(loss.detach().cpu()))
    return math.exp(np.mean(losses)) if losses else None

# ---------- BLEU ----------
def eval_bleu_llm(model, tok, ds_cfg, max_new_tokens=32, batch_size=8):
    dev=get_model_device(model)
    ds=load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml=min(MAX_LEN_CAP, safe_max_len(tok, model))
    preds, refs=[], []
    with torch.inference_mode(), autocast_ctx():
        batch_prompts, batch_refs=[], []
        for ex in ds:
            de,en=ex["translation"]["de"], ex["translation"]["en"]
            prompt=f"Translate to English:\nGerman: {de}\nEnglish:"
            batch_prompts.append(prompt); batch_refs.append(en)
            if len(batch_prompts)>=batch_size:
                enc=tok(batch_prompts, return_tensors="pt", truncation=True, max_length=ml, padding=True)
                if dev.type=="cuda": enc=_pin_and_move(enc, dev)
                room=ml-enc["input_ids"].shape[1]
                cur_new=max(1, min(max_new_tokens, int(room)))
                gen_kwargs=dict(max_new_tokens=cur_new, do_sample=False, pad_token_id=tok.eos_token_id)
                if _is_ort_model(model): gen_kwargs["use_cache"]=False
                out=model.generate(**enc, **gen_kwargs)
                for i in range(out.size(0)):
                    gen=tok.decode(out[i], skip_special_tokens=True)
                    hyp=gen.split("English:")[-1].strip().split("\n")[0].strip() or gen.strip()
                    preds.append(hyp); refs.append([batch_refs[i]])
                batch_prompts, batch_refs=[], []
        if batch_prompts:
            enc=tok(batch_prompts, return_tensors="pt", truncation=True, max_length=ml, padding=True)
            if dev.type=="cuda": enc=_pin_and_move(enc, dev)
            room=ml-enc["input_ids"].shape[1]
            cur_new=max(1, min(max_new_tokens, int(room)))
            gen_kwargs=dict(max_new_tokens=cur_new, do_sample=False, pad_token_id=tok.eos_token_id)
            if _is_ort_model(model): gen_kwargs["use_cache"]=False
            out=model.generate(**enc, **gen_kwargs)
            for i in range(out.size(0)):
                gen=tok.decode(out[i], skip_special_tokens=True)
                hyp=gen.split("English:")[-1].strip().split("\n")[0].strip() or gen.strip()
                preds.append(hyp); refs.append([batch_refs[i]])
    return float(bleu_metric.compute(predictions=preds, references=refs)["score"])

# ---------- Runner ----------
def run_backend(model_id:str, alias:str, backend:str):
    print(f"\n### Starte Framework: {backend} | {alias} ({model_id})")
    if backend=="hf":
        loader=load_hf_eager
    elif backend=="ort":
        if not ort_available:
            print("[ORT] nicht verfügbar → Backend übersprungen."); return None, []
        loader=load_ort
    else:
        raise ValueError("Unbekanntes Backend")

    def _do_load():
        tok, model, note=loader(model_id)
        dev=get_model_device(model); ml=min(MAX_LEN_CAP, safe_max_len(tok, model))
        wenc=tok(["Warmup token 1","Warmup token 2"], return_tensors="pt",
                 truncation=True, max_length=ml, padding=True)
        if dev.type=="cuda": wenc=_pin_and_move(wenc, dev)
        gen_kwargs=dict(max_new_tokens=1, do_sample=False, pad_token_id=tok.eos_token_id)
        if _is_ort_model(model): gen_kwargs["use_cache"]=False
        with torch.inference_mode(), autocast_ctx():
            _=model.generate(**wenc, **gen_kwargs)
        return tok, model, note

    m_warm, (tok, model, backend_note)=measure("warmup", _do_load, f"{RESULT_BASENAME}_{alias}_{backend}")

    m_gen, (samples, n_tok)=measure("gen",
        lambda: simple_generate(model, tok, PROMPTS, EVAL["max_new_tokens"]),
        f"{RESULT_BASENAME}_{alias}_{backend}"
    )
    if torch.cuda.is_available(): torch.cuda.empty_cache(); gc.collect()

    m_ppl, ppl=measure("ppl",
        lambda: eval_perplexity(model, tok, EVAL["ppl"]),
        f"{RESULT_BASENAME}_{alias}_{backend}"
    )
    if torch.cuda.is_available(): torch.cuda.empty_cache(); gc.collect()

    m_bleu, bleu=measure("bleu",
        lambda: eval_bleu_llm(model, tok, EVAL["bleu"], EVAL["max_new_tokens"], batch_size=8),
        f"{RESULT_BASENAME}_{alias}_{backend}"
    )
    if torch.cuda.is_available(): torch.cuda.empty_cache(); gc.collect()

    total_time=m_warm["time_s"]+m_gen["time_s"]+m_ppl["time_s"]+m_bleu["time_s"]
    total_energy=m_warm["energy_kwh"]+m_gen["energy_kwh"]+m_ppl["energy_kwh"]+m_bleu["energy_kwh"]
    total_co2=m_warm["co2_kg"]+m_gen["co2_kg"]+m_ppl["co2_kg"]+m_bleu["co2_kg"]

    ram,valloc,vres=capture_memory()
    per_phase=[m_warm,m_gen,m_ppl,m_bleu]
    for p in per_phase:
        p["alias"]=alias; p["model_id"]=model_id; p["backend"]=backend_note

    row=dict(
        model_id=model_id, alias=alias, backend=backend_note,
        precision=f"{'fp16' if device=='cuda' else 'fp32'}",
        time_s=total_time, energy_kwh=total_energy, co2_kg=total_co2,
        kg_per_kwh=(total_co2/total_energy) if total_energy else None,
        tokens_out=int(n_tok), ppl=ppl, bleu=bleu,
        ram_GB=bytes_to_gb(ram), vram_alloc_GB=bytes_to_gb(valloc), vram_reserved_GB=bytes_to_gb(vres),
        notes=f"MAX_LEN_CAP={MAX_LEN_CAP}; pinned={PINNED_MEM}; nb={NON_BLOCKING}"
    )

    samples_path=os.path.join(project_dir, f"{RESULT_BASENAME}_samples_{alias}_{backend}.txt")
    with open(samples_path,"w",encoding="utf-8") as f:
        for i,txt in enumerate(samples,1):
            f.write(f"--- Beispiel {i} ({alias}, {backend_note}) ---\n{txt}\n\n")
    print("Beispiele gespeichert:", samples_path)

    return row, per_phase

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ---------- Ausführen & Speichern ----------
all_rows, all_phases=[], []
backends=["hf","ort"]
for mid, long_name, alias in MODELS:
    for be in backends:
        res=run_backend(mid, alias, be)
        if res is None: continue
        row, phases=res
        if row is None: continue
        all_rows.append(row); all_phases.extend(phases)

df=pd.DataFrame(all_rows).sort_values(["alias","backend"]).reset_index(drop=True)
dfp=pd.DataFrame(all_phases)
dfp["wh_total"]=dfp["energy_kwh"]*1000.0
dfp["phase"]=pd.Categorical(dfp["phase"], categories=["warmup","gen","ppl","bleu"], ordered=True)

out_csv=os.path.join(project_dir, f"{RESULT_BASENAME}_results.csv"); df.to_csv(out_csv, index=False)
out_phase_csv=os.path.join(project_dir, f"{RESULT_BASENAME}_per_phase.csv"); dfp.to_csv(out_phase_csv, index=False)

print("\nErgebnisse (gesamt):"); print(df); print("Gespeichert (gesamt):", out_csv)
print("\nPer-Phase Übersicht:")
print(dfp[["alias","backend","phase","time_s","energy_kwh","co2_kg"]].sort_values(["alias","backend","phase"]))
print("Gespeichert (per Phase):", out_phase_csv)





### Starte Framework: hf | b560 (bigscience/bloom-560m)


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

de-en/train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

de-en/train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

de-en/validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

de-en/test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Beispiele gespeichert: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks/bloom_frameworks_samples_b560_hf.txt

### Starte Framework: ort | b560 (bigscience/bloom-560m)


The model bigscience/bloom-560m was already converted to ONNX but got `export=True`, the model will be converted to ONNX once again. Don't forget to save the resulting model with `.save_pretrained()`


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	lm_head.weight: {'onnx::MatMul_5381'}
	transformer.word_embeddings.weight: {'transformer.word_embeddings.weight'}


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Beispiele gespeichert: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks/bloom_frameworks_samples_b560_ort.txt

### Starte Framework: hf | b3b (bigscience/bloom-3b)


tokenizer_config.json:   0%|          | 0.00/222 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/14.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/85.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/693 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/6.01G [00:00<?, ?B/s]

Beispiele gespeichert: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks/bloom_frameworks_samples_b3b_hf.txt

### Starte Framework: ort | b3b (bigscience/bloom-3b)


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Found different candidate ONNX initializers (likely duplicate) for the tied weights:
	lm_head.weight: {'onnx::MatMul_6659'}
	transformer.word_embeddings.weight: {'transformer.word_embeddings.weight'}


Fetching 0 files: 0it [00:00, ?it/s]

Fetching 0 files: 0it [00:00, ?it/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Beispiele gespeichert: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/bloom_frameworks/bloom_frameworks_samples_b3b_ort.txt

Ergebnisse (gesamt):
                model_id alias             backend precision      time_s  \
0    bigscience/bloom-3b   b3b            hf_eager      fp16   43.787972   
1    bigscience/bloom-3b   b3b  ort_cuda(exported)      fp16  369.111210   
2  bigscience/bloom-560m  b560            hf_eager      fp16   37.042503   
3  bigscience/bloom-560m  b560  ort_cuda(exported)      fp16  143.148229   

   energy_kwh    co2_kg  kg_per_kwh  tokens_out         ppl       bleu  \
0    0.001386  0.000371    0.267622          96  238.532132  10.054131   
1    0.012881  0.003447    0.267622          96  238.424206  10.092805   
2    0.001087  0.000291    0.267622          96  341.261974   5.069542   
3    0.004801  0.001285    0.267622          96  339.798549   3.784713   

      ram_GB  vram_alloc_GB  vram_reserved_GB  \
0   5.153816       5.601610          6.