<a href="https://colab.research.google.com/github/Ino54/MA_GreenAI-Practical-Experiments/blob/main/deepseek_hardware.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile requirements.txt
# ---------- Requirements ----------
# NOTE: the %%writefile cell magic has to be the first line of the cell.
# These comment lines are written into requirements.txt, where pip ignores them.
transformers>=4.44
accelerate>=0.33
bitsandbytes
datasets>=2.20
evaluate>=0.4
sacrebleu>=2.4
codecarbon>=2.5,<3
pynvml>=12,<13
psutil
numpy==2.0.2
pandas==2.2.2
huggingface_hub

Writing requirements.txt


In [None]:
# Install/upgrade everything listed in requirements.txt (quiet, conflict warnings suppressed).
!pip -q install -U -r requirements.txt --no-warn-conflicts
# Quietly remove potentially conflicting packages; `|| true` ignores a non-zero exit status.
!pip uninstall -y -q google-genai firebase-admin || true

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m131.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m38.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m39.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.6/517.6 kB[0m [31m40.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ---------- Hugging Face login via Colab secret ----------
from google.colab import userdata
from huggingface_hub import login

# userdata.get() raises (rather than returning None) when the secret does not
# exist or the notebook has not been granted access to it. Treat any failure as
# "no token" so the cell falls through to the warning instead of crashing.
try:
    hf_token = userdata.get("HF_TOKEN")
except Exception as exc:
    hf_token = None
    print("HF_TOKEN konnte nicht gelesen werden:", repr(exc))

if hf_token:
    login(hf_token)
    print("Hugging Face Login erfolgreich!")
else:
    print("WARNUNG: Kein HF_TOKEN gefunden")

Hugging Face Login erfolgreich!


In [None]:
# ---------- Mount Drive & make sure the target folder exists ----------
import os, shutil, time, pathlib, platform, gc, re, math, warnings, inspect
# Silences every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

from google.colab import drive
MOUNTPOINT = "/content/drive"
# Drive counts as already mounted when the "MyDrive" folder is visible under the mountpoint.
already = os.path.isdir(os.path.join(MOUNTPOINT, "MyDrive"))
# Not mounted, but the mountpoint directory has leftover content: move it aside to a
# timestamped backup and recreate an empty directory (presumably because mounting
# needs an empty mountpoint — TODO confirm).
if not already and os.path.isdir(MOUNTPOINT) and os.listdir(MOUNTPOINT):
    backup = f"/content/drive_stale_{int(time.time())}"
    shutil.move(MOUNTPOINT, backup)
    os.makedirs(MOUNTPOINT, exist_ok=True)
# Force a remount only when Drive was not already mounted.
drive.mount(MOUNTPOINT, force_remount=(not already))

# All result files of this notebook are written below this Drive folder.
work_dir = "/content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_hardware"
pathlib.Path(work_dir).mkdir(parents=True, exist_ok=True)
os.chdir(work_dir)
project_dir = work_dir
print("Arbeitsordner:", os.getcwd())

Mounted at /content/drive
Arbeitsordner: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_hardware


In [None]:
# ---------- Imports & device ----------
# PyTorch allocator setting (performance). Exported before torch is imported so
# the setting is guaranteed to be in place when CUDA is first initialised.
import os as _os
_os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"  # less fragmentation

import numpy as np, pandas as pd
import torch, psutil
from contextlib import nullcontext
from types import SimpleNamespace
from datasets import load_dataset
import evaluate

from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GenerationConfig, set_seed
from codecarbon import EmissionsTracker

set_seed(42)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cuda":
    gpu_name = torch.cuda.get_device_name(0)
    vram_total_gb = torch.cuda.get_device_properties(0).total_memory/(1024**3)
    # Allow TF32 matmuls and let cuDNN benchmark kernels.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
    # Best effort: skipped silently if this torch build rejects the call.
    # `except Exception` (not a bare except) so Ctrl-C is not swallowed.
    try:
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass
else:
    gpu_name = "CPU"; vram_total_gb = 0.0
print(f"Device: {device} | GPU: {gpu_name} | VRAM={vram_total_gb:.1f} GB | Torch {torch.__version__} | Py {platform.python_version()}")

# Common file-name prefix for every result/log file written by this notebook.
RESULT_BASENAME = "deepseek_hardware_4bit"

Device: cuda | GPU: NVIDIA A100-SXM4-40GB | VRAM=39.6 GB | Torch 2.8.0+cu126 | Py 3.12.11


In [None]:
# ---------- Hardware-optimisation flags ----------
APPLY_TORCH_COMPILE     = False    # try True for longer runs
PINNED_MEM              = True     # pin host buffers
NON_BLOCKING            = True     # non_blocking transfers to the GPU
USE_BF16_AUTOCAST       = (device=="cuda" and torch.cuda.get_device_capability(0)[0] >= 8)  # Ampere+
ATTN_IMPL               = "sdpa"   # Qwen/DeepSeek: SDPA is fast/stable

# Eval config (identical to the BLOOM notebook, but only one model here)
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ALIAS    = "r1q15b"

# Dataset slices used for perplexity and BLEU, plus the generation budget per prompt.
EVAL = {
    "max_new_tokens": 32,
    "ppl":  {"name":"wikitext","config":"wikitext-2-raw-v1","split":"test[:1%]"},
    "bleu": {"name":"wmt14","config":"de-en","split":"test[:32]"},
}
# Prompts for the free-generation phase (kept in German on purpose; they are runtime data).
PROMPTS = [
    "Schreibe einen kurzen Absatz über nachhaltige KI.",
    "Erkläre in einfachen Worten, was Quantisierung in neuronalen Netzen ist.",
    "Nenne drei Vorteile von Mixture-of-Experts-Modellen."
]
bleu_metric = evaluate.load("sacrebleu")

# Guard: max. input length (lowers VRAM, stabilises the benchmark)
MAX_LEN_CAP = 128  # optionally 192/128 for even less VRAM

# Warning-free, deterministic generation (no temperature/top_p flags)
GENCFG = GenerationConfig(do_sample=False, temperature=None, top_p=None, top_k=None, num_beams=1)

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ---------- CodeCarbon-Helfer ----------
def _cleanup_cc_locks():
    """Delete leftover CodeCarbon lock files on a best-effort basis.

    Three fixed candidate paths are checked; any error while checking or
    removing one of them is ignored and the next candidate is tried.
    """
    candidates = (
        "/tmp/.codecarbon.lock",
        _os.path.expanduser("~/.codecarbon/codecarbon.lock"),
        "/content/.codecarbon/codecarbon.lock",
    )
    for lock_path in candidates:
        try:
            if not _os.path.exists(lock_path):
                continue
            _os.remove(lock_path)
        except Exception:
            continue

# Export a fresh, timestamped cache directory path for this kernel session
# (assumes CodeCarbon honours CODECARBON_CACHE_DIR — TODO confirm).
_os.environ["CODECARBON_CACHE_DIR"] = f"/content/.cc_cache_hw_{int(time.time())}"

def _cc_supported_kwargs():
    """Build the keyword arguments passed to ``EmissionsTracker``.

    Four base options are always included. Optional ones are added only if
    the installed ``EmissionsTracker.__init__`` lists them in its signature;
    if the signature cannot be inspected, only what was collected so far is
    returned.
    """
    kwargs = {
        "log_level": "error",
        "output_dir": ".",
        "measure_power_secs": 1,
        "tracking_mode": "process",
    }
    optional = (
        ("allow_multiple_runs", True),
        ("cloud_provider", "google"),
        ("cloud_region", "europe-west10"),
        ("country_iso_code", "DEU"),
    )
    try:
        accepted = inspect.signature(EmissionsTracker.__init__).parameters
        for option, value in optional:
            if option in accepted:
                kwargs[option] = value
    except Exception:
        pass
    return kwargs

def make_trk(name, out):
    """Create an ``EmissionsTracker`` for project ``name`` logging to file ``out``.

    Stale lock files are cleaned up first; the remaining constructor options
    come from ``_cc_supported_kwargs()``.
    """
    _cleanup_cc_locks()
    tracker_options = _cc_supported_kwargs()
    return EmissionsTracker(project_name=name, output_file=out, **tracker_options)

def start(tr):
    """Start tracker ``tr``, retrying once after cleaning up lock files.

    Returns:
        True if one of the (at most two) ``tr.start()`` attempts succeeded,
        False otherwise.
    """
    for attempt in (1, 2):
        try:
            tr.start()
        except Exception:
            if attempt == 1:
                # First failure: clear stale locks, then try once more.
                _cleanup_cc_locks()
            continue
        return True
    return False

def stop(tr, st):
    """Stop tracker ``tr`` and return whatever ``tr.stop()`` returns.

    Args:
        tr: tracker object exposing ``stop()``.
        st: truthy if the tracker was started successfully.

    Returns:
        The result of ``tr.stop()``, or a placeholder namespace with
        ``energy_consumed=0.0`` and ``emissions=0.0`` when the tracker was
        never started or stopping it raised.
    """
    if st:
        try:
            return tr.stop()
        except Exception:
            pass
    return SimpleNamespace(energy_consumed=0.0, emissions=0.0)

def unpack(em):
    """Normalise a tracker result into an ``(energy_kwh, co2_kg)`` float pair.

    Accepted shapes of ``em``:
      * an object with ``energy_consumed`` and ``emissions`` attributes,
      * a dict with ``energy_consumed`` and ``emissions`` (or ``emissions_kg``),
      * anything else convertible by ``float()``, read as emissions only
        (energy is then reported as 0.0).

    Returns:
        ``(0.0, 0.0)`` whenever a value cannot be converted to float.
    """
    def _as_pair(energy, co2):
        # `except Exception` rather than a bare `except`, so KeyboardInterrupt
        # and SystemExit are no longer swallowed during a measurement.
        try:
            return float(energy), float(co2)
        except Exception:
            return 0.0, 0.0

    if hasattr(em, "energy_consumed") and hasattr(em, "emissions"):
        return _as_pair(em.energy_consumed, em.emissions)
    if isinstance(em, dict):
        return _as_pair(
            em.get("energy_consumed", 0.0),
            em.get("emissions", em.get("emissions_kg", 0.0)),
        )
    return _as_pair(0.0, em)

def read_energy(path):
    """Read the energy value of the last row from a CSV log.

    A short list of known column names is tried first; otherwise the first
    column whose lower-cased name contains both "energy" and "kwh" is used.

    Returns:
        The last-row value as float, or 0.0 if the file is missing, has no
        matching column, or anything goes wrong while reading it.
    """
    known_columns = ("energy_consumed", "energy_consumed_kwh", "energy_consumed (kWh)", "energy (kWh)")
    try:
        if _os.path.exists(path):
            log = pd.read_csv(path)
            column = next((name for name in known_columns if name in log.columns), None)
            if column is None:
                column = next(
                    (name for name in log.columns if "energy" in name.lower() and "kwh" in name.lower()),
                    None,
                )
            if column is not None:
                return float(log[column].iloc[-1])
    except Exception:
        pass
    return 0.0

def measure(phase, fn, prefix):
    """Run ``fn()`` under a CodeCarbon tracker and report time, energy and CO2.

    Args:
        phase: phase label; used in the log file name and in the result dict.
        fn: zero-argument callable doing the work to be measured.
        prefix: file/project name prefix; the log is written to
            ``<project_dir>/<prefix>_<phase>.csv``.

    Returns:
        ``(metrics, res)`` where ``metrics`` is a dict with the keys
        ``phase``, ``time_s``, ``energy_kwh`` and ``co2_kg`` and ``res`` is
        the return value of ``fn()``.

    Raises:
        Whatever ``fn()`` raises. The tracker is stopped first, so a failing
        phase does not leave a running tracker behind.
    """
    import time as _t

    logfile = _os.path.join(project_dir, f"{prefix}_{phase}.csv")
    tr = make_trk(f"{prefix}_{phase}", logfile)
    st = start(tr)
    t0 = _t.time()
    try:
        res = fn()
    finally:
        # Always take the end time and stop the tracker, even if fn() failed.
        t1 = _t.time()
        em = stop(tr, st)
    ekwh, co2 = unpack(em)
    if ekwh == 0.0:
        # The tracker result carried no energy value: fall back to the CSV log.
        ek = read_energy(logfile)
        if ek:
            ekwh = ek
    return {"phase": phase, "time_s": t1-t0, "energy_kwh": ekwh, "co2_kg": co2}, res

In [None]:
# ---------- Loader: DeepSeek in 4-bit (NF4) + HW-Tweaks ----------
def load_model_4bit_optimized(model_id: str):
    """Load tokenizer and model in 4-bit NF4 and apply the hardware tweaks.

    Args:
        model_id: Hugging Face model identifier passed to ``from_pretrained``.

    Returns:
        ``(tok, model)``: the tokenizer (left-padded, pad token falling back
        to EOS when none is set) and the quantized model in eval mode.
    """
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    tok.padding_side = "left"

    if torch.cuda.is_available():
        # Collect garbage first so tensors freed by the GC can actually be
        # returned to the driver by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()

    bnb4 = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_use_double_quant=True,
        bnb_4bit_compute_dtype=(torch.bfloat16 if USE_BF16_AUTOCAST else torch.float16),
    )

    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map="auto",
        quantization_config=bnb4,
        attn_implementation=ATTN_IMPL,  # SDPA
    )

    # Channels-last can improve memory access / cache hits. Best effort, like
    # torch.compile below: a failing `.to()` on the quantized model must not
    # abort the whole run.
    if device=="cuda":
        try:
            model = model.to(memory_format=torch.channels_last)
        except Exception as e:
            print("[HW] channels_last nicht möglich → normal weiter:", repr(e))

    # torch.compile: optional (for long runs)
    if APPLY_TORCH_COMPILE and hasattr(torch, "compile"):
        try:
            model = torch.compile(model, mode="reduce-overhead", fullgraph=False)
            print("[HW] torch.compile aktiv (reduce-overhead).")
        except Exception as e:
            print("[HW] torch.compile nicht möglich → normal weiter:", repr(e))

    model.eval()
    return tok, model

In [None]:
# ---------- Pinned Memory & Utils ----------
def _pin_and_move(batch, device):
    """Return a plain dict with every tensor of ``batch`` moved to ``device``.

    CPU tensors are pinned first when ``PINNED_MEM`` is set (pinning failures
    are ignored); the transfer uses ``non_blocking=NON_BLOCKING``. Non-tensor
    values are passed through unchanged.
    """
    def _transfer(value):
        if not isinstance(value, torch.Tensor):
            return value
        if PINNED_MEM and value.device.type == "cpu":
            try:
                value = value.pin_memory()
            except Exception:
                pass
        return value.to(device, non_blocking=NON_BLOCKING)

    return {key: _transfer(value) for key, value in batch.items()}

def get_model_device(model):
    """Return the device of the model's first parameter.

    For a model without parameters, fall back to CUDA when it is available
    and to CPU otherwise.
    """
    for first_param in model.parameters():
        return first_param.device
    fallback = "cuda" if torch.cuda.is_available() else "cpu"
    return torch.device(fallback)

def autocast_ctx():
    """Return the autocast context used around model calls.

    On CUDA this is ``torch.autocast`` with bfloat16 when ``USE_BF16_AUTOCAST``
    is set and float16 otherwise; on any other device it is a no-op context.
    """
    if device == "cuda":
        amp_dtype = torch.bfloat16 if USE_BF16_AUTOCAST else torch.float16
        return torch.autocast(device_type="cuda", dtype=amp_dtype)
    return nullcontext()

def safe_max_len(tok, model, fallback=2048, upper=100000):
    """Pick a usable maximum sequence length, never larger than ``fallback``.

    ``tok.model_max_length`` is tried first, then
    ``model.config.max_position_embeddings``. A candidate counts only if it
    is an int strictly between 0 and ``upper``; the first one that counts is
    returned, capped at ``fallback``. If neither counts, ``fallback`` is
    returned.
    """
    def _candidates():
        # Generator keeps the second lookup lazy, as in a sequential check.
        yield getattr(tok, "model_max_length", None)
        yield getattr(getattr(model, "config", None), "max_position_embeddings", None)

    for limit in _candidates():
        if isinstance(limit, int) and 0 < limit < upper:
            return min(limit, fallback)
    return fallback

def capture_memory():
    """Snapshot current memory usage in bytes.

    Returns:
        ``(ram, vram_allocated, vram_reserved)``: the resident set size of
        this process and the CUDA allocated/reserved byte counts reported by
        torch (both 0 when CUDA is unavailable).
    """
    rss_bytes = psutil.Process().memory_info().rss
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated()
        reserved = torch.cuda.memory_reserved()
    else:
        allocated, reserved = 0, 0
    return rss_bytes, allocated, reserved

def bytes_to_gb(b):
    """Convert a byte count to gibibytes (divides by 1024**3), as float."""
    gib = 1024**3
    return float(b) / gib

In [None]:
# ---------- Evaluation (batched, exakte Tokenzählung) ----------
def simple_generate(model, tok, prompts, max_new_tokens=32):
    """Generate continuations for ``prompts`` in one batch and count new tokens.

    The prompts are tokenized with padding and truncated to
    ``min(MAX_LEN_CAP, safe_max_len(tok, model))``. The generation budget is
    ``max_new_tokens`` limited to the room left under that cap, but at
    least 1.

    Returns:
        ``(texts, total_gen_tokens)``: the decoded full sequences (special
        tokens skipped) and the number of generated tokens summed over the
        batch. Per sequence, counting stops at the first EOS (which is
        counted) or at the first pad token (which is not).
    """
    target_device = get_model_device(model)
    length_cap = min(MAX_LEN_CAP, safe_max_len(tok, model))
    batch = _pin_and_move(
        tok(prompts, return_tensors="pt", truncation=True, max_length=length_cap, padding=True),
        target_device,
    )
    prompt_len = int(batch["input_ids"].shape[1])
    budget = max(1, min(max_new_tokens, int(length_cap - prompt_len)))
    with torch.inference_mode(), autocast_ctx():
        generated = model.generate(**batch, max_new_tokens=budget, generation_config=GENCFG, pad_token_id=tok.eos_token_id)

    pad_id, eos_id = tok.pad_token_id, tok.eos_token_id

    def _count_new_tokens(token_ids):
        count = 0
        for token in token_ids:
            if token == eos_id:
                return count + 1
            if pad_id is not None and token == pad_id:
                return count
            count += 1
        return count

    rows = range(generated.size(0))
    # Everything after the (padded) prompt width is newly generated.
    total_gen_tokens = sum(_count_new_tokens(generated[row][prompt_len:].tolist()) for row in rows)
    texts = [tok.decode(generated[row], skip_special_tokens=True) for row in rows]
    return texts, total_gen_tokens

def eval_perplexity(model, tok, ds_cfg):
    """Compute a perplexity estimate over the ``text`` column of a dataset.

    Each usable text is scored on its own (truncated to
    ``min(MAX_LEN_CAP, safe_max_len(tok, model))`` tokens). The result is
    ``exp`` of the unweighted mean of the per-text losses, i.e. every text
    counts equally regardless of its length.

    Args:
        model: causal LM returning ``.loss`` when called with ``labels``.
        tok: matching tokenizer.
        ds_cfg: dict with the keys ``name``, ``config`` and ``split``.

    Returns:
        The perplexity as float, or None if no text produced a usable loss.
    """
    dev = get_model_device(model)
    ds = load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml = min(MAX_LEN_CAP, safe_max_len(tok, model))
    losses = []
    with torch.inference_mode(), autocast_ctx():
        for t in ds["text"]:
            if not isinstance(t, str) or len(t.strip()) < 4:
                continue
            enc = tok(t, return_tensors="pt", truncation=True, max_length=ml)
            # With fewer than two tokens there is no next-token target and
            # the loss would be NaN, poisoning the mean.
            if enc["input_ids"].shape[1] < 2:
                continue
            enc = _pin_and_move(enc, dev)
            out = model(enc["input_ids"], labels=enc["input_ids"])
            loss = float(out.loss.detach().cpu())
            # Skip NaN/inf losses instead of letting one bad sample turn the
            # whole result into NaN.
            if math.isfinite(loss):
                losses.append(loss)
    return math.exp(np.mean(losses)) if losses else None

# Redundant with the load in the configuration cell; kept so that re-running
# this cell alone still leaves `bleu_metric` in scope.
bleu_metric = evaluate.load("sacrebleu")
def eval_bleu_llm(model, tok, ds_cfg, max_new_tokens=32, batch_size=8):
    """Translate German to English with the LLM and score it with sacreBLEU.

    Each example is turned into a "Translate to English" prompt and the
    prompts are generated in batches of ``batch_size``. The hypothesis is
    the first line after the last "English:" marker in the decoded output,
    falling back to the whole output if that line is empty.

    Args:
        model: causal LM with ``generate``.
        tok: matching tokenizer.
        ds_cfg: dict with the keys ``name``, ``config`` and ``split``;
            examples must carry ``translation.de`` and ``translation.en``.
        max_new_tokens: upper bound for generated tokens per batch.
        batch_size: number of prompts per ``generate`` call.

    Returns:
        The sacreBLEU ``score`` as float.
    """
    dev = get_model_device(model)
    ds = load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml = min(MAX_LEN_CAP, safe_max_len(tok, model))
    preds, refs = [], []

    def _translate_batch(batch_prompts, batch_refs):
        # Single place for the generate/decode logic, shared by full batches
        # and the trailing partial batch.
        enc = tok(batch_prompts, return_tensors="pt", truncation=True, max_length=ml, padding=True)
        enc = _pin_and_move(enc, dev)
        room = ml - enc["input_ids"].shape[1]
        cur_new = max(1, min(max_new_tokens, int(room)))
        out = model.generate(**enc, max_new_tokens=cur_new, generation_config=GENCFG, pad_token_id=tok.eos_token_id)
        for i in range(out.size(0)):
            gen = tok.decode(out[i], skip_special_tokens=True)
            hyp = gen.split("English:")[-1].strip().split("\n")[0].strip() or gen.strip()
            preds.append(hyp)
            refs.append([batch_refs[i]])

    with torch.inference_mode(), autocast_ctx():
        batch_prompts, batch_refs = [], []
        for ex in ds:
            de, en = ex["translation"]["de"], ex["translation"]["en"]
            prompt = f"Translate to English:\nGerman: {de}\nEnglish:"
            batch_prompts.append(prompt)
            batch_refs.append(en)
            if len(batch_prompts) >= batch_size:
                _translate_batch(batch_prompts, batch_refs)
                batch_prompts, batch_refs = [], []
        if batch_prompts:
            _translate_batch(batch_prompts, batch_refs)
    return float(bleu_metric.compute(predictions=preds, references=refs)["score"])

In [None]:
# ---------- Run (warmup/gen/ppl/bleu) ----------
def run_once(model_id: str, alias: str):
    """Run the full benchmark (warmup, gen, ppl, bleu) for one model.

    Every phase is wrapped in ``measure`` so that time, energy and CO2 are
    recorded per phase. The generated sample texts are written to
    ``<project_dir>/<RESULT_BASENAME>_samples_<alias>.txt``.

    Args:
        model_id: Hugging Face model identifier.
        alias: short name used in file names and result rows.

    Returns:
        ``(row, per_phase)``: ``row`` is a dict with the aggregated metrics
        (totals over all four phases, "steady" totals without warmup,
        quality scores, memory snapshot, flag notes); ``per_phase`` is the
        list of the four per-phase metric dicts, each extended with
        ``alias`` and ``model_id``.
    """
    print(f"\n### Starte Hardware-Optimierung 4-bit: {alias} ({model_id})")
    prefix = f"{RESULT_BASENAME}_{alias}"

    def _do_load():
        tok, model = load_model_4bit_optimized(model_id)
        # Warm-up (batch=2) for more stable kernel selection / compile
        dev = get_model_device(model); ml = min(MAX_LEN_CAP, safe_max_len(tok, model))
        wenc = tok(["Warmup token 1", "Warmup token 2"], return_tensors="pt", truncation=True, max_length=ml, padding=True)
        wenc = _pin_and_move(wenc, dev)
        with torch.inference_mode(), autocast_ctx():
            _ = model.generate(**wenc, max_new_tokens=1, generation_config=GENCFG, pad_token_id=tok.eos_token_id)
        return tok, model

    def _release_cuda_cache():
        # Collect garbage first so the tensors it frees can be released by
        # empty_cache() in the same pass.
        if torch.cuda.is_available():
            gc.collect()
            torch.cuda.empty_cache()

    def _total(key, phases):
        return sum(p[key] for p in phases)

    m_warm, (tok, model) = measure("warmup", _do_load, prefix)

    m_gen, (samples, n_tok) = measure(
        "gen",
        lambda: simple_generate(model, tok, PROMPTS, EVAL["max_new_tokens"]),
        prefix,
    )
    _release_cuda_cache()

    m_ppl, ppl = measure("ppl", lambda: eval_perplexity(model, tok, EVAL["ppl"]), prefix)
    _release_cuda_cache()

    m_bleu, bleu = measure(
        "bleu",
        lambda: eval_bleu_llm(model, tok, EVAL["bleu"], EVAL["max_new_tokens"], batch_size=8),
        prefix,
    )
    _release_cuda_cache()

    per_phase = [m_warm, m_gen, m_ppl, m_bleu]
    steady_phases = per_phase[1:]  # everything except the warmup/load phase

    total_time   = _total("time_s", per_phase)
    total_energy = _total("energy_kwh", per_phase)
    total_co2    = _total("co2_kg", per_phase)

    steady_time   = _total("time_s", steady_phases)
    steady_energy = _total("energy_kwh", steady_phases)
    steady_co2    = _total("co2_kg", steady_phases)

    # Use the shared helper instead of repeating its body here. The snapshot
    # is taken after the last cache release, so it is not a peak value.
    ram, valloc, vres = capture_memory()

    for p in per_phase:
        p["alias"] = alias
        p["model_id"] = model_id

    row = dict(
        model_id=model_id, alias=alias, precision=f"int4 (NF4, double-quant, {ATTN_IMPL})",
        time_s=total_time, energy_kwh=total_energy, co2_kg=total_co2,
        steady_time_s=steady_time, steady_energy_kwh=steady_energy, steady_co2_kg=steady_co2,
        kg_per_kwh=(total_co2/total_energy) if total_energy else None,
        tokens_out=int(n_tok), ppl=ppl, bleu=bleu,
        ram_GB=bytes_to_gb(ram), vram_alloc_GB=bytes_to_gb(valloc), vram_reserved_GB=bytes_to_gb(vres),
        notes=f"compile={APPLY_TORCH_COMPILE}; pinned={PINNED_MEM}; nb={NON_BLOCKING}; MAX_LEN_CAP={MAX_LEN_CAP}; bf16_autocast={USE_BF16_AUTOCAST}"
    )

    # Save the generated samples
    samples_path = os.path.join(project_dir, f"{RESULT_BASENAME}_samples_{alias}.txt")
    with open(samples_path, "w", encoding="utf-8") as f:
        for i, txt in enumerate(samples, 1):
            f.write(f"--- Beispiel {i} ({alias}) ---\n{txt}\n\n")
    print("Beispiele gespeichert:", samples_path)

    return row, per_phase

In [None]:
# ---------- Run & save ----------
# Run the benchmark once for the configured model and wrap the results in DataFrames.
row, phases = run_once(MODEL_ID, ALIAS)
df  = pd.DataFrame([row])
dfp = pd.DataFrame(phases)

# Aggregated result row -> <RESULT_BASENAME>_results.csv
out_csv = os.path.join(project_dir, f"{RESULT_BASENAME}_results.csv")
df.to_csv(out_csv, index=False)

# One row per phase -> <RESULT_BASENAME>_per_phase.csv
out_phase_csv = os.path.join(project_dir, f"{RESULT_BASENAME}_per_phase.csv")
dfp.to_csv(out_phase_csv, index=False)

print("\nErgebnisse (gesamt):")
print(df)
print("Gespeichert (gesamt):", out_csv)

print("\nPer-Phase Übersicht:")
# Note: sorting by "phase" is alphabetical, not execution order.
print(dfp[["alias","phase","time_s","energy_kwh","co2_kg"]].sort_values(["alias","phase"]))
print("Gespeichert (per Phase):", out_phase_csv)




### Starte Hardware-Optimierung 4-bit: r1q15b (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

de-en/train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

de-en/train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

de-en/validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

de-en/test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Beispiele gespeichert: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_hardware/deepseek_hardware_4bit_samples_r1q15b.txt

Ergebnisse (gesamt):
                                    model_id   alias  \
0  deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B  r1q15b   

                        precision     time_s  energy_kwh    co2_kg  \
0  int4 (NF4, double-quant, sdpa)  67.666614    0.001894  0.000857   

   steady_time_s  steady_energy_kwh  steady_co2_kg  kg_per_kwh  tokens_out  \
0      35.728374           0.001073       0.000486    0.452621          96   

          ppl      bleu    ram_GB  vram_alloc_GB  vram_reserved_GB  \
0  238.563029  11.30959  4.432037       1.517489          1.583984   

                                               notes  
0  compile=False; pinned=True; nb=True; MAX_LEN_C...  
Gespeichert (gesamt): /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_hardware/deepseek_hardware_4bit_results.csv

Per-Phase Übersicht:
    alias   ph