<a href="https://colab.research.google.com/github/Ino54/MA_GreenAI-Practical-Experiments/blob/main/deepseek_quantisierung.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
%%writefile requirements.txt
# ============= Requirements =============
# NOTE: the %%writefile cell magic must be the very first line of the cell,
# otherwise IPython does not treat the cell as a cell magic. The comment lines
# here end up inside requirements.txt, where pip ignores them.
transformers>=4.44
accelerate>=0.33
bitsandbytes
datasets>=2.20
evaluate>=0.4
sacrebleu>=2.4
codecarbon>=2.5,<3
pynvml>=12,<13
psutil
numpy==2.0.2
pandas==2.2.2
huggingface_hub

Writing requirements.txt


In [None]:
!pip -q install -U -r requirements.txt --no-warn-conflicts
!pip uninstall -y -q google-genai firebase-admin || true

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m104.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m37.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.6/517.6 kB[0m [31m45.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ============= Drive mount + Ordnercheck =============
import os, shutil, time, pathlib, platform
from google.colab import drive
# Directory under which Google Drive is mounted.
MOUNTPOINT="/content/drive"
# Drive is treated as already mounted when the "MyDrive" folder is visible.
already=os.path.isdir(os.path.join(MOUNTPOINT,"MyDrive"))
# A non-empty mount directory without "MyDrive" is moved aside to a
# timestamped path and replaced by a fresh empty directory before mounting.
if not already and os.path.isdir(MOUNTPOINT) and os.listdir(MOUNTPOINT):
    backup=f"/content/drive_stale_{int(time.time())}"
    shutil.move(MOUNTPOINT, backup)
    os.makedirs(MOUNTPOINT, exist_ok=True)
# Force a remount only when Drive was not detected as mounted above.
drive.mount(MOUNTPOINT, force_remount=(not already))

# Target folder on Drive; it is not created here and must already exist.
work_dir="/content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_quantisierung"
if not os.path.isdir(work_dir):
    raise FileNotFoundError(
        f"Zielordner fehlt: {work_dir}\n"
        "Bitte diesen Ordner manuell in Google Drive anlegen und das Notebook erneut starten."
    )
# Make the folder the current working directory and remember it as
# project_dir, which the later cells use to build output paths.
os.chdir(work_dir)
project_dir=work_dir
print("Arbeitsordner:", os.getcwd())

Mounted at /content/drive
Arbeitsordner: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_quantisierung


In [None]:
# ============= HF-Login =============
from google.colab import userdata
from huggingface_hub import login
# Read the token from the Colab secret store; log in only when one is set.
hf_token = userdata.get("HF_TOKEN")
if not hf_token:
    print("WARNUNG: Kein HF_TOKEN gefunden.")
else:
    login(hf_token)
    print("Hugging Face Login erfolgreich!")

Hugging Face Login erfolgreich!


In [None]:
# ============= Imports & Setup =============
import re, math, gc, warnings, inspect
warnings.filterwarnings("ignore")

import numpy as np, pandas as pd, torch, psutil
from contextlib import nullcontext
from types import SimpleNamespace
from typing import Tuple
from datasets import load_dataset
import evaluate

from transformers import (
    AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, set_seed, GenerationConfig
)
from codecarbon import EmissionsTracker

# Identifiers used for result file names and for the model under test.
RESULT_BASENAME = "deepseek_quant"
MODEL_ID = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ALIAS = "r1q15b"

# Fixed seed for reproducible runs.
set_seed(42)

# Select the compute device and record the hardware description that is
# later written into the "notes" column of the result table.
device = "cuda" if torch.cuda.is_available() else "cpu"
if device != "cuda":
    gpu_name = "CPU"
    vram_total_gb = 0.0
else:
    gpu_name = torch.cuda.get_device_name(0)
    vram_total_gb = torch.cuda.get_device_properties(0).total_memory / (1024**3)
    torch.backends.cuda.matmul.allow_tf32 = True

import platform as _pf
print(f"Device: {device} | GPU: {gpu_name} | VRAM={vram_total_gb:.1f} GB | Torch {torch.__version__} | Py {_pf.python_version()}")

Device: cuda | GPU: NVIDIA A100-SXM4-80GB | VRAM=79.3 GB | Torch 2.8.0+cu126 | Py 3.12.11


In [None]:
# ============= CodeCarbon Helpers (Berlin) =============
import os as _os, time as _time

def _cc_supported_kwargs():
    base=dict(log_level="error", output_dir=".", measure_power_secs=1, tracking_mode="process")
    try:
        params=inspect.signature(EmissionsTracker.__init__).parameters
        if "cloud_provider" in params: base["cloud_provider"]="google"
        if "cloud_region"  in params: base["cloud_region"]="europe-west10"  # GCP Berlin
        if "country_iso_code" in params: base["country_iso_code"]="DEU"
    except Exception: pass
    return base

def make_tracker(name, out):
    """Create an ``EmissionsTracker`` for project ``name`` writing to file ``out``."""
    tracker_kwargs = _cc_supported_kwargs()
    return EmissionsTracker(project_name=name, output_file=out, **tracker_kwargs)

def safe_start(tr):
    """Start tracker ``tr``.

    Returns True when ``tr.start()`` succeeds. On any exception the error is
    printed and False is returned instead of propagating.
    """
    try:
        tr.start()
    except Exception as err:
        print("[CodeCarbon] Start fehlgeschlagen:", err)
        return False
    return True

def safe_stop(tr, started):
    """Stop tracker ``tr`` and return whatever ``tr.stop()`` returns.

    When the tracker was never started, or stopping raises, a placeholder
    object with ``energy_consumed=0.0`` and ``emissions=0.0`` is returned
    (the error is printed in the latter case).
    """
    if started:
        try:
            return tr.stop()
        except Exception as err:
            print("[CodeCarbon] Stop fehlgeschlagen:", err)
    return SimpleNamespace(energy_consumed=0.0, emissions=0.0)

def unpack(em):
    """Normalise a tracker result into an ``(energy_kwh, co2_kg)`` float pair.

    Accepted shapes, tried in this order:
      1. an object with ``energy_consumed`` and ``emissions`` attributes;
      2. a dict with ``energy_consumed`` and ``emissions`` (or ``emissions_kg``);
      3. a bare number, interpreted as emissions with energy ``0.0``.

    Values that cannot be converted to float yield ``(0.0, 0.0)``. Only
    ``Exception`` subclasses are swallowed, so ``KeyboardInterrupt`` and
    ``SystemExit`` propagate instead of being silently turned into zeros.
    """
    if hasattr(em, "energy_consumed") and hasattr(em, "emissions"):
        try:
            return float(em.energy_consumed), float(em.emissions)
        except Exception:
            pass  # fall through to the remaining interpretations
    if isinstance(em, dict):
        energy = em.get("energy_consumed", 0.0)
        carbon = em.get("emissions", em.get("emissions_kg", 0.0))
        try:
            return float(energy), float(carbon)
        except Exception:
            return 0.0, 0.0
    try:
        return 0.0, float(em)
    except Exception:
        return 0.0, 0.0

def read_energy_from_log(path):
    """Read the most recent energy value (kWh) from a CodeCarbon CSV log.

    The last row of the first usable energy column is returned. Columns are
    tried in this order: the known exact names, then any column whose name
    contains both "energy" and "kwh" (case-insensitive).

    Returns ``0.0`` when the file does not exist, has no energy column, the
    last value is missing (NaN), or the file cannot be read. Read errors are
    printed rather than silently discarded; ``KeyboardInterrupt`` and
    ``SystemExit`` are not intercepted.
    """
    known_columns = ["energy_consumed", "energy_consumed_kwh", "energy_consumed (kWh)", "energy (kWh)"]
    try:
        if not os.path.exists(path):
            return 0.0
        df = pd.read_csv(path)
        candidates = [c for c in known_columns if c in df.columns]
        candidates += [c for c in df.columns if "energy" in c.lower() and "kwh" in c.lower()]
        for col in candidates:
            value = float(df[col].iloc[-1])
            # NaN is truthy; returning it would poison the summed energy total.
            if not pd.isna(value):
                return value
    except Exception as err:
        print("[CodeCarbon] Log nicht lesbar:", err)
    return 0.0

def measure_phase(phase, fn, prefix):
    """Run ``fn()`` while tracking wall time, energy and CO2 for one phase.

    A tracker is created that logs to ``<project_dir>/<prefix>_<phase>.csv``.
    If the tracker result carries no energy figure, the value is read back
    from that log file.

    Args:
        phase: short phase label, used in the log file name and the result.
        fn: zero-argument callable that performs the work.
        prefix: file/project name prefix for this run.

    Returns:
        ``(metrics, result)`` where ``metrics`` is a dict with the keys
        ``phase``, ``time_s``, ``energy_kwh`` and ``co2_kg`` and ``result``
        is whatever ``fn()`` returned.

    Raises:
        Whatever ``fn()`` raises. The tracker is stopped before the exception
        propagates so it does not keep running in the background.
    """
    import time as _t

    logfile = os.path.join(project_dir, f"{prefix}_{phase}.csv")
    tr = make_tracker(f"{prefix}_{phase}", logfile)
    st = safe_start(tr)
    t0 = _t.time()
    try:
        res = fn()
    except BaseException:
        # Do not leave a started tracker running when the phase fails.
        safe_stop(tr, st)
        raise
    t1 = _t.time()
    e = safe_stop(tr, st)
    ekwh, co2 = unpack(e)
    if ekwh == 0.0:
        # The tracker result may only carry emissions; fall back to the CSV log.
        ek = read_energy_from_log(logfile)
        if ek:
            ekwh = ek
    return {"phase": phase, "time_s": t1 - t0, "energy_kwh": ekwh, "co2_kg": co2}, res

In [None]:
# ============= Eval-Konfiguration =============
# Evaluation settings: generation budget plus the dataset slices used for
# perplexity and BLEU.
EVAL = {
    "max_new_tokens": 32,
    "ppl": {
        "name": "wikitext",
        "config": "wikitext-2-raw-v1",
        "split": "test[:1%]",
    },
    "bleu": {
        "name": "wmt14",
        "config": "de-en",
        "split": "test[:32]",
    },
}

def parse_subset_count(s, default=32):
    """Return the number of examples selected by an absolute split slice.

    Examples: ``"test[:32]"`` -> 32, ``"test[10:20]"`` -> 10. The slice start
    is subtracted so the result is the subset size, not the end index.

    ``default`` is returned when ``s`` is empty/None or does not end in an
    absolute ``:<end>]`` slice (for example a percentage slice such as
    ``"test[:1%]"`` or a plain split name).
    """
    m = re.search(r"(?:\[\s*(\d+)\s*)?:\s*(\d+)\s*\]$", s or "")
    if not m:
        return default
    start = int(m.group(1)) if m.group(1) else 0
    end = int(m.group(2))
    # An inverted slice selects nothing; never report a negative count.
    return max(0, end - start)
# Number of BLEU examples implied by the configured split slice.
BLEU_N = parse_subset_count(EVAL["bleu"]["split"], 32)

# German prompts used for the free-generation phase.
PROMPTS = [
    "Schreibe einen kurzen Absatz über nachhaltige KI.",
    "Erkläre in einfachen Worten, was Quantisierung in neuronalen Netzen ist.",
    "Nenne drei Vorteile von Mixture-of-Experts-Modellen.",
]

# sacreBLEU scorer, loaded once and shared by every run.
bleu_metric = evaluate.load("sacrebleu")

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ============= Helper-Funktionen =============
def autocast_ctx():
    """Return a float16 CUDA autocast context on GPU, else a no-op context."""
    if device != "cuda":
        return nullcontext()
    return torch.autocast(device_type="cuda", dtype=torch.float16)
def bytes_to_gb(b):
    """Convert a byte count to GiB (units of 1024**3 bytes) as a float."""
    bytes_per_gib = 1024 ** 3
    return float(b) / bytes_per_gib
def capture_memory():
    """Return ``(ram, vram_allocated, vram_reserved)`` in bytes.

    ``ram`` is the resident set size of the current process. The two VRAM
    figures come from ``torch.cuda`` and are 0 when not running on CUDA.
    """
    ram = psutil.Process().memory_info().rss
    if device == "cuda":
        valloc = torch.cuda.memory_allocated()
        vres = torch.cuda.memory_reserved()
    else:
        valloc = 0
        vres = 0
    return ram, valloc, vres
def safe_max_len(tok, model, fallback=2048, upper=100000):
    """Pick a usable maximum sequence length.

    Preference order: the tokenizer's ``model_max_length``, then the model
    config's ``max_position_embeddings``, then ``fallback``. A candidate is
    only accepted when it is an int strictly between 0 and ``upper``.
    """
    def _usable(value):
        return isinstance(value, int) and 0 < value < upper

    tok_limit = getattr(tok, "model_max_length", None)
    if _usable(tok_limit):
        return tok_limit

    config = getattr(model, "config", None)
    model_limit = getattr(config, "max_position_embeddings", None)
    if _usable(model_limit):
        return model_limit

    return fallback

# Warning-free, deterministic generation: sampling is disabled and a single
# beam is used (greedy decoding). The sampling parameters are explicitly set
# to None — presumably to avoid transformers warnings about sampling settings
# that are unused when do_sample=False; TODO confirm.
GC_GREEDY = GenerationConfig(
    do_sample=False,
    temperature=None,
    top_p=None,
    top_k=None,
    num_beams=1,
)

def warmup(model, tok, max_len):
    """Run one single-token greedy generation on the prompt "Hello".

    The output is discarded; the call happens before the measured phases.
    """
    with torch.no_grad(), autocast_ctx():
        inputs = tok("Hello", return_tensors="pt", truncation=True, max_length=max_len)
        inputs = inputs.to(model.device)
        model.generate(
            **inputs,
            max_new_tokens=1,
            generation_config=GC_GREEDY,
            pad_token_id=tok.eos_token_id,
        )

def do_gen(model, tok, max_new_tokens):
    """Greedily generate a continuation for every prompt in ``PROMPTS``.

    The per-prompt budget is ``max_new_tokens`` capped by the room left
    below the maximum sequence length, but never less than 1.

    Returns:
        ``(texts, total)``: the decoded full sequences (prompt plus
        continuation) and the total number of newly generated tokens.
    """
    limit = safe_max_len(tok, model)
    decoded = []
    generated = 0
    for prompt in PROMPTS:
        batch = tok(prompt, return_tensors="pt", truncation=True, max_length=limit).to(model.device)
        prompt_len = batch["input_ids"].shape[1]
        budget = max(1, min(max_new_tokens, int(limit - prompt_len)))
        with torch.no_grad(), autocast_ctx():
            seq = model.generate(
                **batch,
                max_new_tokens=budget,
                generation_config=GC_GREEDY,
                pad_token_id=tok.eos_token_id,
            )
        generated += int(seq.shape[1] - batch["input_ids"].shape[1])
        decoded.append(tok.decode(seq[0], skip_special_tokens=True))
    return decoded, generated

def do_ppl(model, tok, ds_cfg):
    """Compute token-level perplexity of ``model`` on a text dataset slice.

    Each usable text line is scored separately (truncated to the maximum
    sequence length). The mean loss reported by the model for a line is
    weighted by the number of predicted tokens in that line, so the result is
    ``exp(total NLL / total predicted tokens)``. An unweighted mean of
    per-line losses would let short lines dominate the score.

    Skipped: non-string entries, lines with fewer than 4 non-blank
    characters, lines that tokenize to fewer than 2 tokens (nothing to
    predict) and lines whose loss is not finite.

    Args:
        ds_cfg: dict with the keys ``name``, ``config`` and ``split``.

    Returns:
        The perplexity as a float, or None when no line could be scored.
    """
    ds = load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml = safe_max_len(tok, model)
    total_nll = 0.0
    total_tokens = 0
    with torch.no_grad():
        for t in ds["text"]:
            if not isinstance(t, str) or len(t.strip()) < 4:
                continue
            enc = tok(t, return_tensors="pt", truncation=True, max_length=ml).to(model.device)
            # With labels == input_ids the first token has no prediction target.
            n_pred = int(enc["input_ids"].shape[1]) - 1
            if n_pred < 1:
                continue
            with autocast_ctx():
                out = model(enc["input_ids"], labels=enc["input_ids"])
            loss = float(out.loss.detach().cpu())
            if not math.isfinite(loss):
                continue
            total_nll += loss * n_pred
            total_tokens += n_pred
    return math.exp(total_nll / total_tokens) if total_tokens else None

def do_bleu(model, tok, ds_cfg, max_new_tokens):
    """Translate German to English with a fixed prompt and score with sacreBLEU.

    For every example the model continues the prompt greedily. The
    hypothesis is the first line of the newly generated tokens only. The
    prompt is sliced off by token position rather than by searching the
    decoded text for "English:", so a repeated marker in the output cannot
    shift the cut, and an empty continuation yields an empty hypothesis
    instead of the echoed prompt.

    Args:
        ds_cfg: dict with the keys ``name``, ``config`` and ``split``; each
            example must provide ``translation`` with ``de`` and ``en``.
        max_new_tokens: generation budget per example (capped by the room
            left below the maximum sequence length, at least 1).

    Returns:
        The corpus-level sacreBLEU score as a float.
    """
    ds = load_dataset(ds_cfg["name"], ds_cfg["config"], split=ds_cfg["split"])
    ml = safe_max_len(tok, model)
    preds, refs = [], []
    with torch.no_grad():
        for ex in ds:
            de, en = ex["translation"]["de"], ex["translation"]["en"]
            prompt = f"Translate to English:\nGerman: {de}\nEnglish:"
            enc = tok(prompt, return_tensors="pt", truncation=True, max_length=ml).to(model.device)
            prompt_len = enc["input_ids"].shape[1]
            cur_new = max(1, min(max_new_tokens, int(ml - prompt_len)))
            with autocast_ctx():
                out = model.generate(**enc, max_new_tokens=cur_new, generation_config=GC_GREEDY, pad_token_id=tok.eos_token_id)
            # Single unpadded sequence: everything after prompt_len is new output.
            continuation = tok.decode(out[0][prompt_len:], skip_special_tokens=True).strip()
            hyp = continuation.split("\n")[0].strip()
            preds.append(hyp)
            refs.append([en])
    return float(bleu_metric.compute(predictions=preds, references=refs)["score"])

In [None]:
# ============= Quantisiertes Laden (8-bit / 4-bit) =============
def load_quantized(model_id: str, variant: str) -> Tuple[AutoTokenizer, AutoModelForCausalLM, str]:
    """Load tokenizer and bitsandbytes-quantized model for ``model_id``.

    Args:
        model_id: Hugging Face model identifier.
        variant: ``"8bit"`` (LLM.int8 loading) or ``"4bit"`` (NF4 with
            double quantization and float16 compute dtype).

    Returns:
        ``(tokenizer, model, variant)``. The tokenizer pads on the left and
        reuses the EOS token as pad token when none is defined.

    Raises:
        ValueError: if ``variant`` is neither ``"8bit"`` nor ``"4bit"``.
            This is checked first, before anything is downloaded.
    """
    if variant == "8bit":
        bnb = BitsAndBytesConfig(load_in_8bit=True)
    elif variant == "4bit":
        bnb = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4",
                                 bnb_4bit_use_double_quant=True, bnb_4bit_compute_dtype=torch.float16)
    else:
        raise ValueError("variant must be '8bit' or '4bit'")

    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    tok.padding_side = "left"
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token

    if device == "cuda":
        # Collect first: only tensors that have actually been freed can be
        # released from the CUDA caching allocator by empty_cache().
        gc.collect()
        torch.cuda.empty_cache()
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map="auto", quantization_config=bnb, attn_implementation="sdpa"
    )
    return tok, model, variant

In [None]:
# ============= Einzellauf (gen/ppl/bleu) =============
def run_once(model_id, alias, variant):
    """Load one quantized variant and measure generation, perplexity and BLEU.

    Each phase runs under ``measure_phase``; time, energy and CO2 are summed
    over the three phases. Memory figures are captured once, after the last
    phase.

    Returns:
        ``(row, samples, phases)``: the summary dict for the result table,
        the generated sample texts, and the per-phase metric dicts in the
        order (gen, ppl, bleu).
    """
    tok, model, prec = load_quantized(model_id, variant)
    warmup(model, tok, safe_max_len(tok, model))
    prefix = f"{RESULT_BASENAME}_{alias}_{variant}"

    def _gen_phase():
        return do_gen(model, tok, EVAL["max_new_tokens"])

    def _ppl_phase():
        return do_ppl(model, tok, EVAL["ppl"])

    def _bleu_phase():
        return do_bleu(model, tok, EVAL["bleu"], EVAL["max_new_tokens"])

    m_gen, (samples, n_tok) = measure_phase("gen", _gen_phase, prefix)
    m_ppl, ppl_value = measure_phase("ppl", _ppl_phase, prefix)
    if device == "cuda":
        torch.cuda.empty_cache()
    m_bleu, bleu_value = measure_phase("bleu", _bleu_phase, prefix)

    phase_metrics = (m_gen, m_ppl, m_bleu)
    time_s = m_gen["time_s"] + m_ppl["time_s"] + m_bleu["time_s"]
    energy = m_gen["energy_kwh"] + m_ppl["energy_kwh"] + m_bleu["energy_kwh"]
    co2 = m_gen["co2_kg"] + m_ppl["co2_kg"] + m_bleu["co2_kg"]
    ram, valloc, vres = capture_memory()

    row = {
        "model_id": model_id,
        "alias": alias,
        "variant": variant,
        "precision": prec,
        "time_s": time_s,
        "energy_kwh": energy,
        "co2_kg": co2,
        # Carbon intensity is undefined when no energy was recorded.
        "kg_per_kwh": (co2 / energy if energy else None),
        "tokens_out": int(n_tok),
        "ppl": ppl_value,
        "bleu": bleu_value,
        "ram_GB": bytes_to_gb(ram),
        "vram_alloc_GB": bytes_to_gb(valloc),
        "vram_reserved_GB": bytes_to_gb(vres),
        "notes": f"GPU={gpu_name}, VRAM={vram_total_gb:.1f} GB",
    }
    return row, samples, phase_metrics

In [None]:
# ============= Ausführung (8-bit & 4-bit) + Speichern =============
variants = ["8bit", "4bit"]
all_rows, phase_tables = [], []

for variant in variants:
    print(f"\n### {ALIAS} – {variant}")
    row, samples, phases = run_once(MODEL_ID, ALIAS, variant)
    all_rows.append(row)

    # Per-phase table for this variant; infinite ratios (zero energy) become NaN.
    dfp = pd.DataFrame(list(phases))
    dfp["alias"] = ALIAS
    dfp["variant"] = variant
    phase_ratio = dfp["co2_kg"] / dfp["energy_kwh"]
    dfp["kg_per_kwh"] = phase_ratio.replace([np.inf, -np.inf], np.nan)
    phase_tables.append(dfp)

    # Store the generated sample texts next to the result tables.
    samples_path = os.path.join(project_dir, f"{RESULT_BASENAME}_samples_{ALIAS}_{variant}.txt")
    with open(samples_path, "w", encoding="utf-8") as f:
        for i, t in enumerate(samples, 1):
            f.write(f"--- Beispiel {i} ---\n{t}\n\n")

# Combine all variants and write both tables as CSV into the project folder.
df = pd.DataFrame(all_rows)
df_phase = pd.concat(phase_tables, ignore_index=True)
results_path = os.path.join(project_dir, f"{RESULT_BASENAME}_results.csv")
per_phase_path = os.path.join(project_dir, f"{RESULT_BASENAME}_results_per_phase.csv")
df.to_csv(results_path, index=False)
df_phase.to_csv(per_phase_path, index=False)
print("\nErgebnisse:\n", df)
print("\nPer-Phase:\n", df_phase)
print("Gespeichert:", results_path)


### r1q15b – 8bit


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

de-en/train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

de-en/train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

de-en/validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

de-en/test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]


### r1q15b – 4bit

Ergebnisse:
                                     model_id   alias variant precision  \
0  deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B  r1q15b    8bit      8bit   
1  deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B  r1q15b    4bit      4bit   

       time_s  energy_kwh    co2_kg  kg_per_kwh  tokens_out         ppl  \
0  197.950969    0.005993  0.002821    0.470783          96  202.727943   
1   79.516219    0.002485  0.001170    0.470783          96  213.635419   

        bleu    ram_GB  vram_alloc_GB  vram_reserved_GB  \
0  11.736827  4.578312       2.140763          2.242188   
1  11.105658  5.511726       1.519278          1.562500   

                                     notes  
0  GPU=NVIDIA A100-SXM4-80GB, VRAM=79.3 GB  
1  GPU=NVIDIA A100-SXM4-80GB, VRAM=79.3 GB  

Per-Phase:
   phase      time_s  energy_kwh    co2_kg   alias variant  kg_per_kwh
0   gen   13.278282    0.000404  0.000190  r1q15b    8bit    0.470783
1   ppl   14.834761    0.000444  0.000209  r1q15b   