<a href="https://colab.research.google.com/github/Ino54/MA_GreenAI-Practical-Experiments/blob/main/deepseek_pruning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# =====================  Requirements =====================
%%writefile requirements.txt
transformers>=4.44
accelerate>=0.33
bitsandbytes
datasets>=2.20
evaluate>=0.4
sacrebleu>=2.4
codecarbon>=2.5,<3
psutil
pynvml>=12,<13
numpy==2.0.2
pandas==2.2.2

Writing requirements.txt


In [None]:
# ===================== 0b) Install =====================
!pip install -q -U -r requirements.txt --no-warn-conflicts
!pip uninstall -y -q google-genai firebase-admin || true

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.8/51.8 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m114.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m60.1/60.1 MB[0m [31m44.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m503.6/503.6 kB[0m [31m41.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m104.1/104.1 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m517.6/517.6 kB[0m [31m45.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# ===================== Drive & Ordner =====================
import os, pathlib, shutil, time, warnings, platform
warnings.filterwarnings("ignore")

from google.colab import drive
MOUNT="/content/drive"
already=os.path.isdir(os.path.join(MOUNT,"MyDrive"))
if not already and os.path.isdir(MOUNT) and os.listdir(MOUNT):
    backup=f"/content/drive_stale_{int(time.time())}"
    shutil.move(MOUNT, backup); os.makedirs(MOUNT, exist_ok=True)
drive.mount(MOUNT, force_remount=(not already))

project_path = "/content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_pruning"
pathlib.Path(project_path).mkdir(parents=True, exist_ok=True)
os.chdir(project_path); print("Arbeitsordner:", os.getcwd())

Mounted at /content/drive
Arbeitsordner: /content/drive/MyDrive/LLM-Effizienz/4_3_Effizienzstrategien/deepseek_pruning


In [None]:
# ===================== 2) HF Login (optional) =====================
from google.colab import userdata
from huggingface_hub import login
hf_token = userdata.get("HF_TOKEN")
if hf_token:
    login(hf_token); print("Hugging Face Login erfolgreich!")
else:
    print("WARNUNG: Kein HF_TOKEN.")

Hugging Face Login erfolgreich!


In [None]:
# ===================== 3) Imports & Device =====================
import math, gc, pandas as pd, numpy as np, psutil, torch, torch.nn as nn
import torch.nn.utils.prune as prune
from datasets import load_dataset
import evaluate
from transformers import (AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig,
                          GenerationConfig)
from codecarbon import EmissionsTracker
from contextlib import nullcontext
from types import SimpleNamespace

device = "cuda" if torch.cuda.is_available() else "cpu"
if device=="cuda":
    gpu_name = torch.cuda.get_device_name(0)
    vram_total_gb = torch.cuda.get_device_properties(0).total_memory/(1024**3)
else:
    gpu_name, vram_total_gb = "CPU", 0.0
print(f"Device: {device} | GPU: {gpu_name} | VRAM: {vram_total_gb:.1f} GB | Torch {torch.__version__} | Py {platform.python_version()}")

Device: cuda | GPU: NVIDIA A100-SXM4-40GB | VRAM: 39.6 GB | Torch 2.8.0+cu126 | Py 3.12.11


In [None]:
# ===================== CodeCarbon Helpers (Berlin) =====================
import os as _os, time as _time, inspect as _inspect

_os.environ["CODECARBON_ALLOW_MULTIPLE_RUNS"]="true"
USE_GCP_REGION=True; COUNTRY_ISO_CODE="DEU"; CLOUD_PROVIDER="google"; CLOUD_REGION="europe-west10"

def _trk_base():
    base=dict(log_level="error", output_dir=".")
    sig=_inspect.signature(EmissionsTracker.__init__)
    if "measure_power_secs" in sig.parameters: base["measure_power_secs"]=1
    if "tracking_mode" in sig.parameters:      base["tracking_mode"]="process"
    if USE_GCP_REGION:
        if "cloud_provider" in sig.parameters: base["cloud_provider"]=CLOUD_PROVIDER
        if "cloud_region" in sig.parameters:   base["cloud_region"]=CLOUD_REGION
        if "country_iso_code" in sig.parameters: base["country_iso_code"]=COUNTRY_ISO_CODE
    else:
        if "country_iso_code" in sig.parameters: base["country_iso_code"]=COUNTRY_ISO_CODE
    return base

def _tracker(name,out):
    cache=f"/content/.codecarbon_cache_{name}_{int(_time.time())}"
    _os.environ["CODECARBON_CACHE_DIR"]=cache
    for d in (_os.path.expanduser("~/.codecarbon"), "/content/.codecarbon"):
        lf=_os.path.join(d,"codecarbon.lock")
        if _os.path.exists(lf):
            try: _os.remove(lf)
            except: pass
    return EmissionsTracker(project_name=name, output_file=out, **_trk_base())

def measure(phase, fn, prefix):
    log=f"{prefix}_{phase}.csv"; tr=_tracker(f"{prefix}_{phase}",log)
    started=False
    try:
        tr.start(); started=True
    except Exception as e:
        print("[CodeCarbon] Start-Fehler:", e)
    t0=_time.time(); res=fn(); t1=_time.time()
    e = tr.stop() if started else None

    def _up(e):
        if e is None: return 0.0,0.0
        if hasattr(e,"energy_consumed") and hasattr(e,"emissions"):
            return float(e.energy_consumed), float(e.emissions)
        if isinstance(e,dict):
            return float(e.get("energy_consumed",0.0)), float(e.get("emissions", e.get("emissions_kg",0.0)))
        return 0.0,0.0

    ekwh,co2=_up(e)
    if ekwh==0.0 and _os.path.exists(log):
        try:
            df=pd.read_csv(log)
            for c in ["energy_consumed","energy_consumed_kwh","energy (kWh)","energy_consumed (kWh)"]:
                if c in df.columns: ekwh=float(df[c].iloc[-1])
        except: pass
    return {"phase":phase,"time_s":t1-t0,"energy_kwh":ekwh,"co2_kg":co2}, res

In [None]:
# ===================== Konfiguration =====================
MODEL_ID     = "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B"
ALIAS        = "r1q15b"
PRUNE_AMOUNT = 0.20  # 20% global L1
PRUNE_TARGETS = ["q_proj","k_proj","v_proj","o_proj","up_proj","down_proj","gate_proj"]

# Greedy-Generation ohne Sampling-Warnungen
GC_GREEDY = GenerationConfig(
    do_sample=False, temperature=None, top_p=None, top_k=None, num_beams=1
)

In [None]:
# ===================== Laden (CPU) & Pruning =====================
def load_for_pruning_cpu(model_id:str):
    tok = AutoTokenizer.from_pretrained(model_id, use_fast=True)
    if tok.pad_token_id is None: tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        model_id, device_map=None, torch_dtype=torch.float32, low_cpu_mem_usage=True
    ).eval()
    return tok, model

def global_magnitude_prune(model, amount:float):
    params=[]
    for name, module in model.named_modules():
        if isinstance(module, nn.Linear) and any(tag in name for tag in PRUNE_TARGETS):
            params.append((module, "weight"))
    if not params:
        print("[Pruning] Keine passenden Linear-Layer gefunden."); return 0.0
    print("[Pruning] betroffene Linear-Layer:", len(params))
    fallback=False
    try:
        with torch.no_grad():
            prune.global_unstructured(params, pruning_method=prune.L1Unstructured, amount=amount)
    except Exception as e:
        print("[Pruning] global fehlgeschlagen, layer-weise:", repr(e))
        with torch.no_grad():
            for m, n in params:
                prune.l1_unstructured(m, n, amount=amount)
        fallback=True

    # Sparsity messen über Masken, dann remove()
    total=zeros=0
    for m,_ in params:
        mask = dict(m.named_buffers()).get("weight_mask", None)
        if mask is not None:
            zeros += int((mask == 0).sum().item())
            total += mask.numel()
        try: prune.remove(m,"weight")
        except: pass
    sp = (zeros/total) if total else 0.0
    print(f"[Pruning] fertig. Modus: {'layer-weise' if fallback else 'global'} | Sparsity ≈ {sp:.3f}")
    return sp

In [None]:
# ===================== Eval (gen/ppl/bleu) =====================
bleu_metric = evaluate.load("sacrebleu")

def safe_max_len(tok, model, fallback=512, upper=100000):
    cand=getattr(tok,"model_max_length",None)
    if isinstance(cand,int) and 0<cand<upper: return min(cand,fallback)
    cand=getattr(getattr(model,"config",None),"max_position_embeddings",None)
    if isinstance(cand,int) and 0<cand<upper: return min(cand,fallback)
    return fallback

def eval_generate(tok, model, max_new_tokens=32):
    prompts=[
      "List two advantages of pruning LLMs.",
      "Explain global magnitude pruning in one paragraph.",
      "Why does pruning help energy efficiency?"
    ]
    ml=safe_max_len(tok, model); outs=[]; n_tok=0
    with torch.no_grad():
        for p in prompts:
            enc=tok(p, return_tensors="pt", truncation=True, max_length=ml).to(model.device)
            room=ml-enc["input_ids"].shape[1]; cur=max(1,min(max_new_tokens,int(room)))
            out=model.generate(**enc, max_new_tokens=cur, generation_config=GC_GREEDY, pad_token_id=tok.eos_token_id)
            n_tok+=int(out.shape[1]-enc["input_ids"].shape[1])
            outs.append(tok.decode(out[0], skip_special_tokens=True))
    return outs, n_tok

def eval_ppl(tok, model, split="test[:1%]"):
    ds=load_dataset("wikitext","wikitext-2-raw-v1", split=split)
    ml=safe_max_len(tok, model); losses=[]
    with torch.no_grad():
        for t in ds["text"]:
            if not isinstance(t,str) or len(t.strip())<4: continue
            enc=tok(t, return_tensors="pt", truncation=True, max_length=ml)
            ids=enc["input_ids"].to(model.device)
            out=model(ids, labels=ids)
            losses.append(float(out.loss.detach().cpu()))
    return math.exp(np.mean(losses)) if losses else None

def eval_bleu(tok, model, split="test[:32]", max_new_tokens=32):
    ds=load_dataset("wmt14","de-en", split=split)
    ml=safe_max_len(tok, model); preds, refs=[],[]
    with torch.no_grad():
        for ex in ds:
            de, en = ex["translation"]["de"], ex["translation"]["en"]
            prompt = f"Translate to English:\nGerman: {de}\nEnglish:"
            enc=tok(prompt, return_tensors="pt", truncation=True, max_length=ml).to(model.device)
            room=ml-enc["input_ids"].shape[1]; cur=max(1,min(max_new_tokens,int(room)))
            out=model.generate(**enc, max_new_tokens=cur, generation_config=GC_GREEDY, pad_token_id=tok.eos_token_id)
            gen=tok.decode(out[0], skip_special_tokens=True)
            hyp=gen.split("English:")[-1].strip().split("\n")[0].strip() or gen.strip()
            preds.append(hyp); refs.append([en])
    return float(bleu_metric.compute(predictions=preds, references=refs)["score"])

Downloading builder script: 0.00B [00:00, ?B/s]

In [None]:
# ===================== Pipeline: prune → save → reload(int8) → eval =====================
def run_once(model_id:str, alias="model", prune_amount=PRUNE_AMOUNT):
    print(f"\n### Starte Pruning ({int(prune_amount*100)}%): {alias} ({model_id})")
    tok_cpu, model_cpu = load_for_pruning_cpu(model_id)
    prefix = f"deepseek_pruning_{alias}"

    # Phasen: prune
    m_prune, sparsity = measure("prune", lambda: global_magnitude_prune(model_cpu, prune_amount), prefix)

    # Speichern des pruned Modells
    save_dir = os.path.join(project_path, f"pruned_{int(prune_amount*100)}pct_tmp")
    os.makedirs(save_dir, exist_ok=True)
    model_cpu.save_pretrained(save_dir)
    tok_cpu.save_pretrained(save_dir)
    del model_cpu; gc.collect()

    # Reload quantisiert (8-bit) für Eval
    bnb8 = BitsAndBytesConfig(load_in_8bit=True)
    tok = AutoTokenizer.from_pretrained(save_dir, use_fast=True)
    if tok.pad_token_id is None: tok.pad_token = tok.eos_token
    model = AutoModelForCausalLM.from_pretrained(
        save_dir, device_map="auto", quantization_config=bnb8, attn_implementation="sdpa"
    ).eval()

    # Phasen: gen / ppl / bleu
    m_gen,  (samples, n_tok) = measure("gen",  lambda: eval_generate(tok, model, 32), prefix)
    m_ppl,  ppl_val          = measure("ppl",  lambda: eval_ppl(tok, model, "test[:1%]"), prefix)
    if device=="cuda": torch.cuda.empty_cache()
    m_bleu, bleu_val         = measure("bleu", lambda: eval_bleu(tok, model, "test[:32]", 32), prefix)

    # Speicherstände
    ram  = psutil.Process().memory_info().rss/(1024**3)
    valloc= torch.cuda.memory_allocated()/(1024**3) if device=="cuda" else 0.0
    vres = torch.cuda.memory_reserved() /(1024**3) if device=="cuda" else 0.0

    row = dict(model_id=model_id, alias=alias, precision="int8 (pruned)",
               time_s=m_prune["time_s"]+m_gen["time_s"]+m_ppl["time_s"]+m_bleu["time_s"],
               energy_kwh=m_prune["energy_kwh"]+m_gen["energy_kwh"]+m_ppl["energy_kwh"]+m_bleu["energy_kwh"],
               co2_kg=m_prune["co2_kg"]+m_gen["co2_kg"]+m_ppl["co2_kg"]+m_bleu["co2_kg"],
               tokens_out=int(n_tok), ppl=ppl_val, bleu=bleu_val, sparsity=sparsity,
               ram_GB=ram, vram_alloc_GB=valloc, vram_reserved_GB=vres,
               notes=f"GPU={gpu_name}, VRAM={vram_total_gb:.1f} GB")
    phases = pd.DataFrame([
        {**m_prune, "phase":"prune"},
        {**m_gen,   "phase":"gen"},
        {**m_ppl,   "phase":"ppl"},
        {**m_bleu,  "phase":"bleu"},
    ])

    # Samples speichern
    with open("deepseek_pruning_samples.txt","w",encoding="utf-8") as f:
        for i,txt in enumerate(samples,1): f.write(f"--- Beispiel {i} ---\n{txt}\n\n")

    return row, phases

In [None]:
# ===================== Ausführen & Speichern =====================
row, phases = run_once(MODEL_ID, alias=ALIAS, prune_amount=PRUNE_AMOUNT)
df  = pd.DataFrame([row])
df["kg_per_kwh"]=(df["co2_kg"]/df["energy_kwh"]).replace([np.inf,-np.inf],np.nan)

df.to_csv("deepseek_pruning_results.csv", index=False)
phases.to_csv("deepseek_pruning_per_phase.csv", index=False)

print("Gespeichert:")
print(" - deepseek_pruning_results.csv")
print(" - deepseek_pruning_per_phase.csv")
print(" - deepseek_pruning_samples.txt")

from IPython.display import display
display(df); display(phases)

print("\nKurzer Zusammenfassungsauszug:")
print(df[["sparsity","ppl","bleu","tokens_out","energy_kwh","co2_kg","time_s"]].to_string(index=False))


### Starte Pruning (20%): r1q15b (deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B)


tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/679 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.55G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]



[Pruning] betroffene Linear-Layer: 196
[Pruning] fertig. Modus: global | Sparsity ≈ 0.200


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

`generation_config` default values have been modified to match model-specific defaults: {'do_sample': True, 'bos_token_id': 151646, 'eos_token_id': 151643}. If this is not desired, please set these values explicitly.


README.md: 0.00B [00:00, ?B/s]

wikitext-2-raw-v1/test-00000-of-00001.pa(…):   0%|          | 0.00/733k [00:00<?, ?B/s]

wikitext-2-raw-v1/train-00000-of-00001.p(…):   0%|          | 0.00/6.36M [00:00<?, ?B/s]

wikitext-2-raw-v1/validation-00000-of-00(…):   0%|          | 0.00/657k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/4358 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/36718 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3760 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

de-en/train-00000-of-00003.parquet:   0%|          | 0.00/280M [00:00<?, ?B/s]

de-en/train-00001-of-00003.parquet:   0%|          | 0.00/265M [00:00<?, ?B/s]

de-en/train-00002-of-00003.parquet:   0%|          | 0.00/273M [00:00<?, ?B/s]

de-en/validation-00000-of-00001.parquet:   0%|          | 0.00/474k [00:00<?, ?B/s]

de-en/test-00000-of-00001.parquet:   0%|          | 0.00/509k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4508785 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/3000 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/3003 [00:00<?, ? examples/s]

Gespeichert:
 - deepseek_pruning_results.csv
 - deepseek_pruning_per_phase.csv
 - deepseek_pruning_samples.txt


Unnamed: 0,model_id,alias,precision,time_s,energy_kwh,co2_kg,tokens_out,ppl,bleu,sparsity,ram_GB,vram_alloc_GB,vram_reserved_GB,notes,kg_per_kwh
0,deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B,r1q15b,int8 (pruned),254.536563,0.007069,0.0,96,228.529511,5.439031,0.2,4.611973,2.101943,2.828125,"GPU=NVIDIA A100-SXM4-40GB, VRAM=39.6 GB",0.0


Unnamed: 0,phase,time_s,energy_kwh,co2_kg
0,prune,74.81119,0.002176,0.0
1,gen,16.775126,0.000455,0.0
2,ppl,9.360241,0.000253,0.0
3,bleu,153.590006,0.004185,0.0



Kurzer Zusammenfassungsauszug:
 sparsity        ppl     bleu  tokens_out  energy_kwh  co2_kg     time_s
      0.2 228.529511 5.439031          96    0.007069     0.0 254.536563
