# pip

In [8]:
!pip install -q transformers
!pip install -q peft
!pip install -q evaluate
!pip install sacremoses
!pip install bitsandbytes


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Downloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sacremoses
Successfully installed sacremoses-0.1.1


# tunning

In [24]:
# -*- coding: utf-8 -*-
import os
import numpy as np
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
from sklearn.metrics import classification_report, confusion_matrix

# ========================
# Konfiguracja
# ========================
CSV_PATH = "train_clean.csv"   # musi mieć kolumny: text,label (etykiety: "inne"/"krus")
MODEL_NAME = "allegro/herbert-base-cased"
os.environ["WANDB_DISABLED"] = "true"  # wyłącza Weights & Biases

# ========================
# 1) Wczytanie CSV -> Dataset + split
# ========================
raw = load_dataset("csv", data_files={"data": CSV_PATH})

# Gdyby kolumna nazywała się "tekst", to ją przemianuj na "text"
cols = raw["data"].column_names
if "tekst" in cols and "text" not in cols:
    raw = raw.rename_column("tekst", "text")

assert "text" in raw["data"].column_names and "label" in raw["data"].column_names, \
    "CSV musi mieć kolumny 'text' i 'label'."

# Uporządkuj etykiety (lower/trim), a potem zakoduj na int
raw = raw.map(lambda ex: {"label": ex["label"].strip().lower()})
raw = raw.class_encode_column("label")

names = raw["data"].features["label"].names  # np. ['inne', 'krus'] (kolejność wg danych)
id2label = {i: n for i, n in enumerate(names)}
label2id = {n: i for i, n in enumerate(names)}
print("Label mapping:", label2id)

# Podział 80/20 na train/test
splits = raw["data"].train_test_split(test_size=0.2, seed=42)
ds = DatasetDict(train=splits["train"], test=splits["test"])

# ========================
# 2) Tokenizer + preprocess (Z LABELS!)
# ========================
tok = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def preprocess(batch):
    enc = tok(batch["text"], truncation=True, max_length=128)  # padding zrobi collator
    enc["labels"] = batch["label"]  # KLUCZOWE: Trainer > model loss
    return enc

tok_ds = ds.map(preprocess, batched=True, remove_columns=ds["train"].column_names)
print("Przykładowe klucze w batchu:", tok_ds["train"][0].keys())
# powinno zawierać: input_ids, token_type_ids, attention_mask, labels

data_collator = DataCollatorWithPadding(tokenizer=tok)

# ========================
# 3) Model
# ========================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(names),
    id2label=id2label,
    label2id=label2id,
)

# ========================
# 4) Metryki
# ========================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    acc = (preds == labels).mean()
    # Macro F1/precision/recall (bez ważenia klas)
    from sklearn.metrics import precision_recall_fscore_support
    p, r, f1, _ = precision_recall_fscore_support(labels, preds, average="macro", zero_division=0)
    return {"accuracy": float(acc), "precision": float(p), "recall": float(r), "f1": float(f1)}

# ========================
# 5) Trainer + trening
# ========================
args = TrainingArguments(
    output_dir="./herbert_krus_model",
    run_name="herbert_krus_inne",
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_dir="./logs",
    report_to=[],   # nic nie raportujemy (W&B off)
    seed=42,
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tok_ds["train"],
    eval_dataset=tok_ds["test"],
    tokenizer=tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

print("Start treningu…")
trainer.train()

print("Ewaluacja:")
eval_metrics = trainer.evaluate()
print(eval_metrics)

# ========================
# 6) Raport szczegółowy na teście
# ========================
pred = trainer.predict(tok_ds["test"])
y_true = pred.label_ids
y_pred = np.argmax(pred.predictions, axis=-1)

print("\nClassification report:")
print(classification_report(y_true, y_pred, target_names=names, digits=2))

print("Confusion matrix:")
print(confusion_matrix(y_true, y_pred))

# ========================
# 7) Zapis modelu + tokenizer
# ========================
save_dir = "./herbert_krus_model3"
trainer.save_model(save_dir)
tok.save_pretrained(save_dir)
print(f"Zapisano do: {save_dir}")


Generating data split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/2679 [00:00<?, ? examples/s]

Casting to class labels:   0%|          | 0/2679 [00:00<?, ? examples/s]

Label mapping: {'inne': 0, 'krus': 1}


Map:   0%|          | 0/2143 [00:00<?, ? examples/s]

Map:   0%|          | 0/536 [00:00<?, ? examples/s]

Przykładowe klucze w batchu: dict_keys(['input_ids', 'token_type_ids', 'attention_mask', 'labels'])


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allegro/herbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Start treningu…


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,No log,0.142035,0.94403,0.93909,0.949224,0.942755
2,0.176300,0.090383,0.972015,0.972028,0.969924,0.970942
3,0.176300,0.144902,0.973881,0.971768,0.974381,0.973015


Ewaluacja:


{'eval_loss': 0.1449022740125656, 'eval_accuracy': 0.9738805970149254, 'eval_precision': 0.9717679462902393, 'eval_recall': 0.9743811666955167, 'eval_f1': 0.9730149597238205, 'eval_runtime': 0.7228, 'eval_samples_per_second': 741.531, 'eval_steps_per_second': 92.691, 'epoch': 3.0}

Classification report:
              precision    recall  f1-score   support

        inne       0.98      0.97      0.98       318
        krus       0.96      0.98      0.97       218

    accuracy                           0.97       536
   macro avg       0.97      0.97      0.97       536
weighted avg       0.97      0.97      0.97       536

Confusion matrix:
[[309   9]
 [  5 213]]
Zapisano do: ./herbert_krus_model3


# Metryki

In [14]:
metrics = trainer.evaluate()
print(metrics)


{'eval_loss': 0.20416143536567688, 'eval_accuracy': 0.9521739130434783, 'eval_precision': 0.9515997805150329, 'eval_recall': 0.9531086853345989, 'eval_f1': 0.952064267985373, 'eval_runtime': 0.6276, 'eval_samples_per_second': 732.986, 'eval_steps_per_second': 92.42, 'epoch': 3.0}


In [15]:
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix

pred = trainer.predict(tok_ds["test"])
y_true = pred.label_ids
y_pred = pred.predictions.argmax(axis=1)

target_names = [id2label[i] for i in range(len(id2label))]
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(y_true, y_pred))


              precision    recall  f1-score   support

        inne       0.93      0.97      0.95       215
        krus       0.97      0.94      0.95       245

    accuracy                           0.95       460
   macro avg       0.95      0.95      0.95       460
weighted avg       0.95      0.95      0.95       460

[[208   7]
 [ 15 230]]


# Tebele z błędami

In [16]:
import numpy as np
import pandas as pd

# 1) Predykcje na zbiorze testowym
pred = trainer.predict(tok_ds["test"])
logits = pred.predictions
y_true = pred.label_ids
y_pred = np.argmax(logits, axis=1)

# 2) Softmax (p-stwa klas)
probs = np.exp(logits - logits.max(axis=1, keepdims=True))
probs = probs / probs.sum(axis=1, keepdims=True)

# 3) Złóż DataFrame z oryginalnych danych testowych
df_test = pd.DataFrame({
    "text": ds["test"]["text"],
    "true_id": y_true,
    "pred_id": y_pred,
})
df_test["true_label"] = df_test["true_id"].map({i:n for i,n in enumerate(names)})
df_test["pred_label"] = df_test["pred_id"].map({i:n for i,n in enumerate(names)})

# kolumny z p-stwami klas (nazwy jak w `names`, np. ['inne','krus'])
for i, n in enumerate(names):
    df_test[f"prob_{n}"] = probs[:, i]

# 4) Odfiltruj błędne klasyfikacje
df_wrong = df_test[df_test["true_id"] != df_test["pred_id"]].copy()
print(f"Ilość błędnie sklasyfikowanych: {len(df_wrong)} / {len(df_test)}")

# Pokaż kilka najpewniejszych pomyłek (wysokie p-stwo predykcji, a i tak źle)
df_wrong["pred_prob"] = df_wrong.apply(lambda r: r[f"prob_{r['pred_label']}"], axis=1)
display(df_wrong.sort_values("pred_prob", ascending=False).head(10)[
    ["text","true_label","pred_label","pred_prob"] + [f"prob_{n}" for n in names]
])

# 5) Zapis do CSV
df_wrong.to_csv("misclassified.csv", index=False, encoding="utf-8-sig")
print("Zapisano pomyłki do: misclassified.csv")

# 6) (opcjonalnie) rozbicie na FP/FN dla klasy 'krus'
if "krus" in names and "inne" in names:
    krus_id = names.index("krus")
    inne_id = names.index("inne")
    fp_krus = df_wrong[(df_wrong["true_id"] == inne_id) & (df_wrong["pred_id"] == krus_id)]
    fn_krus = df_wrong[(df_wrong["true_id"] == krus_id) & (df_wrong["pred_id"] == inne_id)]
    print("False POS (pred=krus, true=inne):", len(fp_krus))
    print("False NEG (pred=inne, true=krus):", len(fn_krus))
    # szybki podgląd:
    display(fp_krus.head(5)[["text","true_label","pred_label","pred_prob"]])
    display(fn_krus.head(5)[["text","true_label","pred_label","pred_prob"]])


Ilość błędnie sklasyfikowanych: 22 / 460


Unnamed: 0,text,true_label,pred_label,pred_prob,prob_inne,prob_krus
202,W jaki spos˘b spata kredytu wpywa na wysoko...,inne,krus,0.999383,0.000617,0.999383
287,Jakie kryteria decydujĽ o przyznaniu premii ro...,inne,krus,0.999338,0.000662,0.999338
168,Jakie sĽ g˘wne korzyci wynikajĽce dla Kasy z...,inne,krus,0.997854,0.002146,0.997854
62,Ilu przedstawicieli do Rady Nadzorczej Fundusz...,krus,inne,0.997221,0.997221,0.002779
223,Dla kogo mogĽ by ustanawiane premie?,krus,inne,0.996768,0.996768,0.003233
130,Kto zarzĽdza funduszem motywacyjnym?,krus,inne,0.996695,0.996695,0.003305
391,Jak Kasa zarzĽdza swoimi rodkami finansowymi?,krus,inne,0.99594,0.99594,0.00406
219,Jakie rodki stanowiĽ dotacje z budžetu paästwa?,krus,inne,0.992405,0.992405,0.007595
300,Jakie dziaania ze sfery zadaä publicznych mog...,krus,inne,0.991856,0.991856,0.008144
224,Kto jest odpowiedzialny za prowadzenie finans˘...,inne,krus,0.9917,0.0083,0.9917


Zapisano pomyłki do: misclassified.csv
False POS (pred=krus, true=inne): 7
False NEG (pred=inne, true=krus): 15


Unnamed: 0,text,true_label,pred_label,pred_prob
90,Co to jest odpis od funduszu skadkowego w wys...,inne,krus,0.974226
168,Jakie sĽ g˘wne korzyci wynikajĽce dla Kasy z...,inne,krus,0.997854
202,W jaki spos˘b spata kredytu wpywa na wysoko...,inne,krus,0.999383
224,Kto jest odpowiedzialny za prowadzenie finans˘...,inne,krus,0.9917
287,Jakie kryteria decydujĽ o przyznaniu premii ro...,inne,krus,0.999338


Unnamed: 0,text,true_label,pred_label,pred_prob
6,Czy fundusz skadkowy može mie niedobory?,krus,inne,0.988698
15,W jaki spos˘b dzielony jest zasiek pogrzebowy...,krus,inne,0.831926
47,Kto wyznacza przedstawicieli do Rady Nadzorcze...,krus,inne,0.991301
62,Ilu przedstawicieli do Rady Nadzorczej Fundusz...,krus,inne,0.997221
88,Kto dysponuje funduszami wymienionymi w artykule?,krus,inne,0.630517


In [17]:
df = pd.read_csv("misclassified.csv")
display(df)

Unnamed: 0,text,true_id,pred_id,true_label,pred_label,prob_inne,prob_krus,pred_prob
0,Czy fundusz skadkowy može mie niedobory?,1,0,krus,inne,0.988698,0.011302,0.988698
1,W jaki spos˘b dzielony jest zasiek pogrzebowy...,1,0,krus,inne,0.831926,0.168074,0.831926
2,Kto wyznacza przedstawicieli do Rady Nadzorcze...,1,0,krus,inne,0.991301,0.008699,0.991301
3,Ilu przedstawicieli do Rady Nadzorczej Fundusz...,1,0,krus,inne,0.997221,0.002779,0.997221
4,Kto dysponuje funduszami wymienionymi w artykule?,1,0,krus,inne,0.630517,0.369483,0.630517
5,Co to jest odpis od funduszu skadkowego w wys...,0,1,inne,krus,0.025774,0.974225,0.974226
6,Kto zarzĽdza funduszem motywacyjnym?,1,0,krus,inne,0.996695,0.003305,0.996695
7,Kto nadaje Kasie statut?,1,0,krus,inne,0.981255,0.018745,0.981255
8,Jakie sĽ g˘wne korzyci wynikajĽce dla Kasy z...,0,1,inne,krus,0.002146,0.997854,0.997854
9,Kto gwarantuje wypatŠ wiadczeä finansowanych...,1,0,krus,inne,0.97554,0.02446,0.97554


# zmiana datasetu na odpowiedni format csv

In [22]:
import pandas as pd

IN_PATH  = "train.csv"
OUT_PATH = "train_clean.csv"

# 1) Wczytaj cały plik jako tekst (bez sep="\n")
enc_used = None
for enc in ("utf-8-sig", "utf-8", "cp1250", "iso-8859-2", "latin1"):
    try:
        with open(IN_PATH, "r", encoding=enc) as f:
            lines = f.read().splitlines()
        enc_used = enc
        print("Wczytano z encoding =", enc)
        break
    except UnicodeDecodeError:
        pass
if enc_used is None:
    raise RuntimeError("Nie udało się odczytać pliku w typowych kodowaniach.")

df_raw = pd.DataFrame({"raw": lines})
df_raw = df_raw[df_raw["raw"].astype(str).str.strip().ne("")]  # usuń puste

# 2) Wywal ewentualny nagłówek "text,label" / "text ; label"
hdr = df_raw["raw"].str.strip().str.lower()
is_header = hdr.isin({"text,label", "text , label", "text;label", "text ; label"})
df_raw = df_raw[~is_header]

# 3) Podziel wiersz na 2 kolumny po PIERWSZYM ; lub ,
tmp = df_raw["raw"].str.replace(r"\s*[;,]\s*", "§§", regex=True)
parts = tmp.str.rsplit("§§", n=1, expand=True)
parts.columns = ["text", "label"]

# 4) Oczyść cudzysłowy i spacje
q = '"“”„«»\''
parts["text"]  = parts["text"].astype(str).str.strip().str.strip(q)
parts["label"] = parts["label"].astype(str).str.strip().str.strip(q).str.lower()

# 5) Zostaw tylko oczekiwane etykiety
parts = parts[parts["label"].isin(["krus", "inne"])]

# 6) Zapis
parts.to_csv(OUT_PATH, index=False, encoding="utf-8-sig")
print(parts.head())


Wczytano z encoding = iso-8859-2
                                                text label
1  Jakie warunki naleĹźy speĹniÄ§§aby otrzymaÄ...  krus
2    Kiedy Ĺwiadczenie emerytalne nie przysĹuguje?  krus
4         terminy wypĹaty ĹwiadczeĹ emerytalnych?  krus
6  wymagane do ubiegania siÄ o Ĺwiadczenie emer...  krus
8       zasady waloryzacji ĹwiadczeĹ emerytalnych?  krus


## wartości klas

In [23]:
import re
from collections import Counter

PATH = "train_clean.csv"

# Czytaj najpierw jako utf-8-sig (tak zwykle zapisuje pandas), z fallbackiem
for enc in ("utf-8-sig", "utf-8", "cp1250", "latin-1"):
    try:
        with open(PATH, encoding=enc) as f:
            lines = f.read().splitlines()
        print("Loaded with encoding:", enc)
        break
    except UnicodeDecodeError:
        continue

cnt = Counter()
bad = []

# etykieta MUSI być na samym końcu
label_re = re.compile(r'(krus|inne)\s*[\s"\'`;,:-]*$', re.IGNORECASE)

for i, s in enumerate(lines, 1):
    s = s.strip()
    if not s or s.lower().startswith("text"):
        continue

    m = label_re.search(s)
    if m:
        cnt[m.group(1).lower()] += 1
    else:
        # pokaż, co jest po OSTATNIM separatorze – to kandydat na label
        tail = re.split(r"[;,]", s)[-1].strip().strip('"\'')
        bad.append((i, tail, s))

print("Licznik:", dict(cnt))
print("krus:", cnt.get("krus", 0))
print("inne:", cnt.get("inne", 0))
print("Wiersze bez rozpoznanej etykiety:", len(bad))

# Pokaż kilka problematycznych (nr wiersza, 'kandydat na label', końcówka linii)
for i, tail, s in bad[:20]:
    print(f"[{i}] tail='{tail}' ...{s[-120:]}")

# (opcjonalnie) zapisz pełną listę do pliku
with open("bad_lines.txt", "w", encoding="utf-8") as out:
    for i, tail, s in bad:
        out.write(f"{i}\t{tail}\t{s}\n")


Loaded with encoding: utf-8-sig
Licznik: {'krus': 1106, 'inne': 1573}
krus: 1106
inne: 1573
Wiersze bez rozpoznanej etykiety: 0


# Test modelu

In [31]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

tok   = AutoTokenizer.from_pretrained("./herbert_krus_model2")         # albo ścieżka, gdzie zapisałeś
model = AutoModelForSequenceClassification.from_pretrained("./herbert_krus_model2").eval()

def predict_label(text):
    enc = tok([text], truncation=True, padding=True, max_length=128, return_tensors="pt")
    with torch.no_grad():
        out = model(**enc)
        probs = out.logits.softmax(dim=-1).cpu().numpy()[0]
    pred_id = probs.argmax()
    pred_label = model.config.id2label[pred_id]
    return pred_label, {model.config.id2label[i]: float(p) for i, p in enumerate(probs)}

print(predict_label(""))


('inne', {'inne': 0.8777452707290649, 'krus': 0.12225469201803207})


# generowanie nowych pytań

In [None]:
#!/usr/bin/env python
# coding: utf-8
"""
Generate ONLY questions (no answers) from Polish statute articles using PLLuM-12B (4-bit),
and save them to a CSV file.

✓ Bardzo szczegółowe pytania (progi, terminy, wyjątki).
✓ Filtr: usuwa pytania zawierające zabronione słowa (np. "krus").
✓ CSV output: columns -> article_idx, question
"""

from __future__ import annotations
import argparse
import csv
import re
import sys
from pathlib import Path
from typing import List, Iterable

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, pipeline
from tqdm.auto import tqdm

# ========================= PROMPTS =========================

SYSTEM_PROMPT = (
    "Jesteś kreatorem trudnych, precyzyjnych pytań kontrolnych do treści aktu prawnego. "
    "Zawsze piszesz po polsku. Formułujesz WYŁĄCZNIE PYTANIA, bez odpowiedzi. "
    "Unikasz zdradzania nazwy instytucji, skrótów i słów kluczowych z artykułu — używaj opisów ogólnych typu: "
    "„organ”, „instytucja”, „świadczenie”, „fundusz”, „ubezpieczenie”, „osoba uprawniona”. "
    "Pytania mają być bardzo szczegółowe (progi, terminy, wyjątki, warunki brzegowe, tryby, podstawy i wyłączenia)."
)

QUESTION_TEMPLATE = (
    "<|system|>{system}\n"
    "<|user|>Na podstawie poniższego artykułu wygeneruj {n} RÓŻNYCH, BARDZO SZCZEGÓŁOWYCH pytań "
    "obejmujących możliwie cały zakres treści. "
    "Nie używaj zakazanych słów/skrótów (np.: {forbidden}). "
    "Nie cytuj dosłownie długich fragmentów. "
    "Dopytuj jakbyś nie rozumiał"
    "Zwróć TYLKO listę pytań, wypunktowaną cyframi.\n\n"
    "### ARTYKUŁ:\n{article}\n"
    "### PYTANIA:\n"
)

# ========================= HELPERS =========================

def load_model_4bit(model_name: str):
    bnb_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,
        bnb_4bit_use_double_quant=True,
        bnb_4bit_quant_type="nf4",
    )
    tok = AutoTokenizer.from_pretrained(
        model_name, use_fast=True, padding_side="left", trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name, quantization_config=bnb_cfg, device_map="auto", trust_remote_code=True
    )
    return pipeline("text-generation", model=model, tokenizer=tok, device_map="auto")

def split_statute(path: Path) -> List[str]:
    text = path.read_text(encoding="utf-8")
    parts = re.split(r"^##\s+", text, flags=re.MULTILINE)
    return [p.strip() for p in parts if p.strip()]

def extract_questions(raw: str) -> List[str]:
    lines = [l.strip(" •-\t") for l in raw.splitlines() if l.strip()]
    qs: List[str] = []
    for l in lines:
        m = re.match(r"^\d+[).\s]+(.+)$", l)
        q = (m.group(1) if m else l).strip()
        if not q.endswith("?"):
            q = q.rstrip(".:;! ") + "?"
        qs.append(q)
    return qs

def compile_forbidden(forbidden: Iterable[str]) -> List[re.Pattern]:
    pats = []
    for term in forbidden:
        term = term.strip()
        if not term:
            continue
        pats.append(re.compile(re.escape(term), re.IGNORECASE))
    return pats

def violates_forbidden(q: str, patterns: List[re.Pattern]) -> bool:
    return any(p.search(q) for p in patterns)

# ========================= CORE =========================

def generate_questions(
    statute_path: str,
    model_name: str,
    out_path: str,
    limit: int = 0,
    n_per_article: int = 10,
    keep_per_article: int = 8,
    forbidden_terms: List[str] | None = None,
    attempts: int = 3,
):
    gen = load_model_4bit(model_name)
    articles = split_statute(Path(statute_path))
    if limit > 0:
        articles = articles[:limit]

    if forbidden_terms is None:
        forbidden_terms = ["krus"]
    forbidden_text = ", ".join(forbidden_terms)
    forbidden_pats = compile_forbidden(forbidden_terms)

    out_p = Path(out_path)
    out_p.parent.mkdir(parents=True, exist_ok=True)

    saved = 0
    with out_p.open("w", encoding="utf-8", newline="") as fout:
        writer = csv.writer(fout)
        writer.writerow(["article_idx", "question"])

        for idx, art in enumerate(tqdm(articles, desc="Artykuły:", unit="art"), start=1):
            remaining = keep_per_article
            tries = 0
            used: set[str] = set()

            while remaining > 0 and tries < attempts:
                tries += 1
                prompt_q = QUESTION_TEMPLATE.format(
                    system=SYSTEM_PROMPT,
                    article=art,
                    n=n_per_article,
                    forbidden=forbidden_text or "—",
                )
                out = gen(
                    prompt_q,
                    max_new_tokens=256,
                    temperature=0.7,
                    top_p=0.9,
                    do_sample=True,
                )[0]["generated_text"][len(prompt_q):]

                raw_qs = extract_questions(out)

                clean_qs = []
                for q in raw_qs:
                    if len(q) < 12:
                        continue
                    if violates_forbidden(q, forbidden_pats):
                        continue
                    key = " ".join(q.lower().split())
                    if key in used:
                        continue
                    clean_qs.append(q)

                for q in clean_qs[:remaining]:
                    writer.writerow([idx, q])
                    used.add(" ".join(q.lower().split()))
                    saved += 1
                    remaining -= 1

    print(f"\nZapisano {saved} pytań do pliku CSV: {out_p}")

# ========================= CLI =========================

if __name__ == "__main__":
    p = argparse.ArgumentParser(description="Generate ONLY questions (CSV) from statute with PLLuM-12B (4-bit)")
    p.add_argument("--statute", default="ustawa_processed.md", help="Markdown file with the law")
    p.add_argument("--model", default="CYFRAGOVPL/PLLuM-12B-nc-chat", help="HF model id")
    p.add_argument("--out", default="questions.csv", help="Output CSV file")
    p.add_argument("-n", "--limit", type=int, default=0, help="Process only first N articles (0 = all)")
    p.add_argument("--n-per-article", type=int, default=10, help="How many questions to ask model per attempt")
    p.add_argument("--keep-per-article", type=int, default=8, help="How many filtered questions to keep per article")
    p.add_argument("--attempts", type=int, default=3, help="Regeneration attempts per article if too few after filtering")
    p.add_argument("--forbidden", type=str, default="krus", help="Comma-separated forbidden terms (case-insensitive)")

    args, _ = p.parse_known_args(sys.argv[1:])
    forbidden = [t.strip() for t in args.forbidden.split(",")] if args.forbidden else ["krus"]

    generate_questions(
        statute_path=args.statute,
        model_name=args.model,
        out_path=args.out,
        limit=args.limit,
        n_per_article=args.n_per_article,
        keep_per_article=args.keep_per_article,
        forbidden_terms=forbidden,
        attempts=args.attempts,
    )


Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Device set to use cuda:0


Artykuły::   0%|          | 0/145 [00:00<?, ?art/s]

rozwiń to,
powiedz mi o tym więcej,
czy możesz zacytować ten artykuł,
a kiedy mogę to zrobić,
czy powiesz mi jak to zrobić,
wytłumacz mi jak to zrobić
wytłumacz mi co to jest
a co to jest?
i co mam z tym zrobić
