In [7]:
!pip -q install -U transformers datasets accelerate peft pandas sentencepiece


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [None]:
!pip -q install -U tiktoken protobuf sentencepiece
!pip -q install -U transformers datasets accelerate peft


[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip

[notice] A new release of pip is available: 25.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
import ast
from pathlib import Path
import pandas as pd
import torch

from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer
)
from peft import LoraConfig, get_peft_model, PeftModel

# Checkpoint id passed to AutoTokenizer / AutoModelForSeq2SeqLM.from_pretrained.
BASE_MODEL = "allegro/plt5-base"  

# Directory used as the trainer's output_dir and as the save location for the
# trained model and the tokenizer.
OUT_DIR = "plt5-lora-qa"
# Token limit for the prompt; longer prompts are truncated both when
# preprocessing the training data and when answering a question.
MAX_SOURCE_LEN = 384
# Token limit for the target answer when preprocessing the training data.
MAX_TARGET_LEN = 32


  from .autonotebook import tqdm as notebook_tqdm


In [None]:
def normalize_answer(a: str) -> str:
    """Return ``a`` as text without surrounding whitespace, quotes or trailing punctuation.

    The value is converted with ``str()`` first, so non-string answers
    (e.g. numbers) are accepted. Surrounding whitespace and the quote
    characters ``"``, ``„``, ``”`` and ``“`` are removed from both ends, and
    trailing `` .,:;`` characters are removed from the right.

    The clean-up is repeated until the text stops changing, because one pass is
    not enough for inputs such as ``"Warszawa".`` (a quote hidden behind the
    trailing dot) or ``„"Warszawa"”`` (nested quote styles).
    """
    quote_chars = "\"„”“"
    text = str(a)
    while True:
        cleaned = text.strip().strip(quote_chars).strip().rstrip(" .,:;")
        if cleaned == text:
            return cleaned
        text = cleaned

def _parse_answers(answers_raw) -> list:
    """Turn one raw ``answers`` cell into a list of candidate answers.

    A string cell is parsed with ``ast.literal_eval``; if it is not a Python
    literal, the string itself is the single answer. A list/tuple/set yields
    its elements; any other value (a plain string, a number, ...) is treated as
    ONE answer rather than being iterated. Missing values (``None``/NaN) are
    dropped.
    """
    parsed = answers_raw
    if isinstance(answers_raw, str):
        try:
            parsed = ast.literal_eval(answers_raw)
        except (ValueError, TypeError, SyntaxError, MemoryError, RecursionError):
            # Not a Python literal: keep the raw text as the only answer.
            parsed = answers_raw

    if isinstance(parsed, (list, tuple, set, frozenset)):
        candidates = list(parsed)
    else:
        # Iterating a scalar here would split a string into characters or
        # raise TypeError for a number.
        candidates = [parsed]

    # `c != c` is only true for NaN.
    return [c for c in candidates if c is not None and not (isinstance(c, float) and c != c)]


def load_contexts_csv(path) -> Dataset:
    """Build a prompt/answer dataset from a CSV file.

    The CSV must have ``question``, ``context`` and ``answers`` columns. Every
    usable answer of a row becomes one example with

    * ``source``: ``"Pytanie: <question>\\nKontekst: <context>\\nOdpowiedź:"``
    * ``target``: the answer after ``normalize_answer``.

    Rows with a missing question or context are skipped, and answers that are
    missing or empty after normalization are skipped, so no ``"nan"`` prompts
    or empty targets end up in the data.

    Args:
        path: anything accepted by ``pandas.read_csv``.

    Returns:
        A ``Dataset`` with string columns ``source`` and ``target``.

    Raises:
        KeyError: if one of the required columns is missing.
    """
    df = pd.read_csv(path)

    rows = []
    for question, context, answers_raw in zip(df["question"], df["context"], df["answers"]):
        if pd.isna(question) or pd.isna(context):
            continue

        source = f"Pytanie: {str(question).strip()}\nKontekst: {str(context).strip()}\nOdpowiedź:"
        for answer in _parse_answers(answers_raw):
            target = normalize_answer(answer)
            if target:
                rows.append({"source": source, "target": target})

    return Dataset.from_list(rows)

# Load the prompt/answer examples and hold out 10% of them for validation;
# the fixed seed keeps the split repeatable.
# NOTE(review): the split is made per example, after each answer has been
# expanded into its own row, so the same question/context can appear in both
# splits - confirm this is acceptable for the validation loss.
ds = load_contexts_csv("llm_generated_context.csv").train_test_split(test_size=0.1, seed=42)
train_ds = ds["train"]
val_ds = ds["test"]

len(train_ds), len(val_ds)


(2554, 284)

In [3]:
# Load the tokenizer and the seq2seq base model for the BASE_MODEL checkpoint.
tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
base_model = AutoModelForSeq2SeqLM.from_pretrained(BASE_MODEL)

def preprocess(batch):
    """Tokenize a batch of examples for seq2seq training.

    ``batch["source"]`` is tokenized as the model input (truncated to
    ``MAX_SOURCE_LEN``) and ``batch["target"]`` as the target text (truncated
    to ``MAX_TARGET_LEN``). The target token ids are stored under ``"labels"``
    in the returned encoding.
    """
    model_inputs = tokenizer(
        batch["source"],
        truncation=True,
        max_length=MAX_SOURCE_LEN,
    )
    target_encoding = tokenizer(
        text_target=batch["target"],
        truncation=True,
        max_length=MAX_TARGET_LEN,
    )
    model_inputs["labels"] = target_encoding["input_ids"]
    return model_inputs

def _tokenize_split(split):
    """Apply ``preprocess`` in batches and drop the split's original columns."""
    return split.map(preprocess, batched=True, remove_columns=split.column_names)


train_tok = _tokenize_split(train_ds)
val_tok = _tokenize_split(val_ds)


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Map:   0%|          | 0/2554 [00:00<?, ? exam

In [None]:
# LoRA adapter settings: rank 16 with alpha 32 and 5% dropout, applied only to
# the modules named "q" and "v"; bias terms are left untouched.
lora_cfg = LoraConfig(
    task_type="SEQ_2_SEQ_LM",
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the LoRA adapters.
model = get_peft_model(base_model, lora_cfg)

# Use the GPU when one is available; `device` is reused at inference time.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
model.to(device)

# Mixed precision: T5-family models are known to be numerically unstable in
# fp16, so prefer bf16 on GPUs with native support (compute capability >= 8.0)
# and keep the original fp16 setting on older GPUs. On CPU both flags are off.
use_cuda = torch.cuda.is_available()
use_bf16 = use_cuda and torch.cuda.get_device_capability()[0] >= 8

args = Seq2SeqTrainingArguments(
    output_dir=OUT_DIR,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,  # 4 x 4 = 16 examples per optimizer step per device
    learning_rate=2e-4,
    num_train_epochs=3,
    predict_with_generate=True,
    # Evaluate and checkpoint every 200 steps, keeping the 2 newest checkpoints.
    eval_strategy="steps",
    eval_steps=200,
    save_steps=200,
    save_total_limit=2,
    logging_steps=50,
    bf16=use_bf16,
    fp16=use_cuda and not use_bf16,
    report_to="none",
)

import inspect

# Pads inputs and labels per batch.
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Newer transformers releases deprecate the `tokenizer=` argument of the
# trainer in favour of `processing_class=`; pick whichever the installed
# version accepts so the cell works on both sides of that change.
_trainer_params = inspect.signature(Seq2SeqTrainer.__init__).parameters
_tokenizer_kwarg = "processing_class" if "processing_class" in _trainer_params else "tokenizer"

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=collator,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()

# Save the trained model and the tokenizer side by side in OUT_DIR.
trainer.save_model(OUT_DIR)
tokenizer.save_pretrained(OUT_DIR)

print("Zapisano model do:", OUT_DIR)


  trainer = Seq2SeqTrainer(


Step,Training Loss,Validation Loss
200,9.9908,5.829335
400,5.9729,4.43168




Zapisano model do: plt5-lora-qa


In [None]:
import re

# Optionally signed integer or decimal number; "," and "." are both accepted
# as the decimal separator.
NUM_RE = re.compile(r"[-+]?\d+(?:[.,]\d+)?")


def _clean_generated_answer(text: str) -> str:
    """Post-process decoded model output into the final answer string."""
    cleaned = text.strip().strip("“”\"'„”").rstrip(" .,:;")
    # Collapse whitespace runs and drop whitespace exposed by removing quotes.
    cleaned = re.sub(r"\s+", " ", cleaned).strip()

    # NOTE(review): any answer that contains a number is reduced to the FIRST
    # number found (e.g. a full date becomes its day) - confirm this is the
    # intended behaviour for the evaluation being targeted.
    number = NUM_RE.search(cleaned)
    if number:
        return number.group(0).replace(",", ".")
    return cleaned or "Nie wiem"


@torch.inference_mode()
def answer_question(question: str, context: str, max_new_tokens: int = 24, num_beams: int = 4) -> str:
    """Generate an answer to ``question`` given ``context``.

    The prompt uses the same template as the training data and is truncated to
    ``MAX_SOURCE_LEN`` tokens. Decoding is deterministic beam search.

    Args:
        question: the question text.
        context: the passage the answer should be taken from.
        max_new_tokens: upper bound on the number of generated tokens.
        num_beams: beam width used for decoding.

    Returns:
        The cleaned answer; the first number in it (with ``.`` as decimal
        separator) if it contains one; or ``"Nie wiem"`` if the model produced
        nothing usable.
    """
    prompt = f"Pytanie: {question}\nKontekst: {context}\nOdpowiedź:"
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=MAX_SOURCE_LEN).to(device)

    # The model may still be in training mode after trainer.train(); switch to
    # eval mode so dropout is disabled and the output is reproducible.
    model.eval()

    out = model.generate(
        **inputs,
        max_new_tokens=max_new_tokens,
        do_sample=False,
        num_beams=num_beams,
        early_stopping=True,
    )
    return _clean_generated_answer(tokenizer.decode(out[0], skip_special_tokens=True))


In [7]:
ctx_df = pd.read_csv("llm_generated_context.csv")

# Question -> context lookup. Keys and values are stripped in the same way as
# the training prompts, rows without a question are ignored, and a missing
# context becomes "" instead of the literal string "nan". If a question occurs
# more than once, the last row wins.
ctx_rows = ctx_df.dropna(subset=["question"])
q2ctx = dict(zip(
    ctx_rows["question"].astype(str).str.strip(),
    ctx_rows["context"].fillna("").astype(str).str.strip(),
))


def get_context_for_question(q: str) -> str:
    """Return the stored context for question ``q``, or ``""`` if it is unknown.

    Surrounding whitespace in ``q`` is ignored when looking up the question.
    """
    return q2ctx.get(str(q).strip(), "")


In [None]:
import urllib.request

dump_url = "https://dumps.wikimedia.org/plwiki/latest/plwiki-latest-pages-articles-multistream.xml.bz2"
dump_path = "plwiki.xml.bz2"

# The dump is very large, so skip the download when the file is already
# present (delete it to fetch a newer dump). Download to a temporary ".part"
# file and rename it afterwards, so an interrupted download never leaves a
# truncated file under the final name.
dump_file = Path(dump_path)
if not dump_file.exists():
    partial_file = dump_file.with_name(dump_file.name + ".part")
    urllib.request.urlretrieve(dump_url, str(partial_file))
    partial_file.replace(dump_file)

print("OK:", dump_path)
