In [2]:
"""
finetune_qa_answer_only.py
Fine-tune FLAN-T5 on SQuAD to produce only the answer text given question+context.
Requirements:
  pip install datasets transformers accelerate evaluate peft sentencepiece
Optional (recommended for large models / memory): bitsandbytes
Run example:
  accelerate launch finetune_qa_answer_only.py
"""

'\nfinetune_qa_answer_only.py\nFine-tune FLAN-T5 on SQuAD to produce only the answer text given question+context.\nRequirements:\n  pip install datasets transformers accelerate evaluate peft sentencepiece\nOptional (recommended for large models / memory): bitsandbytes\nRun example:\n  accelerate launch finetune_qa_answer_only.py\n'

In [20]:
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
)
import evaluate
import numpy as np
import os

In [21]:
try:
    from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
    PEFT_AVAILABLE = True
except Exception:
    PEFT_AVAILABLE = False

In [22]:
MODEL_NAME = "google/flan-t5-base"
DATASET_NAME = "squad"  # SQuAD v1.1 (contains answer spans). SQuAD v2 exists with unanswerable questions.
OUTPUT_DIR = "./flan_t5_squad_answer_only"
MAX_INPUT_LENGTH = 512
MAX_TARGET_LENGTH = 64
TRAIN_SAMPLE = "train[:10000]"   # для быстрой проверки можно брать меньше, для реального обучения снимите срез
EVAL_SAMPLE = "validation[:2000]"

In [26]:
ds_train

Dataset({
    features: ['id', 'title', 'context', 'question', 'answers'],
    num_rows: 10000
})

In [23]:
# Загружаем датасет
ds_train = load_dataset(DATASET_NAME, split=TRAIN_SAMPLE)
ds_eval = load_dataset(DATASET_NAME, split=EVAL_SAMPLE)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def preprocess(example):
    # берем первый ответ из списка answers.text (SQuAD гарантирует хотя бы один ответ в v1)
    answers = example.get("answers", {}).get("text", [])
    target = answers[0] if len(answers) > 0 else ""
    question = example["question"].strip()
    context = example["context"].strip()

    # Вход: шаблон, чтобы модель понимала задачу — можно менять
    input_text = f"question: {question}  context: {context}"
    model_inputs = tokenizer(
        input_text,
        max_length=MAX_INPUT_LENGTH,
        truncation=True,
    )
    labels = tokenizer(
        target,
        max_length=MAX_TARGET_LENGTH,
        truncation=True,
    )
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_train = ds_train.map(preprocess, remove_columns=ds_train.column_names, batched=False)
tokenized_eval = ds_eval.map(preprocess, remove_columns=ds_eval.column_names, batched=False)

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [25]:
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME)
use_lora = True
if use_lora and PEFT_AVAILABLE:
    # Небольшая LoRA-конфигурация — подстрой под свои ресурсы
    lora_config = LoraConfig(
        r=8,
        lora_alpha=32,
        target_modules=["q", "v", "k", "o", "wi", "wo"] if hasattr(model.config, "d_model") else None,
        lora_dropout=0.05,
        bias="none",
        task_type="SEQ_2_SEQ_LM",
    )
    model = get_peft_model(model, lora_config)
    print("LoRA applied to the model (PEFT).")
else:
    if use_lora and not PEFT_AVAILABLE:
        print("PEFT/LoRA не установлены — тренируем полную модель.")

LoRA applied to the model (PEFT).


In [10]:
def compute_metrics(pred):
    # pred: PredictionOutput from trainer
    tokenizer = pred.tokenizer if hasattr(pred, "tokenizer") else None
    # Но Trainer передаёт просто predictions, label_ids — используем evaluate library
    rouge = evaluate.load("rouge")
    preds = pred.predictions
    if isinstance(preds, tuple):
        preds = preds[0]
    decoded_preds = tokenizer.batch_decode(np.argmax(preds, axis=-1) if preds.ndim==3 else preds, skip_special_tokens=True) if False else tokenizer.batch_decode(preds, skip_special_tokens=True)
    # Предполагаем, что trainer уже выдал текстовые строки в pred.label_ids? safer -> use evaluation in train loop skip heavy metric
    # Для простоты — вычислим ROUGE между decoded_preds и references (trainer позаботится о преобразовании)
    labels = pred.label_ids
    if labels is None:
        return {}
    # декодируем лейблы
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    # вернём rougeL и rouge1 для примера
    return {
        "rouge1": result.get("rouge1"),
        "rougeL": result.get("rougeL")
    }

In [13]:
# Data collator автоматически паддит и готовит labels
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,
    # evaluation_strategy="steps",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    predict_with_generate=True,
    logging_steps=100,
    eval_steps=500,
    save_steps=1000,
    save_total_limit=2,
    num_train_epochs=1,
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # требует GPU с поддержкой fp16
    report_to="none",
    remove_unused_columns=False,  # важно для Seq2SeqTrainer
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_eval,
    tokenizer=tokenizer,
    data_collator=data_collator,
    # compute_metrics=compute_metrics,  # можно включить, но это замедлит обучение (prediction + generation)
)

trainer.train()
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"Model and tokenizer saved to {OUTPUT_DIR}")

  trainer = Seq2SeqTrainer(


Step,Training Loss
100,0.0
200,0.0
300,0.0
400,0.0
500,0.0
600,0.0
700,0.0
800,0.0
900,0.0
1000,0.0


Model and tokenizer saved to ./flan_t5_squad_answer_only


In [16]:
# Пример инференса: генерируем ответ на первую валидационную запись
import torch
sample = tokenized_eval[0]

input_ids = torch.tensor(sample["input_ids"], dtype=torch.long).unsqueeze(0).to(model.device)
attention_mask = torch.tensor(sample["attention_mask"], dtype=torch.long).unsqueeze(0).to(model.device)

In [17]:
outputs = model.generate(
    input_ids=input_ids,
    attention_mask=attention_mask,
    max_length=MAX_TARGET_LENGTH,
    num_beams=4,
    early_stopping=True,
)

In [19]:
decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
print(sample)
print(decoded[0])

{'input_ids': [822, 10, 4073, 10439, 372, 7283, 8, 71, 5390, 44, 2011, 9713, 943, 58, 2625, 10, 2011, 9713, 943, 47, 46, 797, 3370, 467, 12, 2082, 8, 6336, 13, 8, 868, 10929, 3815, 41, 12619, 434, 61, 21, 8, 1230, 774, 5, 37, 797, 10929, 4379, 41, 188, 5390, 61, 6336, 12154, 4027, 29, 509, 7, 17025, 8, 868, 10929, 4379, 41, 567, 5390, 61, 6336, 5089, 21149, 7, 997, 104, 1714, 12, 3807, 70, 1025, 2011, 9713, 2233, 5, 37, 467, 47, 1944, 30, 2083, 7973, 5123, 44, 16755, 31, 7, 12750, 16, 8, 1051, 5901, 2474, 5690, 44, 4625, 9908, 9, 6, 1826, 5, 282, 48, 47, 8, 943, 189, 2011, 9713, 6, 8, 5533, 3, 25472, 8, 96, 14910, 35, 7685, 121, 28, 796, 2045, 18, 24186, 6985, 6, 38, 168, 38, 18223, 12547, 53, 8, 4387, 13, 3, 21990, 284, 2011, 9713, 467, 28, 3385, 7507, 4900, 7, 41, 7248, 84, 8, 467, 133, 43, 118, 801, 38, 96, 23290, 9713, 301, 8512, 6, 78, 24, 8, 3554, 228, 8304, 120, 1451, 8, 19248, 7507, 4900, 7, 943, 5, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,

# как грузим датасеты

In [None]:
from datasets import load_dataset

ds = load_dataset("json", data_files="train.jsonl")
ds = load_dataset("json", data_files="train.json")
ds = load_dataset("csv", data_files="train.csv")
ds = load_dataset("parquet", data_files="train.parquet")

4. Как подготовить к модели?

Допустим, в датасете есть question, context, answer.

Ты делаешь map:

def preprocess(ex):
    text = f"question: {ex['question']}  context: {ex['context']}"
    model_in = tokenizer(text, truncation=True, max_length=512)
    label = tokenizer(ex["answer"], truncation=True, max_length=64)
    
    model_in["labels"] = label["input_ids"]
    return model_in

tokenized = ds.map(preprocess)