In [1]:
"""
Pipeline: fine-tune a causal LLM on math word problems (GSM8K).
Requirements (pip):
  pip install transformers datasets accelerate peft bitsandbytes evaluate sentencepiece tokenizers
(If you don't use bitsandbytes, you can omit it and set use_bnb=False)

Run example:
  accelerate config
  accelerate launch train_math_llm.py --model_name_or_path <base-model> --output_dir ./math-lora
"""

"\nPipeline: fine-tune a causal LLM on math word problems (GSM8K).\nRequirements (pip):\n  pip install transformers datasets accelerate peft bitsandbytes evaluate sentencepiece tokenizers\n(If you don't use bitsandbytes, you can omit it and set use_bnb=False)\n\nRun example:\n  accelerate config\n  accelerate launch train_math_llm.py --model_name_or_path <base-model> --output_dir ./math-lora\n"

In [2]:
import re
import argparse
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from peft import get_peft_model, LoraConfig, TaskType
import evaluate
import numpy as np

In [3]:
from datasets import load_dataset

ds = load_dataset("openai/gsm8k", "main")
ds

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 7473
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 1319
    })
})

In [4]:
def make_prompt(q):
    return f"Question: {q}\nAnswer:"

def preprocess(example):
    q = example["question"].strip()
    a = example["answer"].strip()
    prompt = make_prompt(q)
    full_text = prompt + " " + a
    return {"text": full_text, "prompt": prompt, "target": a}

ds = ds.map(preprocess, remove_columns=ds["train"].column_names)

train_ds = ds["train"]
valid_ds = ds["test"]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoTokenizer

model_name = "facebook/opt-1.3b"  # можешь заменить, например на Llama-3.2-1B

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

max_length = 256

def tokenize(example):
    full = example["text"]
    prompt = example["prompt"]

    enc_full = tokenizer(full, truncation=True, max_length=max_length, padding="max_length")
    enc_prompt = tokenizer(prompt, truncation=True, max_length=max_length, padding="max_length")

    input_ids = enc_full["input_ids"]
    labels = input_ids.copy()

    # mask prompt part
    prompt_len = sum(x != tokenizer.pad_token_id for x in enc_prompt["input_ids"])
    labels[:prompt_len] = [-100] * prompt_len

    return {
        "input_ids": input_ids,
        "attention_mask": enc_full["attention_mask"],
        "labels": labels,
    }

tokenized_train = train_ds.map(tokenize, remove_columns=train_ds.column_names)
tokenized_valid = valid_ds.map(tokenize, remove_columns=valid_ds.column_names)

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/1319 [00:00<?, ? examples/s]

In [6]:
from transformers import AutoModelForCausalLM
from peft import LoraConfig, get_peft_model, TaskType

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_4bit=True,     # ускоряет и экономит VRAM
)

lora_cfg = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"],  # для OPT
)

model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


pytorch_model.bin:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/137 [00:00<?, ?B/s]

trainable params: 2,359,296 || all params: 1,318,117,376 || trainable%: 0.1790


In [11]:
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling

batch_size = 8
epochs = 1
lr = 2e-4
output_dir = "./math-lora"

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy="epoch",
    # evaluation_strategy="epoch",
    logging_steps=50,
    num_train_epochs=epochs,
    learning_rate=lr,
    fp16=True,
    remove_unused_columns=False,
    save_total_limit=2,
    report_to="none"
)

data_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_valid,
    data_collator=data_collator,
)

trainer.train()

Step,Training Loss
50,1.8362
100,1.5911
150,1.5355
200,1.5058
250,1.5038
300,1.4601
350,1.4235
400,1.4216
450,1.4274
500,1.4161


TrainOutput(global_step=935, training_loss=1.4525553820604946, metrics={'train_runtime': 831.5805, 'train_samples_per_second': 8.987, 'train_steps_per_second': 1.124, 'total_flos': 1.390006030565376e+16, 'train_loss': 1.4525553820604946, 'epoch': 1.0})

In [None]:
trainer.save_model(output_dir)
model.save_pretrained(output_dir + "/lora_adapter")

# оценка качества

In [20]:
valid_ds[0]

{'text': "Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nAnswer: Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18",
 'prompt': "Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?\nAnswer:",
 'target': 'Janet sells 16 - 3 - 4 = <<16-3-4=9>>9 duck eggs a day.\nShe makes 9 * 2 = $<<9*2=18>>18 every day at the farmer’s market.\n#### 18'}

In [None]:
def extract_number(text):
    """
    Ищет первое целое или десятичное число в строке.
    Возвращает строку с числом или пустую строку, если не найдено.
    """
    match = re.search(r"[-+]?\d*\.?\d+", text)
    if match:
        return match.group(0)
    return ""

In [18]:
import torch
from tqdm import tqdm

model.eval()
preds, golds = [], []

batch_size = 16
with torch.no_grad():
    for i in tqdm(range(0, 200, batch_size)):
        batch_prompts = [valid_ds[j]["prompt"] for j in range(i, min(i+batch_size, 200))]
        batch_golds = [valid_ds[j]["target"] for j in range(i, min(i+batch_size, 200))]

        inputs = tokenizer(batch_prompts, return_tensors="pt", padding=True, truncation=True).to(model.device)
        outs = model.generate(**inputs, max_new_tokens=128, do_sample=False)
        decoded = tokenizer.batch_decode(outs, skip_special_tokens=True)

        for prompt, pred_text, gold_text in zip(batch_prompts, decoded, batch_golds):
            continuation = pred_text[len(prompt):] if pred_text.startswith(prompt) else pred_text
            preds.append(extract_number(continuation))
            golds.append(extract_number(gold_text))


  0%|          | 0/13 [00:00<?, ?it/s]Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
  8%|▊         | 1/13 [00:14<02:53, 14.46s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 15%|█▌        | 2/13 [00:34<03:13, 17.62s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the tokenizer.
 23%|██▎       | 3/13 [00:56<03:15, 19.59s/it]A decoder-only architecture is being used, but right-padding was detected! For correct generation results, please set `padding_side='left'` when initializing the t

In [30]:
import numpy as np

preds = np.array(preds)
golds = np.array(golds)

mask = golds != ""   # оставляем только примеры с правильным числом

In [32]:
em = (preds[mask] == golds[mask]).mean()
print(f"Exact Match: {em:.4f}"

Exact Match: 0.0200


In [31]:
for i in range(5):
    print("PROMPT:", valid_ds[i]["prompt"])
    print("PRED:", preds[i])
    print("GOLD:", golds[i])
    print("---")


PROMPT: Question: Janet’s ducks lay 16 eggs per day. She eats three for breakfast every morning and bakes muffins for her friends every day with four. She sells the remainder at the farmers' market daily for $2 per fresh duck egg. How much in dollars does she make every day at the farmers' market?
Answer:
PRED: 6192
GOLD: 18
---
PROMPT: Question: A robe takes 2 bolts of blue fiber and half that much white fiber.  How many bolts in total does it take?
Answer:
PRED: 2
GOLD: 3
---
PROMPT: Question: Josh decides to try flipping a house.  He buys a house for $80,000 and then puts in $50,000 in repairs.  This increased the value of the house by 150%.  How much profit did he make?
Answer:
PRED: 10
GOLD: 70000
---
PROMPT: Question: James decides to run 3 sprints 3 times a week.  He runs 60 meters each sprint.  How many total meters does he run a week?
Answer:
PRED: 15
GOLD: 540
---
PROMPT: Question: Every day, Wendi feeds each of her chickens three cups of mixed chicken feed, containing seeds,

Да, давай разберём варианты для генеративного QA / ответа на вопрос по тексту, включая русские модели. Разобью по категориям: большие, средние, маленькие, с учётом LLM и моделей с seq2seq.

1️⃣ Английские / мультиязычные модели (seq2seq)
🔹 Большие

FLAN-T5-XL / FLAN-T5-Large — instruction-tuned, отлично для генерации ответов.

mT5-Large / mT5-XL — мультиязычная версия T5, работает и с русским.

🔹 Средние

FLAN-T5-Base / mT5-Base — хороший компромисс скорость/качество.

BART-large / MBart50 — seq2seq, MBart50 поддерживает русский.

🔹 Малые

FLAN-T5-Small / mT5-Small — для экспериментов и обучения на ноутбуке.

Seq2seq лучше для генерации коротких текстовых ответов, чем causal LM.

2️⃣ Русские модели для генеративного QA
🔹 Большие

ruT5-large / ruT5-base — русская версия T5, подходит для генеративного QA.

sberbank-ai/rugpt3large — causal LM (LLM) для русского, хорошо завершает тексты и ответы.

🔹 Средние

sberbank-ai/rugpt3medium — быстрее и легче, но качество немного ниже.

ai-forever/ruT5-base — seq2seq, легче fine-tune.

🔹 Малые / эксперимент

DeepPavlov/rubert-base-cased-sentiment — базовый transformer, можно использовать для коротких генераций через LM head.

ai-forever/ruT5-small — быстро для экспериментов.

3️⃣ Causal LLM vs Seq2Seq
Тип модели	Преимущество	Недостаток
seq2seq (T5, BART)	Легко обучать на “вопрос+контекст → ответ”, генерация коротких ответов	Требует токенизации input+output, нет встроенного контекста >512 токенов
causal LLM (GPT, ruGPT, Qwen)	Хорошо подходит для длинных инструкций, few-shot	Иногда модель “уходит в рассуждения”, а не выдаёт короткий ответ
4️⃣ Рекомендации

Для русского QA (короткие ответы): ruT5-base или ruT5-large → легко fine-tune на SQuAD-like датасет.

Если нужен мультиязычный вариант: mT5-base/large → русские и английские тексты.

Если хочешь большой LLM для few-shot без дообучения: sberbank-ai/rugpt3large или Qwen-7B-Russian (vLLM).