In [1]:
import os
import re
from typing import List
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    set_seed,
)
from peft import LoraConfig, get_peft_model, PeftModel

# Parameters
MODEL_PATH = "qwen2_local"          # directory/name passed to from_pretrained for tokenizer and model
TRAIN_CSV = "jokes.csv"             # CSV file with the training texts
TEXT_COLUMN = "text"                # column of TRAIN_CSV that holds the texts
LORA_DIR = "qwen_jokes_lora"        # Trainer output_dir; the adapter and tokenizer are also saved here
PREFIXES_FILE = "prefixes.txt"      # input file with the generation prefixes
OUT_FILE = "jokes_for_bot.txt"      # output file for the generated lines

MAX_SEQ_LEN = 256                   # max_length used when tokenizing training texts (with truncation)
MAX_SAMPLES = 30000                 # upper bound on the number of training examples

EPOCHS = 1                          # num_train_epochs
LR = 2e-4                           # learning_rate
BATCH_SIZE = 2                      # per_device_train_batch_size
GRAD_ACCUM = 8                      # gradient_accumulation_steps (2 * 8 = 16 examples per optimizer step per device)
SEED = 42                           # seed for set_seed and for the dataset shuffle

GENERATE_PER_PREFIX = 2             # sampling attempts per prefix
MAX_NEW_TOKENS = 75                 # max_new_tokens passed to model.generate
TEMPERATURE = 0.9                   # sampling temperature
TOP_P = 0.95                        # top_p (nucleus sampling) threshold
REPETITION_PENALTY = 1.08           # repetition_penalty passed to model.generate

# Allow TF32 in CUDA matmul and cuDNN to speed up GPU computation
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

def normalize_text(t: str) -> str:
    """Collapse every run of whitespace in *t* to one space and trim the ends.

    A falsy value (``None`` or an empty string) yields an empty string.
    """
    if not t:
        return ""
    # The whitespace pattern also matches "\r\n" and a lone "\r", so line
    # endings need no separate replacement pass.
    return re.sub(r"\s+", " ", t.strip())

def load_texts_from_csv(path: str, column: str) -> List[str]:
    """Read one text column from a CSV file and return its normalized values.

    Args:
        path: Path to the CSV file.
        column: Name of the column that holds the texts.

    Returns:
        The non-null values of ``column``, converted to ``str`` and passed
        through ``normalize_text``.

    Raises:
        ValueError: If ``column`` is missing from the file, or if no values
            remain after dropping nulls.
    """
    # "utf-8-sig" reads plain UTF-8 as well and drops a leading BOM, so a
    # separate "utf-8" attempt adds nothing (and retrying "utf-8-sig" after
    # "utf-8" failed could never succeed). "cp1251" is tried before the
    # catch-all because the corpus is Russian-language: decoding a Cyrillic
    # cp1251 file as latin1 would "succeed" but produce mojibake.
    df = None
    for encoding in ("utf-8-sig", "cp1251"):
        try:
            df = pd.read_csv(path, encoding=encoding)
        except UnicodeDecodeError:
            continue
        break
    if df is None:
        # latin1 maps every byte value, so this last resort cannot fail on decoding.
        df = pd.read_csv(path, encoding="latin1")

    if column not in df.columns:
        raise ValueError(f"Нет колонки '{column}'. Есть: {list(df.columns)}")
    texts = df[column].dropna().astype(str).map(normalize_text).tolist()
    if not texts:
        raise ValueError("Данные пустые")
    return texts

def build_sft_dataset(texts: List[str]) -> Dataset:
    """Turn raw texts into prompted training rows.

    Texts shorter than 20 characters are skipped. Every remaining text becomes
    one row ``{"text": "<prompt> <text>"}`` of the returned ``Dataset``.
    """
    prompt = "Сгенерируй короткий анекдот на русском языке.\nАнекдот:"
    min_chars = 20

    examples = []
    for joke in texts:
        if len(joke) < min_chars:
            continue
        examples.append({"text": f"{prompt} {joke}"})
    return Dataset.from_list(examples)

# Data preparation
# Seed the random number generators before anything random happens.
set_seed(SEED)
texts = load_texts_from_csv(TRAIN_CSV, TEXT_COLUMN)
print(f"Загружено текстов: {len(texts)}")

# Wrap each text in the instruction prompt; texts that are too short are dropped.
dataset = build_sft_dataset(texts)
print(f"Примеров до сэмплирования: {len(dataset)}")

# Cap the training set: shuffle with the fixed seed and keep the first MAX_SAMPLES rows.
if len(dataset) > MAX_SAMPLES:
    dataset = dataset.shuffle(seed=SEED).select(range(MAX_SAMPLES))
print(f"Используем примеров: {len(dataset)}")

# Tokenization
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
# Padding needs a pad token; fall back to EOS when the tokenizer defines none.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

def tokenize_fn(example):
    """Tokenize a batch of training texts, each terminated with the EOS token.

    Called by ``Dataset.map`` with ``batched=True``, so ``example["text"]`` is
    a list of strings. Each text gets the tokenizer's EOS token appended so
    that the training sequences carry an explicit end-of-text marker; without
    it the model is never shown where a joke ends, while generation relies on
    ``eos_token_id`` to stop. Sequences longer than ``MAX_SEQ_LEN`` tokens are
    truncated (and then lose the trailing EOS).

    Returns:
        The tokenizer output for the batch (no padding is applied here).
    """
    # NOTE(review): if pad_token ends up equal to eos_token, the language-modeling
    # collator presumably masks pad positions in the labels, which would also
    # mask this EOS — confirm, and use a distinct pad token if so.
    eos = tokenizer.eos_token or ""
    return tokenizer(
        [text + eos for text in example["text"]],
        truncation=True,
        max_length=MAX_SEQ_LEN,
    )

# Replace the raw "text" column with the tokenized features.
dataset = dataset.map(tokenize_fn, remove_columns=["text"], batched=True)

# Model + LoRA
# Load the base causal LM. With CUDA available the weights are loaded in
# float16 with device_map="auto"; otherwise float32 and no device map.
# NOTE(review): the run log of this notebook reports that `torch_dtype` is
# deprecated in favour of `dtype`; switch once the minimum supported
# transformers version accepts the new name.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_PATH,
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    device_map="auto" if torch.cuda.is_available() else None,
)

# LoRA adapter settings: rank 16, alpha 32, dropout 0.05, no bias training.
# target_modules lists the module names that receive adapters — presumably the
# attention and MLP projection layers of the base model; verify against its
# actual module names.
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=["q_proj","k_proj","v_proj","o_proj","gate_proj","up_proj","down_proj"]
)

# Wrap the base model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Training
training_args = TrainingArguments(
    output_dir=LORA_DIR,                          # checkpoints are written here
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=GRAD_ACCUM,
    learning_rate=LR,
    warmup_ratio=0.05,
    logging_steps=10,                             # log the training loss every 10 steps
    save_steps=200,                               # checkpoint every 200 steps ...
    save_total_limit=2,                           # ... keeping at most 2 checkpoints
    fp16=torch.cuda.is_available(),               # fp16 mixed precision only when CUDA is available
    optim="adamw_torch",
    report_to=[],                                 # no external experiment trackers
)

# mlm=False selects the causal (non-masked) language-modeling collator mode.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator
)

trainer.train()

# Save the trained model (the PEFT-wrapped one) and the tokenizer side by side.
model.save_pretrained(LORA_DIR)
tokenizer.save_pretrained(LORA_DIR)
print(f"LoRA сохранена в: {LORA_DIR}")

# Load the generation prefixes: one "<id> <text>" pair per line
prefixes = []
# "utf-8-sig" reads plain UTF-8 too, but strips a leading BOM that would
# otherwise end up glued to the first id.
with open(PREFIXES_FILE, "r", encoding="utf-8-sig") as f:
    for line in f:
        line = line.strip()
        # partition never raises; an empty separator means the line was blank
        # or held an id with no text, and such lines are skipped as before.
        idx, sep, text = line.partition(" ")
        if not sep:
            continue
        prefixes.append((idx, text))

print(f"Загрузили {len(prefixes)} затравок")

# Generate continuations for a single prefix
@torch.inference_mode()
def generate_for_prefix(idx, text, n=1):
    """Sample up to ``n`` continuations of ``text`` with the fine-tuned model.

    Args:
        idx: Identifier of the prefix; it is prepended to every result line.
        text: Prefix text, used verbatim as the generation prompt.
        n: Number of sampling attempts.

    Returns:
        A list of strings of the form ``"<idx> <continuation>"`` with
        whitespace runs collapsed to single spaces. Attempts that produce an
        empty continuation are skipped, so the list may hold fewer than ``n``
        items.
    """
    # The model may still be in training mode after trainer.train(); switch to
    # eval so that dropout (including LoRA dropout) is off while sampling.
    model.eval()

    # The prompt is identical for every attempt, so tokenize it once and put
    # it on the device the model actually lives on.
    inputs = tokenizer(text, return_tensors="pt").to(model.device)
    prompt_len = inputs["input_ids"].shape[1]

    results = []
    for _ in range(n):
        out = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=True,
            temperature=TEMPERATURE,
            top_p=TOP_P,
            repetition_penalty=REPETITION_PENALTY,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

        # Decode only the newly generated tokens. Cutting the decoded full
        # text at len(prompt) is unreliable: decoding the encoded prompt is
        # not guaranteed to reproduce it character for character.
        new_tokens = out[0][prompt_len:]
        continuation = tokenizer.decode(new_tokens, skip_special_tokens=True)
        continuation = re.sub(r"\s+", " ", continuation).strip()
        if continuation:
            results.append(f"{idx} {continuation}")

    return results

# Generate jokes for every prefix and write them out, one per line
all_jokes = []
for idx, prefix in prefixes:
    all_jokes += generate_for_prefix(idx, prefix, n=GENERATE_PER_PREFIX)

with open(OUT_FILE, "w", encoding="utf-8") as f:
    f.writelines(f"{joke}\n" for joke in all_jokes)

print(f"Готово, Сохранено {len(all_jokes)} анекдотов → {OUT_FILE}")

Загружено текстов: 130204
Примеров до сэмплирования: 129178
Используем примеров: 30000


Map:   0%|          | 0/30000 [00:00<?, ? examples/s]

`torch_dtype` is deprecated! Use `dtype` instead!
The model is already on multiple devices. Skipping the move to device specified in `args`.


trainable params: 8,798,208 || all params: 502,830,976 || trainable%: 1.7497


Step,Training Loss
10,2.7989
20,2.4225
30,2.2635
40,2.2127
50,2.1353
60,2.1906
70,2.1938
80,2.203
90,2.1031
100,2.1527


LoRA сохранена в: qwen_jokes_lora
Загрузили 75 затравок
Готово, Сохранено 150 анекдотов → jokes_for_bot.txt
