In [58]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from rouge_score import rouge_scorer

In [59]:
# Отключение wandb
os.environ["WANDB_DISABLED"] = "true"

# Проверяем использование GPU
print("Using GPU:", torch.cuda.is_available())

# Шаг 1: Загрузка датасета
print("Загрузка датасета...")
dataset = load_dataset("cnn_dailymail", "3.0.0")
print(dataset)

Using GPU: True
Загрузка датасета...
DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [60]:
# Шаг 2: Токенизация данных
print("Токенизация данных...")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True)  # Ограничение длины
    labels = tokenizer(examples["highlights"], max_length=32, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Подготовка уменьшенного набора данных для тестирования

train_data = tokenized_dataset["train"].select(range(len(tokenized_dataset["train"]) // 2))  # Половина данных
val_data = tokenized_dataset["validation"].select(range(len(tokenized_dataset["validation"]) // 2))
test_data = tokenized_dataset["test"].select(range(len(tokenized_dataset["test"]) // 2))

Токенизация данных...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at None
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/tokenizer_config.json


Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [61]:
# Шаг 3: Загрузка модели
print("Загрузка модели...")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Загрузка модели...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/config.json
Model config T5Config {
  "_name_or_path": "t5-small",
  "architectures": [
    "T5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 512,
  "decoder_start_token_id": 0,
  "dense_act_fn": "relu",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "relu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": false,
  "layer_norm_epsilon": 1e-06,
  "model_type": "t5",
  "n_positions": 512,
  "num_decoder_layers": 6,
  "num_heads": 8,
  "num_layers": 6,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "task_specific_params": {
    "summarization": {
      "early_stopping": true,
      "length_penalty": 2.0,
      "max_length": 200,
      "min_length": 30,
      "

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

loading weights file model.safetensors from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/model.safetensors
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}

All model checkpoint weights were used when initializing T5ForConditionalGeneration.

All the weights of T5ForConditionalGeneration were initialized from the model checkpoint at t5-small.
If your task is similar to the task the model of the checkpoint was trained on, you can already use T5ForConditionalGeneration for predictions without further training.


generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

loading configuration file generation_config.json from cache at /root/.cache/huggingface/hub/models--t5-small/snapshots/df1b051c49625cf57a3d0d8d3863ed4d13564fe4/generation_config.json
Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "pad_token_id": 0
}



In [62]:
# Шаг 4: Настройка функции вычисления метрик
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    return {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores),
        "rouge2": sum(rouge2_scores) / len(rouge2_scores),
        "rougeL": sum(rougeL_scores) / len(rougeL_scores),
    }

In [63]:
# Шаг 5: Настройка параметров обучения
print("Настройка параметров обучения...")
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=1e-5,
    per_device_train_batch_size=4,  # Небольшой размер батча
    per_device_eval_batch_size=4,
    num_train_epochs=7,  # Увеличенное количество эпох
    gradient_accumulation_steps=8,  # Для имитации большего батча
    predict_with_generate=True,
    save_total_limit=2,
    logging_steps=50,
    log_level="info",
    fp16=True,  # Смешанная точность
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Настройка параметров обучения...


In [64]:
# Шаг 6: Создание тренера
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
Using auto half precision backend


In [65]:
# Шаг 7: Обучение
print("Начало обучения...")
trainer.train()

Начало обучения...


The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights. If article, id, highlights are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 143,556
  Num Epochs = 7
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 8
  Total optimization steps = 31,402
  Number of trainable parameters = 60,506,624


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
0,2.7492,2.163796,0.315993,0.149637,0.269477
1,2.705,2.135216,0.316703,0.14918,0.270056
2,2.6506,2.126991,0.317847,0.150921,0.271172
3,2.6224,2.113619,0.317381,0.151159,0.270975
4,2.6256,2.10394,0.318482,0.152007,0.271794
5,2.6257,2.100536,0.318585,0.151775,0.271597
6,2.6086,2.096549,0.318511,0.151635,0.271466


Saving model checkpoint to ./results/checkpoint-500
Configuration saved in ./results/checkpoint-500/config.json
Configuration saved in ./results/checkpoint-500/generation_config.json
Model weights saved in ./results/checkpoint-500/model.safetensors
tokenizer config file saved in ./results/checkpoint-500/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-500/special_tokens_map.json
Copy vocab file to ./results/checkpoint-500/spiece.model
Saving model checkpoint to ./results/checkpoint-1000
Configuration saved in ./results/checkpoint-1000/config.json
Configuration saved in ./results/checkpoint-1000/generation_config.json
Model weights saved in ./results/checkpoint-1000/model.safetensors
tokenizer config file saved in ./results/checkpoint-1000/tokenizer_config.json
Special tokens file saved in ./results/checkpoint-1000/special_tokens_map.json
Copy vocab file to ./results/checkpoint-1000/spiece.model
Deleting older checkpoint [results/checkpoint-375] due to args.save_t

TrainOutput(global_step=31402, training_loss=2.67335812453745, metrics={'train_runtime': 17260.6821, 'train_samples_per_second': 58.219, 'train_steps_per_second': 1.819, 'total_flos': 3.400002599858995e+16, 'train_loss': 2.67335812453745, 'epoch': 6.999804954164229})

In [66]:
# Шаг 8: Оценка на тестовой выборке
print("Оценка на тестовой выборке...")
metrics = trainer.evaluate(test_data)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: article, id, highlights. If article, id, highlights are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 5745
  Batch size = 4


Оценка на тестовой выборке...


In [67]:
# Шаг 9: Вывод метрик
def print_metrics(metrics):
    print("Evaluation Metrics:")
    print(f"ROUGE-1: {metrics['eval_rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['eval_rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['eval_rougeL']:.4f}")
    print(f"Loss: {metrics['eval_loss']:.4f}")
    print(f"Runtime (s): {metrics['eval_runtime']:.4f}")
    print(f"Samples per Second: {metrics['eval_samples_per_second']:.2f}")
    print(f"Steps per Second: {metrics['eval_steps_per_second']:.2f}")

# Вызов функции
print_metrics(metrics)


Evaluation Metrics:
ROUGE-1: 0.3173
ROUGE-2: 0.1514
ROUGE-L: 0.2710
Loss: 2.1118
Runtime (s): 370.1155
Samples per Second: 15.52
Steps per Second: 3.88
