In [2]:
!pip install rouge_score

  pid, fd = os.forkpty()


Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24935 sha256=deed6c9f1bab4eccf5ce67f8fae5abe0e360d76b61339dfcd7af3300d01346ba
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [3]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSeq2SeqLM,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq
)
from rouge_score import rouge_scorer

In [4]:
# Отключение wandb
os.environ["WANDB_DISABLED"] = "true"

# Проверяем использование GPU
print("Using GPU:", torch.cuda.is_available())

# Шаг 1: Загрузка датасета
print("Загрузка датасета...")
dataset = load_dataset("cnn_dailymail", "3.0.0")
print(dataset)

Using GPU: True
Загрузка датасета...


README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 287113
    })
    validation: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 13368
    })
    test: Dataset({
        features: ['article', 'highlights', 'id'],
        num_rows: 11490
    })
})


In [5]:
# Шаг 2: Токенизация данных
print("Токенизация данных...")
tokenizer = AutoTokenizer.from_pretrained("t5-small")

def preprocess_data(examples):
    inputs = ["summarize: " + doc for doc in examples["article"]]
    model_inputs = tokenizer(inputs, max_length=512, padding="max_length", truncation=True)
    labels = tokenizer(examples["highlights"], max_length=150, padding="max_length", truncation=True)
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

tokenized_dataset = dataset.map(preprocess_data, batched=True)

# Подготовка уменьшенного набора данных для тестирования
train_data = tokenized_dataset["train"].select(range(1000))
val_data = tokenized_dataset["validation"].select(range(500))
test_data = tokenized_dataset["test"].select(range(500))

Токенизация данных...


tokenizer_config.json:   0%|          | 0.00/2.32k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Map:   0%|          | 0/287113 [00:00<?, ? examples/s]

Map:   0%|          | 0/13368 [00:00<?, ? examples/s]

Map:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [6]:
# Шаг 3: Загрузка модели
print("Загрузка модели...")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

Загрузка модели...


config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/242M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [7]:
# Шаг 4: Настройка функции вычисления метрик
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge1_scores = []
    rouge2_scores = []
    rougeL_scores = []

    for pred, label in zip(decoded_preds, decoded_labels):
        scores = scorer.score(label, pred)
        rouge1_scores.append(scores["rouge1"].fmeasure)
        rouge2_scores.append(scores["rouge2"].fmeasure)
        rougeL_scores.append(scores["rougeL"].fmeasure)

    return {
        "rouge1": sum(rouge1_scores) / len(rouge1_scores),
        "rouge2": sum(rouge2_scores) / len(rouge2_scores),
        "rougeL": sum(rougeL_scores) / len(rougeL_scores),
    }

In [8]:
# Шаг 5: Настройка параметров обучения
print("Настройка параметров обучения...")
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    predict_with_generate=True,
    save_total_limit=2,
    logging_steps=10,
    log_level="info",
)

Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Настройка параметров обучения...


In [9]:
# Шаг 6: Создание тренера
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model, padding=True)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

In [10]:
# Шаг 7: Обучение
print("Начало обучения...")
trainer.train()

The following columns in the training set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article. If highlights, id, article are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.


Начало обучения...


***** Running training *****
  Num examples = 1,000
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 375
  Number of trainable parameters = 60,506,624


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel
1,1.1602,0.803429,0.0,0.0,0.0
2,1.0652,0.763277,0.006475,0.003222,0.005235
3,0.9923,0.753941,0.010614,0.005787,0.009205


The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article. If highlights, id, article are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article. If highlights, id, article are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8
Saving model checkpoint to ./results/checkpoint-375
Configuration saved in ./results/checkpoint-375/config.json
Configuration saved in ./results/checkpoint-375/generation_config.json
Model weights saved in ./results/checkpoint-375/model.safetensors
tokenizer config file saved in 

TrainOutput(global_step=375, training_loss=1.7062289098103842, metrics={'train_runtime': 141.8018, 'train_samples_per_second': 21.156, 'train_steps_per_second': 2.645, 'total_flos': 406025404416000.0, 'train_loss': 1.7062289098103842, 'epoch': 3.0})

In [11]:
# Шаг 8: Оценка на тестовой выборке
print("Оценка на тестовой выборке...")
metrics = trainer.evaluate(test_data)

The following columns in the evaluation set don't have a corresponding argument in `T5ForConditionalGeneration.forward` and have been ignored: highlights, id, article. If highlights, id, article are not expected by `T5ForConditionalGeneration.forward`,  you can safely ignore this message.

***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


Оценка на тестовой выборке...


In [12]:
# Шаг 9: Вывод метрик
def print_metrics(metrics):
    print("Evaluation Metrics:")
    print(f"ROUGE-1: {metrics['eval_rouge1']:.4f}")
    print(f"ROUGE-2: {metrics['eval_rouge2']:.4f}")
    print(f"ROUGE-L: {metrics['eval_rougeL']:.4f}")
    print(f"Loss: {metrics['eval_loss']:.4f}")
    print(f"Runtime (s): {metrics['eval_runtime']:.4f}")
    print(f"Samples per Second: {metrics['eval_samples_per_second']:.2f}")
    print(f"Steps per Second: {metrics['eval_steps_per_second']:.2f}")

# Вызов функции
print_metrics(metrics)


Evaluation Metrics:
ROUGE-1: 0.0051
ROUGE-2: 0.0020
ROUGE-L: 0.0042
Loss: 0.7639
Runtime (s): 18.4310
Samples per Second: 27.13
Steps per Second: 3.42
