In [1]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
from datasets import load_dataset, load_metric
from transformers import (
    DataCollatorForSeq2Seq,
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    pipeline,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
) 
import evaluate
import numpy as np
import datasets

In [3]:
tokenizer = MT5Tokenizer.from_pretrained("google/mt5-small")  # cointegrated/rut5-small google/mt5-small
model = MT5ForConditionalGeneration.from_pretrained('google/mt5-small')

# tokenizer = MT5Tokenizer.from_pretrained("results/checkpoint-121500")  # это если мы хотим подгрузить веса модели которую уже обучали
# model = MT5ForConditionalGeneration.from_pretrained('results/checkpoint-121500')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
max_target_tokens_count = 128
max_source_tokens_count = 1024


def preprocess_function(examples):
    inputs = examples["text"]
    model_inputs = tokenizer(inputs, max_length=max_source_tokens_count, truncation=True)

    labels = tokenizer(text_target=examples["summary"], max_length=max_target_tokens_count, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    
    return model_inputs

In [5]:
dataset = load_dataset('IlyaGusev/gazeta',revision="v2.0")
tokenized_dataset = dataset.map(preprocess_function, batched=True)

In [6]:
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [7]:
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

In [12]:
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Replace -100 in the labels as we can't decode them.
    predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Rouge expects a newline after each sentence
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]
    
    result = {}
    result_rouge = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    
    # Extract a few results
    result.update({key: value * 100 for key, value in result_rouge.items()})
    
    result_bleu = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # Extract a few results
    result["bleu"] = result_bleu["bleu"] * 100
    
    # Add mean generated length
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result["char_len"] = np.mean([len(t) for t in decoded_preds])
    
    return {k: round(v, 4) for k, v in result.items()}

In [9]:
training_args = Seq2SeqTrainingArguments(
    output_dir="/lab4/results",
    evaluation_strategy="steps",
    eval_steps=25,
    logging_steps=25,
    learning_rate=4e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=64,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=1,
    fp16=False,
    predict_with_generate=True,
    generation_max_length=max_target_tokens_count,
    generation_num_beams=5
)

In [13]:
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [14]:
trainer.train()

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Gen Len,Char Len
25,3.6494,2.606828,8.8847,2.7985,8.6692,8.7287,3.132,41.8011,153.214
50,3.3109,2.50356,12.9199,4.2022,12.663,12.7079,4.7506,64.9957,229.3964
75,3.1599,2.460199,15.2437,5.0339,14.9985,15.044,5.4921,79.2074,274.5487
100,3.0652,2.445562,15.8515,5.2613,15.5667,15.6325,6.0029,84.8892,295.209
125,3.0196,2.399887,15.633,5.1442,15.3465,15.4388,5.9906,84.439,293.4923
150,2.9681,2.392379,15.8855,5.0941,15.5497,15.6974,6.2764,87.9223,306.3782
175,2.9402,2.371411,15.7012,5.0092,15.4142,15.5328,6.333,88.4297,307.6535
200,2.9281,2.370409,16.0383,5.2497,15.725,15.8409,6.3366,88.1644,306.7936
225,2.9132,2.371033,16.2935,5.1997,15.9909,16.0896,6.375,87.1608,302.9937


TrainOutput(global_step=238, training_loss=3.0958816303926358, metrics={'train_runtime': 40122.4096, 'train_samples_per_second': 1.519, 'train_steps_per_second': 0.006, 'total_flos': 6.442993023307776e+16, 'train_loss': 3.0958816303926358, 'epoch': 1.0})

In [None]:
trainer.evaluate()

{'eval_loss': 2.365630626678467,
 'eval_rouge1': 16.1474,
 'eval_rouge2': 5.2142,
 'eval_rougeL': 15.8831,
 'eval_rougeLsum': 15.9676,
 'eval_bleu': 6.3574,
 'eval_gen_len': 86.9866,
 'eval_char_len': 302.4871,
 'eval_runtime': 5223.6655,
 'eval_samples_per_second': 1.3,
 'eval_steps_per_second': 0.325,
 'epoch': 1.0}