In [6]:
import nltk

# nltk.sent_tokenize() is called later when scoring summaries and needs the
# Punkt sentence-boundary models. Older NLTK releases read them from the
# "punkt" package, while newer releases look for "punkt_tab", so request both.
# With the downloader's default settings a package that cannot be fetched is
# reported in the cell output rather than raised.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [7]:
from datasets import load_dataset, load_metric
from transformers import (
    DataCollatorForSeq2Seq,
    MT5ForConditionalGeneration,
    MT5Tokenizer,
    pipeline,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
) 
import evaluate
import numpy as np
import datasets

# Directory handed to every `cache_dir=` argument below (model, tokenizer and
# dataset loading), so downloaded artifacts are kept in one place.
cache_dir = "/usr/src/app/!datasets/"

In [8]:
# Checkpoint names noted by the author: cointegrated/rut5-small, google/mt5-small.
tokenizer = MT5Tokenizer.from_pretrained(
    "google/mt5-small",
    cache_dir=cache_dir,
)
model = MT5ForConditionalGeneration.from_pretrained(
    "google/mt5-small",
    cache_dir=cache_dir,
)

# Use these instead to load the weights of a model that has already been trained:
# tokenizer = MT5Tokenizer.from_pretrained("results/checkpoint-121500")
# model = MT5ForConditionalGeneration.from_pretrained('results/checkpoint-121500')

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [9]:
# Truncation limits, in tokens, for the summaries (targets) and the articles (sources).
max_target_tokens_count = 128
max_source_tokens_count = 1024


def preprocess_function(examples):
    """Tokenize a batch of examples for seq2seq training.

    Args:
        examples: mapping with a "text" entry (model inputs) and a "summary"
            entry (targets), as supplied by ``dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output for the "text" entries, truncated to
        ``max_source_tokens_count``, with an extra "labels" entry holding the
        input ids of the "summary" entries truncated to
        ``max_target_tokens_count``.
    """
    encoded_sources = tokenizer(
        examples["text"],
        max_length=max_source_tokens_count,
        truncation=True,
    )
    encoded_targets = tokenizer(
        text_target=examples["summary"],
        max_length=max_target_tokens_count,
        truncation=True,
    )

    # The trainer expects the target token ids under the "labels" key.
    encoded_sources["labels"] = encoded_targets["input_ids"]
    return encoded_sources

In [10]:
# Load revision v2.0 of the dataset (stored under cache_dir), then tokenize
# every split in batches with preprocess_function.
dataset = load_dataset(
    "IlyaGusev/gazeta",
    revision="v2.0",
    cache_dir=cache_dir,
)
tokenized_dataset = dataset.map(
    preprocess_function,
    batched=True,
)

In [11]:
# Collator that assembles tokenized examples into batches for the trainer.
# The model is passed alongside the tokenizer; per the transformers docs this
# lets the collator prepare decoder inputs from the labels — TODO confirm for
# the installed version.
data_collator = DataCollatorForSeq2Seq(
    model=model,
    tokenizer=tokenizer,
)

In [12]:
# Metric objects used by compute_metrics. The two loads are independent.
bleu = evaluate.load("bleu")
# NOTE(review): `datasets.load_metric` is deprecated in favour of
# `evaluate.load` (this cell emits a warning for it). If this line is switched
# to `evaluate.load("rouge")`, verify that compute_metrics reads the scores in
# the form that loader returns.
rouge = load_metric("rouge")

  rouge = load_metric("rouge")


In [13]:
def compute_metrics(eval_pred):
    """Score generated summaries against the reference summaries.

    Args:
        eval_pred: ``(predictions, labels)`` pair of token-id arrays. If
            ``predictions`` arrives as a tuple, its first element is used.

    Returns:
        dict: metric name -> value rounded to 4 decimal places. Contains one
        entry per key returned by ``rouge.compute`` (F-measure scaled to
        0-100), ``"bleu"`` (scaled to 0-100), ``"gen_len"`` (mean number of
        non-pad tokens per prediction) and ``"char_len"`` (mean character
        length of the decoded predictions).
    """
    predictions, labels = eval_pred
    # Some model/trainer configurations return (generated_ids, ...) tuples.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    pad_id = tokenizer.pad_token_id
    # -100 marks ignored positions and cannot be decoded; map it to the pad id.
    labels = np.where(labels != -100, labels, pad_id)
    predictions = np.where(predictions != -100, predictions, pad_id)

    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    # ROUGE-Lsum expects one sentence per line.
    decoded_preds = ["\n".join(nltk.sent_tokenize(pred.strip())) for pred in decoded_preds]
    decoded_labels = ["\n".join(nltk.sent_tokenize(label.strip())) for label in decoded_labels]

    result = {}

    # NOTE(review): the stock ROUGE tokenizer appears to keep only [a-z0-9]
    # tokens, which would discard Cyrillic text. If the summaries are Russian,
    # confirm and consider a ROUGE loader that accepts a custom tokenizer.
    rouge_scores = rouge.compute(predictions=decoded_preds, references=decoded_labels)
    for rouge_name, rouge_value in rouge_scores.items():
        # Depending on how the metric was loaded, each value is either an
        # aggregate object exposing `.mid.fmeasure` or already a plain number.
        aggregate_mid = getattr(rouge_value, "mid", None)
        fmeasure = rouge_value if aggregate_mid is None else aggregate_mid.fmeasure
        result[rouge_name] = fmeasure * 100

    try:
        bleu_scores = bleu.compute(predictions=decoded_preds, references=decoded_labels)
        result["bleu"] = bleu_scores["bleu"] * 100
    except ZeroDivisionError:
        # The BLEU implementation divides by the total prediction/reference
        # length; when every decoded text is empty (possible early in
        # training) that is zero. Report 0 instead of aborting the run.
        result["bleu"] = 0.0

    # Mean generated length, in non-pad tokens and in characters.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)
    result["char_len"] = np.mean([len(text) for text in decoded_preds])

    return {name: round(value, 4) for name, value in result.items()}

In [14]:
# NOTE(review): in the recorded run each evaluation took ~3150 s and ran every
# 25 steps, so most of the ~29900 s training wall-clock was spent evaluating;
# consider a larger eval_steps or a smaller evaluation subset.
training_args = Seq2SeqTrainingArguments(
    output_dir="/results",
    evaluation_strategy="steps",
    eval_steps=25,
    logging_steps=25,
    # Checkpoint on the same cadence as evaluation. Without an explicit value
    # the library default save interval (500 steps) applies, which is longer
    # than the whole recorded 1-epoch run (238 optimizer steps), so no
    # checkpoint would ever be written and save_total_limit would be moot.
    save_strategy="steps",
    save_steps=25,
    learning_rate=4e-4,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # 4 examples x 64 accumulation steps = 256 examples per optimizer step (per device).
    gradient_accumulation_steps=64,
    weight_decay=0.01,
    # Keep only the 3 most recent checkpoints on disk.
    save_total_limit=3,
    num_train_epochs=1,
    fp16=False,
    # Evaluate with generate() so compute_metrics receives generated token ids.
    predict_with_generate=True,
    generation_max_length=max_target_tokens_count,
    generation_num_beams=5,
)

In [15]:
# Wire model, data, collator and metrics into a single trainer.
# NOTE(review): the "test" split is used for in-training evaluation; if the
# dataset also provides a validation split, confirm this choice is intended.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
)

Detected kernel version 4.15.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [16]:
# Run fine-tuning; the returned TrainOutput is shown as the cell result.
train_output = trainer.train()
train_output

Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bleu,Gen Len,Char Len
25,7.1569,2.81118,5.6305,1.6718,5.5433,5.5758,1.1431,29.9919,106.5925
50,3.696,2.621763,8.6522,2.6872,8.3916,8.478,2.8084,40.1075,147.0604
75,3.3494,2.514841,12.2101,3.9651,11.8991,12.0007,3.7626,52.6212,187.1547
100,3.185,2.487571,14.7088,4.7838,14.3653,14.4587,5.0365,72.6901,252.8382
125,3.113,2.439365,15.3278,4.9601,15.0137,15.0978,5.4697,79.246,275.2048
150,3.0461,2.420113,15.6599,5.0697,15.3493,15.445,5.8959,84.5623,294.6867
175,3.011,2.401659,15.4311,5.0081,15.1077,15.1995,6.0795,88.0181,306.9644
200,2.9928,2.397269,15.7357,5.0369,15.4218,15.5019,6.1591,86.4198,300.9968
225,2.9756,2.397243,15.9807,5.2007,15.6348,15.7207,6.1597,86.1786,299.9204


TrainOutput(global_step=238, training_loss=3.5793695690251197, metrics={'train_runtime': 29918.0259, 'train_samples_per_second': 2.038, 'train_steps_per_second': 0.008, 'total_flos': 6.442993023307776e+16, 'train_loss': 3.5793695690251197, 'epoch': 1.0})

In [17]:
# Final evaluation on the trainer's eval_dataset; the metrics dict is shown as the cell result.
final_eval_metrics = trainer.evaluate()
final_eval_metrics

{'eval_loss': 2.3926541805267334,
 'eval_rouge1': 15.9686,
 'eval_rouge2': 5.1256,
 'eval_rougeL': 15.6403,
 'eval_rougeLsum': 15.7116,
 'eval_bleu': 6.1229,
 'eval_gen_len': 85.836,
 'eval_char_len': 298.8378,
 'eval_runtime': 3153.3697,
 'eval_samples_per_second': 2.154,
 'eval_steps_per_second': 0.539,
 'epoch': 1.0}