In [1]:
import torch
from transformers import MT5ForConditionalGeneration, MT5Tokenizer
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset
from os import environ


In [2]:
model_name = "yelyah/mT5-XLSUM-ua-news"
dataset_name = "andriibul/ukr-news-yt-summary"

model_max_length = 92

In [3]:
tokenizer = MT5Tokenizer.from_pretrained(
    model_name, 
    model_max_length=model_max_length, 
)
model = MT5ForConditionalGeneration.from_pretrained(
    model_name, 
)


tokenizer_config.json:   0%|          | 0.00/833 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/416 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/937 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.33G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/204 [00:00<?, ?B/s]

In [4]:
model.config

MT5Config {
  "_name_or_path": "yelyah/mT5-XLSUM-ua-news",
  "architectures": [
    "MT5ForConditionalGeneration"
  ],
  "classifier_dropout": 0.0,
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.6,
  "max_length": 84,
  "model_type": "mt5",
  "no_repeat_ngram_size": 2,
  "num_beams": 4,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_past": true,
  "pad_token_id": 0,
  "relative_attention_max_distance": 128,
  "relative_attention_num_buckets": 32,
  "tie_word_embeddings": false,
  "tokenizer_class": "T5Tokenizer",
  "torch_dtype": "float32",
  "transformers_version": "4.45.1",
  "use_cache": true,
  "vocab_size": 250112
}

In [5]:
model.generation_config

GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "length_penalty": 0.6,
  "max_length": 84,
  "no_repeat_ngram_size": 2,
  "num_beams": 4,
  "pad_token_id": 0
}

In [6]:
# Dont forget to push GenerationConfig to hub
model.generation_config.max_length=model_max_length # max length of input
model.generation_config.max_new_tokens=model_max_length # max length of generation
model.generation_config.repetition_penalty=2.0

In [7]:
model.generation_config

GenerationConfig {
  "decoder_start_token_id": 0,
  "eos_token_id": 1,
  "length_penalty": 0.6,
  "max_length": 92,
  "max_new_tokens": 92,
  "no_repeat_ngram_size": 2,
  "num_beams": 4,
  "pad_token_id": 0,
  "repetition_penalty": 2.0
}

In [8]:
def preprocess_function(examples):
    inputs = [str(item) for item in examples["description"]]

    model_inputs = tokenizer(
        inputs, 
        max_length=model_max_length, 
        padding='max_length', 
        truncation=True
    )
    labels = tokenizer(
        text_target=examples["title"],
        max_length=model_max_length,
        padding='max_length',
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

dataset = (
    load_dataset(dataset_name)["train"]# .shuffle(seed=42).select(range(100))
    .train_test_split(test_size=0.1, seed=42)
)

tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["title", "description"]
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

README.md:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

dataset.json:   0%|          | 0.00/84.4M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75760 [00:00<?, ? examples/s]

Map:   0%|          | 0/68184 [00:00<?, ? examples/s]

Map:   0%|          | 0/7576 [00:00<?, ? examples/s]

In [9]:
len(tokenized_dataset['train'][0]['input_ids'])
len(tokenized_dataset['train'][2]['input_ids'])

92

In [10]:
' '.join(tokenizer.batch_decode(tokenized_dataset['train'][0]['input_ids'], skip_special_tokens=True), )

'Україн ські військов і  форс ують  Дніпро на Х ерсон щин і . Про це  повідомл яє американ ський  і нститут вив чення в ійни . П иш уть , що сили оборон и  ні бито актив із ували сво ю  діяльн ість на  певн их остров ах  дель ти  Дніпр а біл я Антон івського мост у , що б за кріп ити  позиці ї  і під гот уватися до наступ у на  лів обереж ж я Х '

In [11]:
!pip install evaluate
!pip install rouge_score
!pip install bert_score

  pid, fd = os.forkpty()


Collecting evaluate
  Downloading evaluate-0.4.3-py3-none-any.whl.metadata (9.2 kB)
Downloading evaluate-0.4.3-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: evaluate
Successfully installed evaluate-0.4.3
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25ldone
[?25h  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=05d97d8d4efa817263e90e5c6b5ef89c72f135c9c652a5986486b9373a7c515f
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
Collecting bert_score
  Downloading bert_score

In [12]:
import evaluate
import numpy as np

rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore", model_type='google/mt5-small')

def compute_metrics(eval_preds):
    logits, labels = eval_preds
    decoded_preds = tokenizer.batch_decode(logits, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    results = rouge.compute(predictions=decoded_preds, references=decoded_labels, tokenizer=lambda x: tokenizer(x)['input_ids'])
    bertscore_results = bertscore.compute(predictions=decoded_preds, references=decoded_labels, model_type='google/mt5-small')
    results.update({
        "bertscore_recall": sum(bertscore_results['recall'])/len(bertscore_results['recall']),
        "bertscore_precision": sum(bertscore_results['precision'])/len(bertscore_results['precision']),
        "bertscore_f1": sum(bertscore_results['f1'])/len(bertscore_results['f1']),
    })
    return results

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

In [13]:
!pip install wandb



In [14]:
environ["WANDB_PROJECT"]="Fine-tune T5 ukr-news-yt-summary"
environ["WANDB_LOG_MODEL"] = "checkpoint"

In [15]:
import wandb
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [16]:
torch.cuda.empty_cache()

training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    logging_strategy="steps",
    logging_steps=1000,
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    save_only_model=True,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=12,
    predict_with_generate=True,
    warmup_ratio=0.01,
    learning_rate=3e-4,
    num_train_epochs=4,
    report_to="wandb",
    run_name="t5-4epoch",
    fp16=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [17]:
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mandriibul[0m ([33mandriibul-nocompany[0m). Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011113216066668328, max=1.0…

Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Bertscore Recall,Bertscore Precision,Bertscore F1
1,0.9294,0.842479,0.271782,0.103618,0.228602,0.228587,0.565692,0.583316,0.573491
2,0.8071,0.784943,0.279202,0.112299,0.237847,0.23782,0.568209,0.591902,0.578975
3,0.7368,0.766242,0.28702,0.116288,0.240612,0.24051,0.578629,0.586414,0.581716
4,0.6776,0.762051,0.288966,0.118548,0.242498,0.242476,0.580157,0.588808,0.583695


tokenizer_config.json:   0%|          | 0.00/82.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/553 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/1.20G [00:00<?, ?B/s]

[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-8523)... Done. 9.7s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-17046)... Done. 8.5s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-25569)... Done. 8.4s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-34092)... Done. 8.6s
[34m[1mwandb[0m: Adding directory to artifact (./results/checkpoint-34092)... Done. 8.6s


TrainOutput(global_step=34092, training_loss=0.8276213284852456, metrics={'train_runtime': 19978.1502, 'train_samples_per_second': 13.652, 'train_steps_per_second': 1.706, 'total_flos': 5.876195353834291e+16, 'train_loss': 0.8276213284852456, 'epoch': 4.0})

In [18]:
wandb.finish()

VBox(children=(Label(value='8891.056 MB of 8891.056 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
eval/bertscore_f1,▁▅▇█
eval/bertscore_precision,▁█▄▅
eval/bertscore_recall,▁▂▇█
eval/loss,█▃▁▁
eval/rouge1,▁▄▇█
eval/rouge2,▁▅▇█
eval/rougeL,▁▆▇█
eval/rougeLsum,▁▆▇█
eval/runtime,▅▁▇█
eval/samples_per_second,▄█▂▁

0,1
eval/bertscore_f1,0.58369
eval/bertscore_precision,0.58881
eval/bertscore_recall,0.58016
eval/loss,0.76205
eval/rouge1,0.28897
eval/rouge2,0.11855
eval/rougeL,0.2425
eval/rougeLsum,0.24248
eval/runtime,1357.4853
eval/samples_per_second,5.581


In [19]:
def generate(text, **kwargs):
    inputs = tokenizer(text, return_tensors="pt").to('cuda')
    with torch.no_grad():
        hypotheses = model.generate(**inputs, **kwargs)
    print(len(hypotheses))
    return tokenizer.batch_decode(hypotheses, skip_special_tokens=True)


In [20]:
generate(
"""
Американці знали, що буде в плані перемоги Зеленського і план ніяк не змінив їхні думки щодо війни України та РФ. Неназваний чиновник адміністрації Байдена назвав мирний план Зеленського списком бажань
""",
    num_return_sequences=5,
    num_beams=10,
    do_sample=True,
    top_p=0.7,
)

5


['⚡️Екстрено! Байден ОШЕЛЕШИВ заявою про Зеленського. Слухайте до кінця',
 '⚡️ТЕРМІНОВО! Байден ШОКУВАВ заявою про МИРНИЙ ПЛАН Зеленського. Слухайте до кінця',
 '⚡️Екстрено! Байден ОШЕЛЕШИВ заявою про Зеленського. Слухайте, що сказав',
 '⚡️ТЕРМІНОВО! Байден ШОКУВАВ заявою про МИРНИЙ ПЛАН Зеленського. Слухайте, що сказав',
 '⚡️ТЕРМІНОВО! Байден ШОКУВАВ заявою про МИРНИЙ ПЛАН Зеленського. Слухайте, що назріває']