In [1]:
# Transformers installation
! pip install transformers[torch] datasets -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git

In [2]:
!pip install transformers datasets evaluate rouge_score -q

In [3]:
# from huggingface_hub import notebook_login

# notebook_login()

## Load dataset

In [4]:
!pip install transformers[torch] sentencepiece datasets -q

In [5]:
!pip install wandb -q

In [6]:
import pandas as pd

In [7]:
import os


# Directory holding the exported CSV splits.
# Alternative location used previously: "data/processed/dialogsum_google_to_rus/"
data_path = "."

validation_csv = os.path.join(data_path, "output_validation.csv")
train_csv = os.path.join(data_path, "output_train.csv")

# Read the validation split first, then the training split.
data_val = pd.read_csv(validation_csv)
data_train = pd.read_csv(train_csv)

In [8]:
import datasets
from datasets import DatasetDict
def make_dataset(dataframe):
    """Convert a pandas frame into a 🤗 ``datasets.Dataset`` for summarization.

    Args:
        dataframe: Frame that contains at least the ``dialogue`` and
            ``summary`` columns; any other columns are dropped.

    Returns:
        A ``datasets.Dataset`` built from the ``dialogue`` and ``summary``
        columns only.
    """
    data = dataframe[["dialogue", "summary"]]
    # preserve_index=False keeps the pandas index out of the dataset. Without
    # it, a frame with a non-default index (e.g. after filtering rows) gains an
    # extra ``__index_level_0__`` column.
    return datasets.Dataset.from_pandas(data, preserve_index=False)
# Wrap each split, training first and validation second.
dataset_train, dataset_val = (
    make_dataset(frame) for frame in (data_train, data_val)
)

# Bundle the splits under the keys used by the rest of the notebook.
dataset_dict = DatasetDict({"train": dataset_train, "validation": dataset_val})

# Show one validation example as a sanity check.
dataset_val[20]

{'dialogue': '#Человек1#: Знаете ли вы, что употребление пива помогает лучше петь?\n#Человек2#: Ты уверен? Откуда вы знаете?\n#Человек1#: Ну, обычно люди думают, что я ужасный певец, но после того, как мы все выпьем немного пива, они говорят, что я звучу намного лучше!\n#Человек2#: Ну, я слышал, что если пить достаточно пива, то сможешь лучше говорить на иностранных языках. . .\n#Человек1#: Тогда, после нескольких кружек пива, ты будешь петь на тайваньском?\n#Человек2#: Возможно. . .',
 'summary': '#Человек1# говорит, что пиво помогает лучше петь, но #Человек2# слышал, что пиво помогает говорить на иностранных языках.'}

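The data is read from two local CSV files, `output_train.csv` and `output_validation.csv`, and wrapped in a 🤗 Datasets `DatasetDict`. Judging by the commented-out path (`dialogsum_google_to_rus`), this appears to be the DialogSum dataset machine-translated into Russian.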

There are two fields that you'll want to use:

- `dialogue`: the text of the dialogue, which will be the input to the model.
- `summary`: a condensed version of `dialogue`, which will be the model target.

## Model and tokenizer loading

In [9]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; the same name is reused when the
# model weights are loaded so that tokenizer and model match.
checkpoint = "IlyaGusev/rut5_base_headline_gen_telegram"
# Tokenizer shipped with that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


In [49]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the seq2seq model from the same checkpoint the tokenizer was loaded from.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

## Inference

In [50]:
# Number of validation examples used for the qualitative check.
n_samples = 10
# First n_samples rows of the validation split.
validation_data = dataset_dict["validation"].select(range(n_samples))

In [51]:
# Dialogue of the first selected validation example, used as a quick single-input check.
text = validation_data[0]["dialogue"]

In [52]:
from transformers import pipeline

import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=pipeline_device)
# Summarize the single sample dialogue with default generation settings.
summarizer(text)

[{'summary_text': '#Человек2#: У меня проблемы с дыханием, я не простудился'}]

Log the results to W&B:

In [53]:
import wandb

In [54]:
# wandb.log() needs an active run. When the notebook is executed top to bottom
# no run exists yet at this point, so start one in the project used for
# training and reuse the current run if there already is one.
if wandb.run is None:
    wandb.init(project="tg-summarizer")

table = wandb.Table(columns=["Input Text", "Target Summary", "Generated Summary"])

# Summarize every selected validation example and add one row per example.
for example in validation_data:
    input_text = example["dialogue"]
    target_summary = example["summary"]

    # Beam search, with the generated length constrained to 50-150 tokens.
    generated_summary = summarizer(input_text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    table.add_data(input_text, target_summary, generated_summary[0]["summary_text"])

# Log the table of pre-fine-tuning summaries.
wandb.log({"summarization_before_fine_tuning": table})


Your max_length is set to 150, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


## Different metrics test

In [11]:
import evaluate


In [12]:
# test text pieces
# Toy data for comparing metric tokenizers: one hypothesis per example ...
predictions = ["русский язык не ожиданно", "привет братья", "русский язык не ожиданно"]
# ... and a list of one or more acceptable references for each hypothesis.
references = [[" на русской молве", "русский язык не ожиданно"], ["здравствуйсте комрады"], ["русский язык не ожиданно"]]



In [13]:
# Baseline: BLEU computed with the metric's default tokenizer.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references)
print(results)

{'bleu': 0.9099882808096075, 'precisions': [0.8, 0.8571428571428571, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1111111111111112, 'translation_length': 10, 'reference_length': 9}


In [14]:
# BLEU with plain whitespace splitting; on this toy data the recorded scores
# are identical to the default-tokenizer run.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references, tokenizer=lambda x: x.split())
print(results)


{'bleu': 0.9099882808096075, 'precisions': [0.8, 0.8571428571428571, 1.0, 1.0], 'brevity_penalty': 1.0, 'length_ratio': 1.1111111111111112, 'translation_length': 10, 'reference_length': 9}


In [15]:
# NOTE(review): this passes the tokenizer object itself. Calling it returns an
# encoding with `input_ids` / `attention_mask` rather than a list of tokens,
# and the recorded BLEU for this cell is 0.0. Presumably the metric ends up
# iterating over the encoding's keys - TODO confirm. Passing
# `tokenizer.tokenize` yields real token lists.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references, tokenizer=tokenizer)
print(results)

{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0.0], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 6, 'reference_length': 6}


In [16]:
# BLEU over the model tokenizer's subword pieces.
bleu = evaluate.load("bleu")
results = bleu.compute(predictions=predictions, references=references, tokenizer=tokenizer.tokenize)
print(results)

{'bleu': 0.7929873728856548, 'precisions': [0.8333333333333334, 0.8, 0.8333333333333334, 0.8888888888888888], 'brevity_penalty': 0.9459594689067654, 'length_ratio': 0.9473684210526315, 'translation_length': 18, 'reference_length': 19}


In [17]:
# Three views of the same sentence:
# full encoding returned by calling the tokenizer (ids and attention mask),
print(tokenizer("русский язык не ожиданно"))
# subword pieces produced by tokenizer.tokenize,
print(tokenizer.tokenize("русский язык не ожиданно"))
# and plain whitespace-separated words.
print("русский язык не ожиданно".split())

{'input_ids': [259, 16735, 4217, 401, 259, 9807, 13796, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}
['▁', 'русский', '▁язык', '▁не', '▁', 'ожида', 'нно']
['русский', 'язык', 'не', 'ожиданно']


In [18]:
# ROUGE with the metric's default tokenizer. The recorded scores are all 0.0,
# presumably because the default tokenization discards Cyrillic characters -
# TODO confirm against the rouge_score tokenizer.
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions,
                        references=references)
print(results)

{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, 'rougeLsum': 0.0}


In [19]:
# ROUGE with plain whitespace splitting as the tokenizer.
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions,
                        references=references,
                        tokenizer=lambda x: x.split())
print(results)

{'rouge1': 0.6666666666666666, 'rouge2': 0.6666666666666666, 'rougeL': 0.6666666666666666, 'rougeLsum': 0.6666666666666666}


In [20]:
# ROUGE over the model tokenizer's subword pieces, so partial word overlap
# (for example a shared stem) can still count.
rouge = evaluate.load('rouge')

results = rouge.compute(predictions=predictions,
                        references=references,
                        tokenizer=tokenizer.tokenize)
print(results)

{'rouge1': 0.7272727272727272, 'rouge2': 0.6666666666666666, 'rougeL': 0.7272727272727272, 'rougeLsum': 0.7272727272727272}


В общем, думаю, лучше использовать «умный» токенизатор модели (`tokenizer.tokenize`), так как он учитывает, что хотя бы корень слова может быть идентичным при разных окончаниях.

## Preprocess

The preprocessing function you want to create needs to:

1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks.
2. Use the keyword `text_target` argument when tokenizing labels.
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.

In [21]:
# Task prompt prepended to every dialogue; left empty for this checkpoint.
prefix = ""


def preprocess_function(examples, max_input_length=1024, max_target_length=128):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    Args:
        examples: Batch mapping with a ``dialogue`` list (model inputs) and a
            ``summary`` list (targets) of equal length.
        max_input_length: Inputs longer than this many tokens are truncated.
        max_target_length: Targets longer than this many tokens are truncated.

    Returns:
        The tokenizer output for the inputs, with the target token ids added
        under the ``labels`` key.
    """
    inputs = [prefix + doc for doc in examples["dialogue"]]
    model_inputs = tokenizer(inputs, max_length=max_input_length, truncation=True)

    # text_target makes the tokenizer encode the summaries as targets.
    labels = tokenizer(text_target=examples["summary"], max_length=max_target_length, truncation=True)

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [22]:
# Tokenize every split; batched=True hands preprocess_function whole batches of examples.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map:   0%|          | 0/12460 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [23]:
from transformers import DataCollatorForSeq2Seq

# Pass the loaded model object, not the checkpoint name: the collator only uses
# `model` to build decoder inputs from the labels, and a plain string is
# silently ignored for that purpose.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) and BLEU metrics (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [24]:
import evaluate

# Metric objects read as globals by compute_metrics.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the ROUGE and BLEU metrics:

In [25]:
import numpy as np


def compute_metrics(eval_pred):
    """Compute ROUGE, BLEU and mean generated length for one evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)`` of token-id arrays.
            ``predictions`` may also be a tuple whose first element holds the
            generated ids. Positions equal to ``-100`` are treated as padding.

    Returns:
        Dict with ROUGE scores under ``rouge_*`` keys, BLEU outputs under
        ``bleu_*`` keys and the mean number of non-padding predicted tokens
        under ``gen_len``. Float values are rounded to 4 decimals; other
        values (such as the BLEU precision list) are returned unchanged.
    """
    predictions, labels = eval_pred
    # Some models return a tuple of outputs; only the token ids are scored.
    if isinstance(predictions, tuple):
        predictions = predictions[0]

    # -100 marks ignored positions and cannot be decoded, so map it to the pad id.
    pad_id = tokenizer.pad_token_id
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Both metrics are scored on the model tokenizer's subword pieces.
    result_rouge = rouge.compute(predictions=decoded_preds,
                                 references=decoded_labels,
                                 tokenizer=tokenizer.tokenize)
    bleu_results = bleu.compute(predictions=decoded_preds,
                                references=decoded_labels,
                                tokenizer=tokenizer.tokenize)

    result = {
        **{"rouge_" + k: v for k, v in result_rouge.items()},
        **{"bleu_" + k: v for k, v in bleu_results.items()},
    }

    # Generated length = number of non-padding tokens per prediction.
    prediction_lens = [np.count_nonzero(pred != pad_id) for pred in predictions]
    result["gen_len"] = np.mean(prediction_lens)

    return {k: round(v, 4) if isinstance(v, float) else v
            for k, v in result.items()}

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

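You're ready to start training your model now! The model was already loaded with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM) in the "Model and tokenizer loading" section, so here you only need to import the training classes: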

In [26]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [27]:
# import wandb

# wandb.login()

In [28]:
import torch
import os


# CUDA availability flag intended for mixed precision. The fp16/bf16 options
# are disabled in the arguments below, so the flag is not consumed there.
use_fp16 = torch.cuda.is_available()

# Name of the W&B project the Trainer reports to.
os.environ["WANDB_PROJECT"] = "tg-summarizer"

training_args = Seq2SeqTrainingArguments(
    output_dir="my_awesome_billsum_model",
    report_to="wandb",  # enable logging to W&B
    # Schedule: evaluate once per epoch, log every 100 steps, keep 3 checkpoints.
    num_train_epochs=20,
    evaluation_strategy="epoch",
    logging_steps=100,
    save_total_limit=3,
    # Optimisation.
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    # fp16=use_fp16,
    # bf16=True,
    # Evaluation generates summaries so that ROUGE/BLEU can be computed.
    predict_with_generate=True,
    generation_max_length=100,
    # push_to_hub=True,
)

# Small subsets keep each epoch short: 1000 of the ~12k training rows and
# 10 validation rows.
train_subset = tokenized_dataset["train"].select(range(1000))
eval_subset = tokenized_dataset["validation"].select(range(10))

trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_subset,
    eval_dataset=eval_subset,
    compute_metrics=compute_metrics,
)

trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mamirfvb[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Rouge Rouge1,Rouge Rouge2,Rouge Rougel,Rouge Rougelsum,Bleu Bleu,Bleu Precisions,Bleu Brevity Penalty,Bleu Length Ratio,Bleu Translation Length,Bleu Reference Length,Gen Len
1,1.6912,1.386762,0.4414,0.2393,0.3766,0.3772,0.2416,"[0.4792626728110599, 0.28537735849056606, 0.1956521739130435, 0.14356435643564355]",0.9705,0.9709,434,447,45.5
2,1.5602,1.304369,0.4674,0.2784,0.4005,0.4009,0.2566,"[0.555256064690027, 0.3518005540166205, 0.2564102564102564, 0.19648093841642228]",0.8148,0.83,371,447,39.1
3,1.491,1.281436,0.4682,0.2825,0.4016,0.4004,0.2788,"[0.4831223628691983, 0.30603448275862066, 0.22687224669603523, 0.18018018018018017]",1.0,1.0604,474,447,49.4
4,1.4217,1.240335,0.4915,0.3016,0.4177,0.4184,0.2956,"[0.508695652173913, 0.3288888888888889, 0.24545454545454545, 0.18604651162790697]",1.0,1.0291,460,447,48.0
5,1.3328,1.246128,0.4646,0.2635,0.3966,0.3966,0.2628,"[0.5023255813953489, 0.30714285714285716, 0.21951219512195122, 0.165]",0.9612,0.962,430,447,45.0
6,1.3104,1.206764,0.4673,0.2765,0.4007,0.4025,0.2743,"[0.5058548009367682, 0.31894484412470026, 0.2334152334152334, 0.181360201511335]",0.9542,0.9553,427,447,44.7
7,1.294,1.21525,0.472,0.2835,0.416,0.4168,0.2847,"[0.5, 0.3165137614678899, 0.2323943661971831, 0.18028846153846154]",0.9978,0.9978,446,447,46.6
8,1.2446,1.200177,0.4501,0.2566,0.3831,0.3819,0.2583,"[0.495260663507109, 0.3058252427184466, 0.22139303482587064, 0.1683673469387755]",0.9425,0.9441,422,447,44.2
9,1.2304,1.208618,0.4672,0.2803,0.3969,0.3977,0.2805,"[0.5070422535211268, 0.3245192307692308, 0.23891625615763548, 0.1919191919191919]",0.9519,0.953,426,447,44.6


Exception ignored in: <function _xla_gc_callback at 0x7d6b405eaf80>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/jax/_src/lib/__init__.py", line 101, in _xla_gc_callback
    def _xla_gc_callback(*args):
KeyboardInterrupt: 


KeyboardInterrupt: ignored

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# Upload the trained model to the Hugging Face Hub. This requires being signed
# in; note that the notebook_login cell near the top is commented out.
trainer.push_to_hub()

<Tip>

For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).

</Tip>

## Inference

In [29]:
# Same sample size as in the pre-training check, so the two tables are comparable.
n_samples = 10
# First n_samples rows of the validation split.
validation_data = dataset_dict["validation"].select(range(n_samples))

In [31]:
# Dialogue of the first selected validation example, used as a quick single-input check.
text = validation_data[0]["dialogue"]

In [47]:
from transformers import pipeline

import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
pipeline_device = "cuda" if torch.cuda.is_available() else "cpu"
summarizer = pipeline("summarization", model=model, tokenizer=tokenizer, device=pipeline_device)
# Summarize the single sample dialogue with the fine-tuned model.
summarizer(text)

[{'summary_text': '#Человек2# жалуется, что у него проблемы с дыханием. #Person1# сообщает #Person1#, что у #Person1# нет аллергии.'}]

Log the results to W&B:

In [38]:
import wandb

In [48]:
# wandb.log() needs an active run; reuse the current one or start one in the
# training project if none is active.
if wandb.run is None:
    wandb.init(project="tg-summarizer")

table = wandb.Table(columns=["Input Text", "Target Summary", "Generated Summary"])

# Summarize every selected validation example and add one row per example.
for example in validation_data:
    input_text = example["dialogue"]
    target_summary = example["summary"]

    # Same generation settings as the pre-fine-tuning table, so the two are comparable.
    generated_summary = summarizer(input_text, max_length=150, min_length=50, length_penalty=2.0, num_beams=4, early_stopping=True)

    table.add_data(input_text, target_summary, generated_summary[0]["summary_text"])

# Key spelled to mirror "summarization_before_fine_tuning" (was "..._fine_tunin").
wandb.log({"summarization_after_fine_tuning": table})


Your max_length is set to 150, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
