In [1]:
# Transformers installation
! pip install "transformers[torch]" datasets -q
# To install from source instead of the last release, comment the command above and uncomment the following one.
# ! pip install git+https://github.com/huggingface/transformers.git



In [2]:
!pip install transformers datasets evaluate rouge_score -q



In [3]:
# from huggingface_hub import notebook_login

# notebook_login()

## Load dataset

In [6]:
!pip install "transformers[torch]" sentencepiece datasets -q

In [7]:
!pip install wandb -q

In [8]:
import pandas as pd

In [11]:
import os


# Directory holding the exported CSV splits (alternative locations kept for reference).
# data_path = "data/processed/dialogsum_google_to_rus/"
# data_path = "."
data_path = ".."

# Read the validation split first, then the training split, from `data_path`.
data_val, data_train = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ("output_validation.csv", "output_train.csv")
)

In [12]:
import datasets
from datasets import DatasetDict
def make_dataset(dataframe):
    """Build a `datasets.Dataset` from the "dialogue" and "summary" columns of `dataframe`."""
    # Any other columns of the frame are dropped before conversion.
    return datasets.Dataset.from_pandas(dataframe[["dialogue", "summary"]])
# Convert both splits, then bundle them under the split names used by the rest of the notebook.
dataset_train, dataset_val = (make_dataset(frame) for frame in (data_train, data_val))
dataset_dict = DatasetDict({"train": dataset_train, "validation": dataset_val})

# Show one validation example to sanity-check the two columns.
dataset_val[20]

  from .autonotebook import tqdm as notebook_tqdm


{'dialogue': '#Человек1#: Знаете ли вы, что употребление пива помогает лучше петь?\n#Человек2#: Ты уверен? Откуда вы знаете?\n#Человек1#: Ну, обычно люди думают, что я ужасный певец, но после того, как мы все выпьем немного пива, они говорят, что я звучу намного лучше!\n#Человек2#: Ну, я слышал, что если пить достаточно пива, то сможешь лучше говорить на иностранных языках. . .\n#Человек1#: Тогда, после нескольких кружек пива, ты будешь петь на тайваньском?\n#Человек2#: Возможно. . .',
 'summary': '#Человек1# говорит, что пиво помогает лучше петь, но #Человек2# слышал, что пиво помогает говорить на иностранных языках.'}

The data loaded above from CSV files is a Russian-language dialogue summarization dataset; it replaces the BillSum dataset used in the original tutorial.

There are two fields that you'll want to use:

- `dialogue`: the text of the dialogue, which will be the input to the model.
- `summary`: a condensed version of `dialogue`, which will be the model target.

## Model and tokenizer loading

In [13]:
from transformers import AutoTokenizer

# Hub identifier of the pretrained checkpoint; the tokenizer is loaded from it here.
checkpoint = "IlyaGusev/rut5_base_headline_gen_telegram"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)


Downloading (…)okenizer_config.json: 100%|██████████| 327/327 [00:00<00:00, 26.4kB/s]
Downloading spiece.model: 100%|██████████| 828k/828k [00:00<00:00, 2.42MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 1.31M/1.31M [00:00<00:00, 53.8MB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 65.0/65.0 [00:00<00:00, 25.9kB/s]


In [14]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Load the pretrained seq2seq model from `checkpoint`.
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

Downloading (…)lve/main/config.json: 100%|██████████| 766/766 [00:00<00:00, 64.8kB/s]
Downloading pytorch_model.bin: 100%|██████████| 977M/977M [00:18<00:00, 53.8MB/s] 


## Inference before fine-tuning

In [15]:
# Keep only the first `n_samples` validation examples for a quick qualitative check.
n_samples = 10
validation_data = dataset_dict["validation"].select(range(n_samples))

In [16]:
# Dialogue of the first selected example, used as a single test input.
text = validation_data[0]["dialogue"]

In [17]:
from transformers import pipeline

import torch

# Use the GPU when one is available and fall back to the CPU otherwise, so the
# cell does not fail on machines without CUDA.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=("cuda" if torch.cuda.is_available() else "cpu"),
)
summarizer(text)

[{'summary_text': '#Человек2#: У меня проблемы с дыханием, я не простудился'}]

Log the results to W&B:

In [20]:
import wandb

# Name the project explicitly: this run stays active for the rest of the notebook,
# and the Trainer's W&B integration reuses an already active run, so setting the
# WANDB_PROJECT environment variable after this call has no effect on it.
wandb.init(project="tg-summarizer")

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /home/coder/.netrc
huggingface/tokenizers: The current process just got forked, after parallelism has alr

In [21]:
# One row per validation example: source dialogue, reference summary, model output.
table = wandb.Table(columns=["Input Text", "Target Summary", "Generated Summary"])

for sample in validation_data:
    # Beam-search generation with identical settings for every example.
    outputs = summarizer(
        sample["dialogue"],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    table.add_data(sample["dialogue"], sample["summary"], outputs[0]["summary_text"])

# Log the filled table under the "before fine-tuning" key.
wandb.log({"summarization_before_fine_tuning": table})


Your max_length is set to 150, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


## Preprocess

The preprocessing function you want to create needs to:

1. Prefix the input with a prompt so T5 knows this is a summarization task. Some models capable of multiple NLP tasks require prompting for specific tasks. In this notebook the prefix is left empty.
2. Use the keyword `text_target` argument when tokenizing labels.
3. Truncate sequences to be no longer than the maximum length set by the `max_length` parameter.

In [22]:
# Prompt prepended to every dialogue; empty here, so inputs are tokenized unchanged.
prefix = ""


def preprocess_function(examples):
    """Tokenize a batch of dialogues and summaries for seq2seq training.

    `examples` is a batch with "dialogue" and "summary" entries. Dialogues are
    truncated to 1024 tokens and summaries to 128 tokens. Returns the tokenized
    inputs with the summary token ids stored under "labels".
    """
    sources = [prefix + dialogue for dialogue in examples["dialogue"]]
    batch = tokenizer(sources, max_length=1024, truncation=True)

    # `text_target` makes the tokenizer treat the summaries as targets.
    targets = tokenizer(text_target=examples["summary"], max_length=128, truncation=True)
    batch["labels"] = targets["input_ids"]

    return batch

To apply the preprocessing function over the entire dataset, use 🤗 Datasets [map](https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map) method. You can speed up the `map` function by setting `batched=True` to process multiple elements of the dataset at once:

In [23]:
# Apply `preprocess_function` to every split; `batched=True` passes several examples per call.
tokenized_dataset = dataset_dict.map(preprocess_function, batched=True)

Map: 100%|██████████| 12460/12460 [00:03<00:00, 3257.22 examples/s]
Map: 100%|██████████| 500/500 [00:00<00:00, 3123.68 examples/s]


Now create a batch of examples using [DataCollatorForSeq2Seq](https://huggingface.co/docs/transformers/main/en/main_classes/data_collator#transformers.DataCollatorForSeq2Seq). It's more efficient to *dynamically pad* the sentences to the longest length in a batch during collation, instead of padding the whole dataset to the maximum length.

In [24]:
from transformers import DataCollatorForSeq2Seq

# Pass the model object itself, not the checkpoint name: the collator looks for the
# model's `prepare_decoder_input_ids_from_labels` method to build `decoder_input_ids`,
# and a plain string has no such method.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

## Evaluate

Including a metric during training is often helpful for evaluating your model's performance. You can quickly load an evaluation method with the 🤗 [Evaluate](https://huggingface.co/docs/evaluate/index) library. For this task, load the [ROUGE](https://huggingface.co/spaces/evaluate-metric/rouge) metric, together with BLEU (see the 🤗 Evaluate [quick tour](https://huggingface.co/docs/evaluate/a_quick_tour) to learn more about how to load and compute a metric):

In [25]:
import evaluate

# Load both metrics once; `compute_metrics` reads them as globals.
rouge = evaluate.load("rouge")
bleu = evaluate.load("bleu")

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 4.24MB/s]
Downloading builder script: 100%|██████████| 5.94k/5.94k [00:00<00:00, 4.47MB/s]
Downloading extra modules: 4.07kB [00:00, 3.91MB/s]                   
Downloading extra modules: 100%|██████████| 3.34k/3.34k [00:00<00:00, 3.11MB/s]


Then create a function that passes your predictions and labels to [compute](https://huggingface.co/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute) to calculate the ROUGE and BLEU metrics:

In [26]:
import numpy as np


def compute_metrics(eval_pred):
    """Decode predictions and labels and score them with ROUGE and BLEU.

    `eval_pred` unpacks into `(predictions, labels)` arrays of token ids. Any
    -100 entries are replaced with the pad token id before decoding. Returns a
    flat dict with `rouge_*` and `bleu_*` entries plus `gen_len` (mean number of
    non-pad tokens per prediction); float values are rounded to 4 decimals.
    """
    pad_id = tokenizer.pad_token_id
    predictions, labels = eval_pred

    # Swap the -100 placeholders for the pad id so both arrays can be decoded.
    predictions = np.where(predictions != -100, predictions, pad_id)
    labels = np.where(labels != -100, labels, pad_id)
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Both metrics are computed with the model's own tokenizer.
    rouge_scores = rouge.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=tokenizer.tokenize,
    )
    bleu_scores = bleu.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        tokenizer=tokenizer.tokenize,
    )

    result = {}
    for tag, scores in (("rouge_", rouge_scores), ("bleu_", bleu_scores)):
        for key, value in scores.items():
            result[tag + key] = value

    # Average count of non-pad tokens across the generated sequences.
    result["gen_len"] = np.mean(
        [np.count_nonzero(pred != pad_id) for pred in predictions]
    )

    rounded = {}
    for key, value in result.items():
        rounded[key] = round(value, 4) if isinstance(value, float) else value
    return rounded

Your `compute_metrics` function is ready to go now, and you'll return to it when you set up your training.

## Train

<Tip>

If you aren't familiar with finetuning a model with the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer), take a look at the basic tutorial [here](https://huggingface.co/docs/transformers/main/en/tasks/../training#train-with-pytorch-trainer)!

</Tip>

You're ready to start training your model now! The model was already loaded with [AutoModelForSeq2SeqLM](https://huggingface.co/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForSeq2SeqLM) earlier in this notebook, so this cell only imports the training classes:

In [27]:
from transformers import AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer

At this point, only three steps remain:

1. Define your training hyperparameters in [Seq2SeqTrainingArguments](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainingArguments). The only required parameter is `output_dir` which specifies where to save your model. You'll push this model to the Hub by setting `push_to_hub=True` (you need to be signed in to Hugging Face to upload your model). At the end of each epoch, the [Trainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer) will evaluate the ROUGE metric and save the training checkpoint.
2. Pass the training arguments to [Seq2SeqTrainer](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Seq2SeqTrainer) along with the model, dataset, tokenizer, data collator, and `compute_metrics` function.
3. Call [train()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train) to finetune your model.

In [30]:
import torch
import os


# True when a CUDA device is available. Currently unused: the `fp16=use_fp16`
# argument below is commented out.
use_fp16 = torch.cuda.is_available()

# NOTE(review): `wandb.init()` was already called earlier in this notebook; the
# Trainer's W&B integration presumably reuses that active run, in which case this
# variable does not change the run's project — confirm.
os.environ["WANDB_PROJECT"] = "tg-summarizer" # name your W&B project

# Training configuration: evaluation runs once per epoch for 20 epochs, with a
# batch size of 3 per device for both training and evaluation.
training_args = Seq2SeqTrainingArguments(
    # Directory for checkpoints; the name appears to be left over from the BillSum tutorial.
    output_dir="my_awesome_billsum_model",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=3,
    per_device_eval_batch_size=3,
    logging_steps=100,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=20,
    # Evaluate with generation so `compute_metrics` receives generated token ids.
    predict_with_generate=True,
    # fp16=use_fp16,
    # bf16=True,
    generation_max_length=100,
    # push_to_hub=True,
    report_to="wandb",  # enable logging to W&B

)

# Trains on the full training split; evaluation uses only the first 100
# validation examples.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"], #.select(range(1000)), # out of 12k
    eval_dataset=tokenized_dataset["validation"].select(range(100)),
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Starts fine-tuning; `model` is updated in place and reused by the inference cells below.
trainer.train()

Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

Once training is completed, share your model to the Hub with the [push_to_hub()](https://huggingface.co/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.push_to_hub) method so everyone can use your model:

In [None]:
# NOTE(review): the `notebook_login()` cell near the top of this notebook is commented out;
# confirm Hub credentials are configured some other way before running this cell.
trainer.push_to_hub()

<Tip>

For a more in-depth example of how to finetune a model for summarization, take a look at the corresponding
[PyTorch notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization.ipynb)
or [TensorFlow notebook](https://colab.research.google.com/github/huggingface/notebooks/blob/main/examples/summarization-tf.ipynb).

</Tip>

## Inference after fine-tuning

In [None]:
# Re-select the first `n_samples` validation examples (same selection as before training).
n_samples = 10
validation_data = dataset_dict["validation"].select(range(n_samples))

In [None]:
# Dialogue of the first selected example, used as a single test input.
text = validation_data[0]["dialogue"]

In [None]:
from transformers import pipeline

import torch

# Rebuild the pipeline around the current `model`. Use the GPU when one is available
# and fall back to the CPU otherwise, so the cell does not fail without CUDA.
summarizer = pipeline(
    "summarization",
    model=model,
    tokenizer=tokenizer,
    device=("cuda" if torch.cuda.is_available() else "cpu"),
)
summarizer(text)

[{'summary_text': '#Человек2# жалуется, что у него проблемы с дыханием. #Person1# сообщает #Person1#, что у #Person1# нет аллергии.'}]

Log the results to W&B:

In [None]:
import wandb

In [None]:
# One row per validation example: source dialogue, reference summary, model output.
table = wandb.Table(columns=["Input Text", "Target Summary", "Generated Summary"])

for sample in validation_data:
    # Beam-search generation with identical settings for every example.
    outputs = summarizer(
        sample["dialogue"],
        max_length=150,
        min_length=50,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    table.add_data(sample["dialogue"], sample["summary"], outputs[0]["summary_text"])

# Log the filled table under the "after fine-tuning" key.
wandb.log({"summarization_after_fine_tuning": table})


Your max_length is set to 150, but your input_length is only 117. Since this is a summarization task, where outputs shorter than the input are typically wanted, you might consider decreasing max_length manually, e.g. summarizer('...', max_length=58)


In [None]:
# Close the active W&B run now that both tables have been logged.
wandb.finish()