In [106]:
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
    
)
from tqdm import tqdm
from text_generation_metrics import TextGenerationMetrics
from IPython.display import display
%load_ext autoreload
%autoreload 2

os.environ["WANDB_DISABLED"] = "true"


# for google colab
# from google.colab import drive
# drive.mount('/content/drive')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Fine-tuning the model for our task.

In [None]:
# Setting up the model and tokenizer: both are loaded from the same
# checkpoint name so their vocabularies match.
model_name = "ai-forever/ruT5-base"
model = T5ForConditionalGeneration.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Batch collator for the seq2seq trainer; it receives the model as well as
# the tokenizer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)


# Tokenization function
def tokenize_function(examples):
    """Tokenize source and target texts for seq2seq fine-tuning.

    Args:
        examples: Mapping with "input_text" and "target_text" entries. Each
            entry is either a single string or a list of strings (the latter
            when called through ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for the inputs (truncated/padded to 200 tokens)
        with an added "labels" entry holding the target token ids
        (truncated/padded to 60 tokens). Padding positions in the labels are
        replaced by -100 so that the loss ignores them.
    """
    inputs = tokenizer(
        examples["input_text"], max_length=200, truncation=True, padding="max_length")
    targets = tokenizer(
        examples["target_text"], max_length=60, truncation=True, padding="max_length")

    pad_id = tokenizer.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index ignored by the cross-entropy loss; without this the
        # model is also trained to predict the padding that fills each label
        # up to max_length.
        return [-100 if token == pad_id else token for token in token_ids]

    target_ids = targets["input_ids"]
    if len(target_ids) > 0 and isinstance(target_ids[0], (list, tuple)):
        # Batched call: one list of ids per example.
        inputs["labels"] = [_mask_padding(ids) for ids in target_ids]
    else:
        # Single example: a flat list of ids.
        inputs["labels"] = _mask_padding(target_ids)
    return inputs

# Function for preparing a tokenized dataset


def prepare_tokenized_dataset(df, tokenize_func):
    """Turn a pandas DataFrame into a tokenized ``datasets.Dataset``.

    Args:
        df: DataFrame to convert with ``Dataset.from_pandas``.
        tokenize_func: Callable applied to the dataset in batched mode.

    Returns:
        The result of mapping ``tokenize_func`` over the converted dataset.
    """
    return Dataset.from_pandas(df).map(tokenize_func, batched=True)

In [None]:
# Reading and processing the dataset

path_to_dataset = "path_to_datset"

# Load the CSV, keep the three columns used below, drop rows with missing
# values and rename the text columns to the names the tokenizer step expects.
dataset = (
    pd.read_csv(path_to_dataset, sep=",")[["problems", "posts", "id"]]
    .dropna()
    .rename(columns={'posts': 'input_text', 'problems': 'target_text'})
)

# Hold out 30% of the rows, then split that hold-out in half into
# validation and test sets (fixed random_state for repeatable splits).
train_df, temp_df = train_test_split(
    dataset[["input_text", "target_text", "id"]],
    test_size=0.3,
    random_state=42,
)
valid_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    random_state=42,
)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(valid_df)}")
print(f"Test set size: {len(test_df)}")


# Tokenization of datasets (train, then validation, then test)
train_tokenized, valid_tokenized, test_tokenized = (
    prepare_tokenized_dataset(split_df, tokenize_function)
    for split_df in (train_df, valid_df, test_df)
)

In [None]:
# Training configuration for the fine-tuning run.
training_args = Seq2SeqTrainingArguments(
    # Directory passed as the trainer's output location.
    output_dir="path_to_result",
    # Evaluation strategy is set to "epoch".
    eval_strategy="epoch",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    weight_decay=0.01,
    num_train_epochs=6,
    # fp16 mixed precision is switched off.
    fp16=False,
    # Logs go to TensorBoard (W&B is disabled through WANDB_DISABLED in the
    # first cell), with logging_steps set to 10.
    report_to="tensorboard",
    logging_steps=10,
    # save_total_limit caps the number of checkpoints kept on disk.
    save_total_limit=1
)


# The trainer fits on the tokenized training split and evaluates on the
# tokenized validation split; the test split is not used here.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
    tokenizer=tokenizer,
    data_collator=data_collator
)

# Start fine-tuning.
trainer.train()

## Next, we will try applying the fine-tuned model to arbitrary text to see how it works.

In [None]:
# Load model and tokenizer from a saved checkpoint path. This rebinds the
# global `model`, `tokenizer` and `data_collator` names used by the cells below.
model_path = "/path_to_pre_train_model"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

In [None]:
# Sample text used as a quick manual check of the loaded model.
first_example = "Температура, и совсем нет настроения"

# Encode the text as PyTorch tensors and move the token ids to the device
# the model lives on.
encoded_example = tokenizer(first_example, return_tensors="pt")
input_ids = encoded_example.input_ids.to(model.device)

# Gradients are not needed for inference; generation uses max_length=60.
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=60)

# Decode the first generated sequence, dropping special tokens.
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated text:", generated_text)

## Let's evaluate the performance of the fine-tuned model.

In [None]:
path_to_dataset = "/path_to_dataset_for_training.csv"

# Load the CSV, keep the three columns used below, drop rows with missing
# values and rename the text columns to input/target names.
dataset = (
    pd.read_csv(path_to_dataset, sep=",")[["problems", "posts", "id"]]
    .dropna()
    .rename(columns={'posts': 'input_text', 'problems': 'target_text'})
)

# Same proportions and random_state as the split used for training, so the
# evaluation splits are reproduced rather than re-drawn.
train_df, temp_df = train_test_split(
    dataset[["input_text", "target_text", "id"]], test_size=0.3, random_state=42)

valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

#### To calculate the metrics, we generate responses with the fine-tuned model for the training, validation, and test sets.

In [None]:
def generate_response(text, max_length=60):
    """Generate a response for a single input text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input text passed to the tokenizer.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The first generated sequence decoded to a string with special
        tokens removed.
    """
    encoded = tokenizer(text, return_tensors="pt")
    prompt_ids = encoded.input_ids.to(model.device)
    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        generated = model.generate(prompt_ids, max_length=max_length)
    return tokenizer.decode(generated[0], skip_special_tokens=True)


def generate_responses_df(df, source_col, target_col="T5_pre_trained_gen_text", max_length=60):
    """Generate a response for every row of ``df``, one text at a time.

    Args:
        df: DataFrame holding the input texts; it is modified in place.
        source_col: Name of the column with the input texts.
        target_col: Name of the column that receives the generated texts.
        max_length: Forwarded to ``generate_response``.

    Returns:
        The same ``df`` object with ``target_col`` assigned.
    """
    def _respond(text):
        return generate_response(text, max_length=max_length)

    # Registers `progress_apply` on pandas objects so the loop shows a bar.
    tqdm.pandas(desc="Generating responses")
    responses = df[source_col].progress_apply(_respond)
    df[target_col] = responses
    return df


def generate_responses_batch(df, source_col, target_col="T5_pre_trained_gen_text", batch_size=16, max_length=60):
    """Generate responses for every row of ``df`` in batches.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        df: DataFrame holding the input texts; it is modified in place.
        source_col: Name of the column with the input texts.
        target_col: Name of the column that receives the generated texts.
        batch_size: Number of rows tokenized and generated together.
        max_length: Value forwarded to ``model.generate`` as ``max_length``.

    Returns:
        The same ``df`` object with ``target_col`` assigned.
    """
    generated_responses = []
    for start in tqdm(range(0, len(df), batch_size), desc="Generating responses", unit="batch"):
        batch_texts = df[source_col].iloc[start:start + batch_size].tolist()
        encoded = tokenizer(batch_texts, return_tensors="pt",
                            padding=True, truncation=True)
        input_ids = encoded["input_ids"].to(model.device)
        # Texts in a batch are padded to a common length. The attention mask
        # tells the model which positions are padding; without it the padded
        # positions are treated as real input and shorter texts in the batch
        # can get different generations than when processed alone.
        attention_mask = encoded["attention_mask"].to(model.device)
        with torch.no_grad():
            outputs = model.generate(
                input_ids=input_ids,
                attention_mask=attention_mask,
                max_length=max_length,
            )
        generated_responses.extend(
            tokenizer.decode(output, skip_special_tokens=True) for output in outputs)

    df[target_col] = generated_responses
    return df

In [24]:
# Generate responses for each split with the row-by-row helper. The default
# target column "T5_pre_trained_gen_text" is added to every frame and is the
# column read by the metrics cell further down.
test_df = generate_responses_df(df=test_df, source_col="input_text")
print("The end of test_df generation")
valid_df = generate_responses_df(df=valid_df, source_col="input_text")
print("The end of valid_df generation")
# The training split is processed last; no completion message is printed for it.
train_df = generate_responses_df(df=train_df, source_col="input_text")

Generating responses: 100%|██████████| 1657/1657 [17:44<00:00,  1.56it/s]   


The end of test_df generation


Generating responses: 100%|██████████| 1656/1656 [11:52<00:00,  2.33it/s]


The end of valid_df generation


Generating responses: 100%|██████████| 7728/7728 [52:29<00:00,  2.45it/s]  


Here is a comparison of metrics for a model fine-tuned on our specific task, evaluated on training, validation, and test datasets. Additionally, we include metrics for the model before fine-tuning (non_finetuned) to highlight the improvement achieved through fine-tuning.

To calculate the metrics, we use the __calculate_all_metrics__ method of the __TextGenerationMetrics__ class (instantiated below as `metrics_calculator`). The method returns the average values of all metrics across the entire DataFrame.

The function requires as input a __DataFrame__ that contains the target and generated text. Additionally, you need to specify the parameters:

- __target_column__ — the name of the column containing the target text.
- __generated_column__ — the name of the column containing the generated text in the DataFrame.

#### Overall Performance
The fine-tuned model demonstrates:

- Good accuracy: Reflected in __BLEU__, __ROUGE__, and __METEOR__ scores.
- Strong generalization: Consistent performance across training, validation, and test datasets.
- Fluency and diversity: Low __perplexity__ and repetition rate highlight its ability to generate natural and coherent outputs.

In [None]:
metrics_calculator = TextGenerationMetrics()

# Generations of the model before fine-tuning, read from a CSV file; this
# frame stores them in the 'generated_text' column.
path_to_T5_1_response = pd.read_csv(
    "/path_to_non_finetuned_T5_model_result", sep=",")
non_finetuned = metrics_calculator.calculate_all_metrics(
    path_to_T5_1_response,
    target_column='target_text',
    generated_column='generated_text',
)

# Score the fine-tuned generations on train, validation and test, in that order.
train_df_metrics, valid_df_metrics, test_df_metrics = (
    metrics_calculator.calculate_all_metrics(
        split_df,
        target_column='target_text',
        generated_column='T5_pre_trained_gen_text',
    )
    for split_df in (train_df, valid_df, test_df)
)

# One row per evaluated dataset, with the index labelled 'Dataset'.
summary_table = pd.DataFrame(
    [train_df_metrics, valid_df_metrics, test_df_metrics, non_finetuned],
    index=["train", "valid", "test", "non_finetuned"],
).rename_axis('Dataset')

display(summary_table)

Unnamed: 0_level_0,BLEU Score (average),Precision (average),Recall (average),F1 Score (average),ROUGE-2 (average),ROUGE-L (average),METEOR (average),Perplexity (average),Repetition Rate (average)
Dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
train,0.348871,0.590895,0.593959,0.573655,0.456144,0.568462,0.548376,8.933747,0.032956
valid,0.302542,0.554859,0.565603,0.540236,0.41315,0.535876,0.519141,8.971618,0.031747
test,0.318592,0.563575,0.562599,0.544328,0.426454,0.537988,0.522028,9.041642,0.036448
non_finetuned,0.003709,0.039536,0.039583,0.030309,0.000575,0.030556,0.018473,10.66896,0.000908
