In [2]:
import pandas as pd
import os
import torch
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    T5ForConditionalGeneration,
    DataCollatorForSeq2Seq,
)

# Set before any Trainer is created further down; presumably this makes the
# Hugging Face integration skip Weights & Biases logging — confirm against the
# installed transformers version.
os.environ["WANDB_DISABLED"] = "true"

# for google colab
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# Load the pretrained checkpoint: tokenizer first, then the seq2seq model,
# and finally the collator that will batch examples for that model.
model_name = "ai-forever/ruT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)


# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of source/target texts for seq2seq training.

    Args:
        examples: Mapping with "input_text" and "target_text" entries, each a
            list of strings (the batched form that `Dataset.map` passes in).

    Returns:
        The tokenizer output for the inputs (padded/truncated to 200 tokens)
        with an extra "labels" entry: the target token ids padded/truncated to
        60 tokens, where every padding position is replaced by -100.
    """
    model_inputs = tokenizer(
        examples["input_text"], max_length=200, truncation=True, padding="max_length")
    targets = tokenizer(
        examples["target_text"], max_length=60, truncation=True, padding="max_length")

    # Padding positions must not contribute to the loss: -100 is the index the
    # cross-entropy loss ignores, whereas a raw pad id would be scored as a
    # real target token.
    pad_id = tokenizer.pad_token_id
    model_inputs["labels"] = [
        [token_id if token_id != pad_id else -100 for token_id in label_ids]
        for label_ids in targets["input_ids"]
    ]
    return model_inputs

# Function for preparing a tokenized dataset


def prepare_tokenized_dataset(df, tokenize_func):
    """Convert a pandas DataFrame to a `Dataset` and tokenize it in batches.

    Args:
        df: DataFrame to wrap with `Dataset.from_pandas`.
        tokenize_func: Callable handed to `Dataset.map` with `batched=True`.

    Returns:
        Whatever `Dataset.map` returns for the given tokenization callable.
    """
    return Dataset.from_pandas(df).map(tokenize_func, batched=True)

In [None]:
# Reading and processing the dataset

# Placeholder value — point this at the actual CSV file before running.
path_to_dataset = "path_to_datset"

# Keep only the needed columns, drop incomplete rows, and rename the text
# columns to the names the tokenization function reads.
dataset = (
    pd.read_csv(path_to_dataset, sep=",")[["problems", "posts", "id"]]
    .dropna()
    .rename(columns={"posts": "input_text", "problems": "target_text"})
)

# 70 % goes to training; the remaining 30 % is halved into validation and test.
split_columns = ["input_text", "target_text", "id"]
train_df, temp_df = train_test_split(
    dataset[split_columns], test_size=0.3, random_state=42
)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

print(f"Training set size: {len(train_df)}")
print(f"Validation set size: {len(valid_df)}")
print(f"Test set size: {len(test_df)}")


# Tokenization of datasets (same order as before: train, validation, test)
train_tokenized, valid_tokenized, test_tokenized = (
    prepare_tokenized_dataset(split_df, tokenize_function)
    for split_df in (train_df, valid_df, test_df)
)

In [None]:
# Training configuration, grouped by concern.
training_args = Seq2SeqTrainingArguments(
    output_dir="path_to_result",
    # optimisation
    num_train_epochs=6,
    weight_decay=0.01,
    fp16=False,
    # batching
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    # evaluation, logging and checkpoint retention
    eval_strategy="epoch",
    logging_steps=10,
    report_to="tensorboard",
    save_total_limit=1,
)

# The trainer fine-tunes on the training split and evaluates on the
# validation split; the collator builds the batches.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_tokenized,
    eval_dataset=valid_tokenized,
)

trainer.train()

### Next, we will try applying the fine-tuned model to arbitrary text to see how it works.

In [None]:
# Reload the model, its tokenizer and a matching collator from `model_path`.
model_path = "path_to_model"
tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = T5ForConditionalGeneration.from_pretrained(model_path)
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model)

In [None]:
first_example = "Температура, и совсем нет настроени]"

# Encode the example as PyTorch tensors and move the ids to the model's device.
encoded = tokenizer(first_example, return_tensors="pt")
input_ids = encoded["input_ids"].to(model.device)

# Generation needs no gradients; the output length is capped at 60.
with torch.no_grad():
    outputs = model.generate(input_ids, max_length=60)

# Decode the first generated sequence, leaving out special tokens
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Generated text:", generated_text)