In [None]:
from os import environ

import mlflow
import numpy as np
import torch
from datasets import load_dataset
from transformers import DataCollatorWithPadding
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
from transformers import T5ForConditionalGeneration, T5Tokenizer

# Local directory handed to `from_pretrained(..., cache_dir=...)` below.
# Raw string: the Windows path contains backslashes that would otherwise be
# parsed as (invalid) escape sequences such as "\." and "\h".
TRANSFORMERS_CACHE = r"D:\.cache\huggingface"

In [None]:
# Checkpoint that both the tokenizer and the model weights are loaded from.
model_name = "cointegrated/rut5-base-multitask"

# Token limit configured on the tokenizer; 72 is noted as an alternative value
# (presumably one that was tried earlier — TODO confirm).
model_max_length = 64

# Model weights, cached in the local Hugging Face cache directory.
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    cache_dir=TRANSFORMERS_CACHE,
)
# Cap on the number of tokens produced per call to `generate`.
model.generation_config.max_new_tokens = 72

# Matching tokenizer, using the same cache directory.
tokenizer = T5Tokenizer.from_pretrained(
    model_name,
    model_max_length=model_max_length,
    cache_dir=TRANSFORMERS_CACHE,
)

In [None]:
# Task prefix prepended to every model input:
#   "reply"  - based on fiction
#   "answer" - based on online forums
task = "answer"

def preprocess_function(examples):
    """Tokenize a batch of question/answer pairs for seq2seq fine-tuning.

    Args:
        examples: Batch mapping with parallel lists under "q" (source texts)
            and "a" (target texts), as supplied by `Dataset.map(batched=True)`.

    Returns:
        The tokenizer output for the task-prefixed questions, with an extra
        "labels" entry holding the token ids of the answers.
    """
    inputs = [f"{task} | " + item for item in examples["q"]]

    # Inputs are left unpadded here: the data collator pads input_ids and
    # attention_mask to the longest sequence of each training batch, which
    # avoids padding to the longest item of every map chunk.
    model_inputs = tokenizer(
        inputs, max_length=model_max_length, truncation=True
    )

    # Labels are padded to a fixed length. `padding=True` would pad only to the
    # longest item of the current map chunk, so examples coming from different
    # chunks could carry labels of different lengths; DataCollatorWithPadding
    # does not pad "labels", and such a batch cannot be stacked into a tensor.
    labels = tokenizer(
        text_target=examples["a"],
        max_length=model_max_length,
        padding="max_length",
        truncation=True,
    )

    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Load the parallel q/a corpus and hold out 10% of it for evaluation.
# The seed pins the shuffle so the train/test split is the same on every
# kernel restart; without it each run evaluates on a different subset and
# metrics are not comparable between runs.
dataset = (
    load_dataset("json", data_files="../datasets/parallel.json")["train"]
    .train_test_split(test_size=0.1, seed=42)
)

# Replace the raw text columns with token ids for both splits.
tokenized_dataset = dataset.map(
    preprocess_function, batched=True, remove_columns=["q", "a"]
)

# NOTE(review): this collator pads the tokenizer's own input fields only, so
# the "labels" of all examples in a batch must already have the same length.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [None]:
import evaluate

bleu = evaluate.load("bleu")


def compute_metrics(eval_preds):
    """Compute BLEU between generated and reference texts.

    Args:
        eval_preds: Pair `(predictions, labels)` of token-id arrays;
            `predictions` may itself be a tuple, in which case its first
            element is used.

    Returns:
        The dict produced by the BLEU metric, without its "precisions" entry.
    """
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]

    # -100 is the ignore-index used for padding label/prediction arrays; it is
    # not a valid vocabulary id and cannot be decoded, so map it to the pad
    # token, which `skip_special_tokens=True` then drops.
    pad_id = tokenizer.pad_token_id
    preds = np.asarray(preds)
    labels = np.asarray(labels)
    preds = np.where(preds != -100, preds, pad_id)
    labels = np.where(labels != -100, labels, pad_id)

    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    result = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    # Drop the "precisions" entry from the reported metrics (presumably because
    # it is a list rather than a scalar — TODO confirm).
    result.pop("precisions", None)
    return result

In [None]:
# Environment settings for MLflow logging during training.
environ["MLFLOW_EXPERIMENT_NAME"] = "rut5-filtered-data"
# The variable name must match exactly: a key with a stray trailing space is a
# different environment variable and the setting would be silently ignored.
environ["MLFLOW_FLATTEN_PARAMS"] = "True"
# environ["HF_MLFLOW_LOG_ARTIFACTS"] = "True"

In [None]:
# Release cached GPU memory left over from earlier cells before training.
torch.cuda.empty_cache()

# Log every 250 steps; evaluate and checkpoint every 500 steps.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_args = Seq2SeqTrainingArguments(
    output_dir="./results",
    logging_strategy="steps",
    logging_steps=250,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=32,
    predict_with_generate=True,  # evaluate on generated text so BLEU can be computed
    warmup_ratio=0.1,
    learning_rate=5e-4,
    num_train_epochs=3,
    fp16=False,  # model pretrained on bf16 and incompatible with fp16
)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Always close the MLflow run, including when training fails or the cell is
# interrupted; otherwise the run stays active in the kernel and the next
# training attempt continues logging into it.
try:
    trainer.train()
except BaseException:
    mlflow.end_run(status="FAILED")
    raise
else:
    mlflow.end_run()

In [None]:
# Final evaluation pass on the trainer's eval_dataset (the held-out "test"
# split); the returned value is shown as the cell output.
trainer.evaluate()

In [None]:
def generate(text, **kwargs):
    """Generate responses to `text` with the fine-tuned model.

    Args:
        text: Input utterance; the current task prefix is prepended to it.
        **kwargs: Extra options forwarded to `model.generate`. `num_beams`
            defaults to 10 and may be overridden by the caller.

    Returns:
        List of decoded hypotheses with special tokens removed.
    """
    # Default rather than a fixed keyword, so that passing num_beams explicitly
    # does not raise "got multiple values for keyword argument".
    kwargs.setdefault("num_beams", 10)

    # Put the inputs on whatever device the model lives on instead of assuming
    # a GPU is present.
    inputs = tokenizer(f"{task} | {text}", return_tensors="pt").to(model.device)

    # Inference mode: disables dropout in case the model is still in train mode.
    model.eval()
    with torch.no_grad():
        hypotheses = model.generate(**inputs, **kwargs)
    print(len(hypotheses))
    return tokenizer.batch_decode(hypotheses, skip_special_tokens=True)


In [None]:
# Sample three candidate responses for a single prompt.
sampling_options = {
    "do_sample": True,
    "top_p": 0.5,
    "num_return_sequences": 3,
    "repetition_penalty": 2.5,
}
generate("аниме это не круто", **sampling_options)