In [1]:
import os
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments
from datasets import Dataset
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
MODEL_NAME = "google/flan-t5-xl"
SMALLE_MODEL_PATH = "google/flan-t5-base" # FOR FIRST TRAIN THIS MODEL BECAUSE WE DONT HAVE RESOURSES TO TUNING LARGER MODEL
DATA_PATH = "data/gec-only"
DEVICE = "cuda"
MAX_LENGTH = 128
torch.cuda.empty_cache()

In [3]:
def load_data(split, tokenized=False):
    subfolder = "source-sentences-tokenized" if tokenized else "source-sentences"
    source_path = os.path.normpath(os.path.join(DATA_PATH, split, subfolder))

    subfolder = "target-sentences-tokenized" if tokenized else "target-sentences"
    target_path = os.path.normpath(os.path.join(DATA_PATH, split, subfolder))

    if not os.path.exists(source_path) or not os.path.exists(target_path):
        raise FileNotFoundError(f"Path {source_path} or {target_path} not found. Check the structure of project and DATA_PATH.")

    source_files = sorted([os.path.join(source_path, f) for f in os.listdir(source_path) if f.endswith(".txt")])
    target_files = sorted([os.path.join(target_path, f) for f in os.listdir(target_path) if f.endswith(".txt")])

    data = []
    for src_file, tgt_file in zip(source_files, target_files):
        with open(src_file, "r", encoding="utf-8") as src, open(tgt_file, "r", encoding="utf-8") as tgt:
            source_sentences = [line.strip() for line in src.readlines()]
            target_sentences = [line.strip() for line in tgt.readlines()]
            data.extend(zip(source_sentences, target_sentences))

    return pd.DataFrame(data, columns=["source", "target"])


In [4]:
train_df = load_data("train", tokenized=False)
test_df = load_data("test", tokenized=False)
print(train_df)
print(test_df)

                                                  source  \
0      Byte for France або “Мій досвід ведення блогу ...   
1      Останні 3 місяці мого життя видалися аж занадт...   
2      Сьогодні розповім про те як і навіщо мене зане...   
3      Якщо цікаво подивитися відразу на результат, т...   
4                                  Моє бачення Instagram   
...                                                  ...   
17124  Якщо є можливість зробити неправильно, люди зр...   
17125  Особливо, якщо інструмент провокує таке робити...   
17126                   Яким боком московити  нам брати?   
17127       І навіть якщо брати, то Авель теж мав брата.   
17128  Якщо судити по автівці, то де Папа, а де якийс...   

                                                  target  
0      Byte for France або “Мій досвід ведення блогу ...  
1      Останні 3 місяці мого життя видалися аж занадт...  
2      Сьогодні розповім про те, як і навіщо мене зан...  
3      Якщо цікаво подивитися відразу на ре

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["source"], truncation=True, padding="max_length", max_length=512)

tokenizer = T5Tokenizer.from_pretrained(SMALLE_MODEL_PATH)

train_dataset = Dataset.from_pandas(train_df).map(
    lambda x: {
        "input_ids": tokenizer(x["source"], truncation=True, padding="max_length", max_length=MAX_LENGTH)["input_ids"],
        "labels": tokenizer(x["target"], truncation=True, padding="max_length", max_length=MAX_LENGTH)["input_ids"]
    }
)

valid_dataset = Dataset.from_pandas(test_df).map(
    lambda x: {
        "input_ids": tokenizer(x["source"], truncation=True, padding="max_length", max_length=MAX_LENGTH)["input_ids"],
        "labels": tokenizer(x["target"], truncation=True, padding="max_length", max_length=MAX_LENGTH)["input_ids"]
    }
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Map: 100%|██████████| 17129/17129 [00:04<00:00, 4056.31 examples/s]
Map: 100%|██████████| 1467/1467 [00:00<00:00, 3799.01 examples/s]


In [6]:
model = T5ForConditionalGeneration.from_pretrained(SMALLE_MODEL_PATH).to(DEVICE)

In [7]:
training_args = TrainingArguments(
    output_dir="data/results_of_training",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps = 8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=1,
    save_strategy="epoch",
    logging_dir="./logs",
    logging_steps=10,
    fp16=True,
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
)



In [8]:
trainer.train()

  0%|          | 0/6423 [00:00<?, ?it/s]Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  0%|          | 10/6423 [00:10<1:46:11,  1.01it/s]

{'loss': 13.8383, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.0}


  0%|          | 20/6423 [00:20<1:43:31,  1.03it/s]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.01}


  0%|          | 30/6423 [00:30<1:43:49,  1.03it/s]

{'loss': 30.3691, 'grad_norm': nan, 'learning_rate': 5e-05, 'epoch': 0.01}


KeyboardInterrupt: 