In [53]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [54]:
import os

os.chdir("../scripts")

from data_processing import poquad, processing
from t5.load_t5 import *

In [55]:
train_df, valid_df = poquad.load_poquad_manually_downloaded("../data/poquad-manually-processed/")

In [56]:
train_input = poquad.dataset_into_str_input(train_df)

In [57]:
valid_input = poquad.dataset_into_str_input(valid_df)

In [58]:
tokenizer, model = load_plt5("../models/plt5-original-small")

In [59]:
train_input["input_text"].sample(100)

24609    kontekst: Bogurodzica (pieśń)  Bogurodzica w p...
19841    kontekst: Płód arlekin  Rybia łuska arlekinowa...
1227     kontekst: Daniel Barenboim  W latach 1967–1987...
33835    kontekst: Admirał Fłota Sowietskogo Sojuza Kuz...
42819    kontekst: Błędy w koszykówce  Faul (ang. foul;...
                               ...                        
42843    kontekst: Pancerniki typu Tosa  Stępkę pancern...
14143    kontekst: Kirił Petkow (skoczek narciarski)  W...
9572     kontekst: Litwa na Zimowych Igrzyskach Olimpij...
29981    kontekst: Dziewanna drobnokwiatowa  Dziewanna ...
17324    kontekst: Róża (film 2011)  Tadeusz zostaje sp...
Name: input_text, Length: 100, dtype: object

In [60]:
sample = train_input["target_text"].sample(100)

max_size = 0
for i in range(len(sample)):
    size = tokenizer(sample.iloc[i], return_tensors="pt").input_ids.shape[1]
    if size > max_size:
        max_size = size
        print(max_size)

4
5
10
24
69


In [61]:
import torch
from typing import List, Dict, Any
from dataclasses import dataclass
from transformers import Trainer, TrainingArguments, DataCollatorWithPadding, Seq2SeqTrainer, Seq2SeqTrainingArguments

In [62]:
@dataclass
class T2TDataCollator(DataCollatorWithPadding):
    def collate_batch(self, batch: List) -> Dict[str, torch.Tensor]:
        """
        Take a list of samples from a Dataset and collate them into a batch.
        Returns:
            A dictionary of tensors
        """
        input_ids = torch.stack([example['input_ids'] for example in batch])
        lm_labels = torch.stack([example['target_ids'] for example in batch])
        lm_labels[lm_labels[:, :] == 0] = -100
        attention_mask = torch.stack([example['attention_mask'] for example in batch])
        decoder_attention_mask = torch.stack([example['target_attention_mask'] for example in batch])


        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'target_ids': lm_labels,
            'target_attention_mask': decoder_attention_mask
        }


In [63]:
class T2TDataCollator:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def __call__(self, batch):
        # Assuming 'batch' is a list of examples, where each example is a dictionary
        # that includes 'input_ids' and 'labels' (target sequence) among other possible keys.
        input_ids = [example["input_ids"] for example in batch]
        labels = [example["labels"] for example in batch]
        
        # Tokenizer's pad method can handle padding of both input_ids and decoder_input_ids
        batch = self.tokenizer.pad(
            {"input_ids": input_ids},
            return_tensors="pt"
        )
        
        # Ensure labels are also included and properly padded
        batch["labels"] = self.tokenizer.pad(
            {"input_ids": labels},
            return_tensors="pt"
        )["input_ids"]
        
        return batch

In [64]:
train_dataset = processing.TextDataset(train_input, tokenizer, 1024, 128)
valid_dataset = processing.TextDataset(valid_input, tokenizer, 1024, 128)

# Create DataCollator
data_collator = T2TDataCollator(tokenizer)

In [65]:
input_text = list(train_dataset.input_text.loc[[0]])

In [66]:
tokenizer.batch_decode(tokenizer.batch_encode_plus(
            input_text,
            add_special_tokens=True,
            max_length=1024,
            padding='max_length',
            return_attention_mask=True,
            truncation=True
        )["input_ids"], skip_special_tokens=True)

['kontekst: Konfederacja polsko-czechosłowacka Projekty konfederacji zaczęły się załamywać 5 sierpnia 1942. Ponownie wróciła kwestia monachijska, co uaktywniło się wymianą listów Ripka – Stroński. Natomiast 17 sierpnia 1942 doszło do spotkania E. Beneša i J. Masaryka z jednej a Wł. Sikorskiego i E. Raczyńskiego z drugiej strony. Polscy dyplomaci zaproponowali podpisanie układu konfederacyjnego. W następnym miesiącu, tj. 24 września, strona polska przesłała na ręce J. Masaryka projekt deklaracji o przyszłej konfederacji obu państw. Strona czechosłowacka projekt przyjęła, lecz już w listopadzie 1942 E. Beneš podważył ideę konfederacji. W zamian zaproponowano zawarcie układu sojuszniczego z Polską na 20 lat (formalnie nastąpiło to 20 listopada 1942). pytanie: Co było powodem powrócenia konceptu porozumieniu monachijskiego?']

In [67]:
import wandb

In [68]:
wandb.init(
    # set the wandb project where this run will be logged
    project="PLT5 Small Finetuning Poquad",
    # track hyperparameters and run metadata
    # config={
    # "architecture": "PLT5 Small",
    # "dataset": "Poquad",
    # }
)

VBox(children=(Label(value='0.003 MB of 0.003 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▃▆█
train/global_step,▁▃▆█
train/learning_rate,▁▁▁
train/loss,▁▁▁

0,1
total_flos,1.976354068443955e+16
train/epoch,0.99974
train/global_step,1924.0
train/grad_norm,
train/learning_rate,0.0
train/loss,0.0
train_loss,0.0
train_runtime,2352.903
train_samples_per_second,19.63
train_steps_per_second,0.818


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112396999993102, max=1.0…

In [69]:
from transformers import TrainerCallback

class PrintMemoryUsageCallback(TrainerCallback):
    """ Callback that prints memory allocation during training """
    def on_step_end(self, args, state, control, **kwargs):
        print(f"Step {state.global_step}: {torch.cuda.memory_allocated() / 1024 ** 2:.2f} MB allocated")

In [70]:
train_dataset = processing.TextDataset(train_input.iloc[:1000], tokenizer, 1024, 128)
valid_dataset = processing.TextDataset(valid_input.iloc[:1000], tokenizer, 1024, 128)

# Create DataCollator
data_collator = T2TDataCollator(tokenizer)

# Initialize model

# Define TrainingArguments
training_args = Seq2SeqTrainingArguments(
    learning_rate=3e-4,
    output_dir='./results',
    num_train_epochs=1,
    per_device_train_batch_size=6,
    per_device_eval_batch_size=6,
    save_total_limit=1,
    warmup_steps=500,
    weight_decay=0.01,
    fp16=True,
    gradient_accumulation_steps=4,
    logging_dir='./logs',
    overwrite_output_dir=True,
)

# Initialize Trainer
trainer = Seq2SeqTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        data_collator=data_collator,
        callbacks=[PrintMemoryUsageCallback()],
        )

# Train the model
trainer.train()

# Save the model
trainer.save_model('../models/plt5-small-1epoch')

print("Training complete and model saved to ./trained_model")

  0%|          | 0/41 [00:00<?, ?it/s]

Step 1: 380.25 MB allocated
Step 2: 380.25 MB allocated
Step 3: 380.25 MB allocated
Step 4: 380.25 MB allocated
Step 5: 380.25 MB allocated
Step 6: 380.25 MB allocated
Step 7: 380.25 MB allocated
Step 8: 380.25 MB allocated
Step 9: 380.25 MB allocated
Step 10: 380.25 MB allocated
Step 11: 380.25 MB allocated
Step 12: 380.25 MB allocated
Step 13: 380.25 MB allocated
Step 14: 380.25 MB allocated
Step 15: 380.25 MB allocated
Step 16: 380.25 MB allocated
Step 17: 380.25 MB allocated
Step 18: 380.25 MB allocated
Step 19: 380.25 MB allocated
Step 20: 380.25 MB allocated
Step 21: 380.25 MB allocated
Step 22: 380.25 MB allocated
Step 23: 380.25 MB allocated
Step 24: 380.25 MB allocated
Step 25: 380.25 MB allocated
Step 26: 380.25 MB allocated
Step 27: 380.25 MB allocated
Step 28: 380.25 MB allocated
Step 29: 380.25 MB allocated
Step 30: 380.25 MB allocated
Step 31: 380.25 MB allocated
Step 32: 380.25 MB allocated
Step 33: 380.25 MB allocated
Step 34: 380.25 MB allocated
Step 35: 380.25 MB allo