In [3]:
%%capture
!pip install transformers
!pip install sentencepiece

In [1]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [2]:
# file dependency
from utils import *

### load model

In [49]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Name of the pretrained checkpoint that is fine-tuned below.
checkpoint = 't5-base'

In [50]:
# The tokenizer is created first because get_model takes it as an argument.
processer = get_tokenizer(checkpoint)

# Instantiate the model from the checkpoint, then place it on the chosen device.
model = get_model(checkpoint, device, processer)
model = model.to(device)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Data processing pipeline for fine-tuning
1. Check whether the cached feature file exists.
2. If it exists: load the train/test datasets from it with `FeatureData`.
3. If it does not exist: load the raw dataset with `CustomData`, call `prepare_features` to build the feature file, then do step 2.  

In [27]:
# Folder that holds the raw question/answer dataset.
data_path = 'Question_Answer_Dataset_v1.2'
# Feature cache file, kept inside the dataset folder.
feature_cache_path = f'{data_path}/features'

In [28]:
# Build the feature cache if it does not exist yet.
#
# The input length is capped at 512 tokens. With max_len_inp=100000 every
# sample becomes a 100000-token tensor, and one batch of 128 such samples needs
# a single 128 * 100000 * 768 * 4 byte (= 36.62 GiB) activation, which is the
# allocation that fails with CUDA out-of-memory during training.
# NOTE: a cache file written with the old length is still picked up by the
# isfile check below; delete it once so the features are regenerated.
max_len_inp = 512
max_len_out = 96

# isfile() is already False for a missing path, so no separate exists() check is needed.
if not os.path.isfile(feature_cache_path):
    raw_dataset = CustomData(data_path)
    prepare_features(raw_dataset, feature_cache_path, processer,
                     max_len_inp=max_len_inp, max_len_out=max_len_out)
else:
    print("features exists")

features exists


In [29]:
# Number of samples reserved for the test split.
test_points = 425

# Both splits are read from the same feature cache; the split name selects the part.
train_dataset, test_dataset = (
    FeatureData(feature_cache_path, 'train', test_points),
    FeatureData(feature_cache_path, 'test', test_points),
)

length of feature train set:  2259
length of feature test set:  425


In [30]:
# Peek at the first training sample and report the size of its two id tensors.
input_dict = train_dataset[0]
for label, key in (("input ids shape: ", 'input_ids'),
                   ("question ids shape: ", 'target_ids')):
    print(label, input_dict[key].size())

input ids shape:  torch.Size([100000])
question ids shape:  torch.Size([96])


In [31]:
# Wrap the train and test feature datasets in batched loaders.
# NOTE(review): batch_size=128 is large for a single ~7 GiB GPU -- confirm it fits.
dataloader_train, dataloader_test = get_dataloaders(
    train_dataset,
    test_dataset,
    batch_size=128,
)

Loaded train feature data with 18 batches
Loaded test feature data with 4 batches


### Training Pipeline

In [51]:
def get_optimizer(model, opt_name, lr, eps, weight_decay=None):
    """Build the optimizer used to fine-tune ``model``.

    Args:
        model: module whose ``parameters()`` are optimized.
        opt_name: ``'Adam'`` (builds ``AdamW``) or ``'SGD'``.
        lr: learning rate.
        eps: numerical-stability epsilon; only used by ``'Adam'``.
        weight_decay: optional weight-decay coefficient. When ``None`` the
            optimizer's own default is kept.

    Returns:
        The constructed ``torch.optim`` optimizer.

    Raises:
        ValueError: if ``opt_name`` is not one of the supported names.
    """
    extra = {} if weight_decay is None else {"weight_decay": weight_decay}
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps, **extra)
    if opt_name == 'SGD':
        # SGD has no epsilon term; passing eps= raises a TypeError.
        return SGD(model.parameters(), lr=lr, **extra)
    # Fail loudly instead of returning None and crashing later in the scheduler.
    raise ValueError(f"Unsupported optimizer {opt_name!r}; expected 'Adam' or 'SGD'")

In [52]:
def get_scheduler(model, n_batches, n_epochs, warmup_portion=0.1):
    """Create a linear learning-rate schedule with warmup for ``model.optimizer``.

    Args:
        model: object carrying the optimizer to schedule in ``model.optimizer``.
        n_batches: number of optimizer steps per epoch.
        n_epochs: number of training epochs.
        warmup_portion: fraction of all training steps spent warming up.

    Returns:
        The scheduler returned by ``get_linear_schedule_with_warmup``.
    """
    total_steps = n_epochs * n_batches
    return get_linear_schedule_with_warmup(
        model.optimizer,
        num_warmup_steps=int(total_steps * warmup_portion),
        num_training_steps=total_steps,
    )

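#### Configure hyperparameters
- `n_epochs`: number of passes over the training set
- `lr`: peak learning rate
- `weight_decay`: weight-decay coefficient for the optimizer
- `optimizer`: `"Adam"` (AdamW) or `"SGD"`, built by `get_optimizer`
- `warmup_steps`: derived in `get_scheduler` as `warmup_portion` of the total training steps
- `lr_scheduler`: linear schedule with warmup, built by `get_scheduler`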

In [53]:
# hyperparameters
n_epochs = 10
lr = 1e-5
weight_decay = 5e-5
adam_eps = 1e-8  # numerical-stability epsilon for AdamW (PyTorch's default value)

# model optimizer
# The 4th argument of get_optimizer is `eps`, so weight_decay must not be passed
# there. Set it on the optimizer's parameter groups instead.
model.optimizer = get_optimizer(model, "Adam", lr, adam_eps)
for param_group in model.optimizer.param_groups:
    param_group["weight_decay"] = weight_decay

# learning rate scheduler (one scheduler step per training batch)
model.scheduler = get_scheduler(model, len(dataloader_train), n_epochs)

# log file
output_dir = "logs"
os.makedirs(output_dir, exist_ok=True)  # no-op when the directory already exists
log_file = "trial.log"

In [54]:
def log(output_dir, log_str, file_name=None):
    """Append one line of text to a log file inside ``output_dir``.

    Args:
        output_dir: directory that holds the log file; created if missing.
            An empty string means the current working directory.
        log_str: text to append; a trailing newline is added.
        file_name: name of the log file. Defaults to ``"all.log"``.
    """
    if file_name is None:
        file_name = "all.log"
    # Create the directory on demand so logging does not depend on an earlier cell.
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'a') as f:
        f.write(log_str + '\n')

In [61]:
# training loop
def train(model, dataloader_train, n_epochs, output_dir, log_file):
    """Fine-tune ``model`` for ``n_epochs`` passes over ``dataloader_train``.

    The model must carry its optimizer in ``model.optimizer``. If
    ``model.scheduler`` is set, it is stepped once per batch. After every
    epoch the mean batch loss is printed and appended to the log file.

    Args:
        model: model to train; called with input/target ids, masks and labels.
        dataloader_train: iterable of batches (dicts with the keys
            ``input_ids``, ``input_mask``, ``target_ids``, ``target_mask``
            and ``labels``).
        n_epochs: number of passes over ``dataloader_train``.
        output_dir: directory of the log file.
        log_file: name of the log file inside ``output_dir``.

    Returns:
        list[float]: the mean batch loss of each epoch.
    """
    # Take the device from the model itself rather than from a notebook global.
    model_device = next(model.parameters()).device
    n_batches = len(dataloader_train)
    scheduler = getattr(model, "scheduler", None)
    epoch_losses = []

    # test() leaves the model in eval mode; switch back to training mode.
    model.train()

    for e in range(n_epochs):

        epoch_loss = 0.0
        for batch in tqdm(dataloader_train, total=n_batches):

            # NOTE(review): T5 derives the decoder inputs from `labels` when
            # `decoder_input_ids` is omitted. If `target_ids` are not already
            # shifted, the decoder sees the tokens it has to predict -- confirm
            # against prepare_features in utils.
            outputs = model(
                input_ids=batch['input_ids'].to(model_device),
                attention_mask=batch['input_mask'].to(model_device),
                decoder_input_ids=batch['target_ids'].to(model_device),
                decoder_attention_mask=batch['target_mask'].to(model_device),
                labels=batch['labels'].to(model_device),  # labels must be on the model's device too
            )

            loss = outputs[0]

            model.optimizer.zero_grad()  # clear gradients from the previous step
            loss.backward()              # backprop
            model.optimizer.step()       # update the weights

            if scheduler is not None:
                scheduler.step()  # update learning rate schedule

            # Accumulate the epoch mean without modifying the loss tensor in place.
            epoch_loss += loss.item() / n_batches

        # output stats (1-based epoch number in both the console and the log)
        print(f"Epoch {e + 1}; loss {epoch_loss}")
        log(output_dir, "Epoch " + str(e+1) + "; loss " + str(epoch_loss), log_file)
        epoch_losses.append(epoch_loss)

    return epoch_losses
    

def test(model, dataloader_test, output_dir, log_file):
    """Compute the mean loss of ``model`` over ``dataloader_test``.

    The model is switched to eval mode (and left there) and evaluated without
    gradient tracking. The result is printed and appended to the log file.

    Args:
        model: model to evaluate.
        dataloader_test: iterable of batches (dicts with the keys
            ``input_ids``, ``input_mask``, ``target_ids``, ``target_mask``
            and ``labels``).
        output_dir: directory of the log file.
        log_file: name of the log file inside ``output_dir``.

    Returns:
        float: the mean batch loss over ``dataloader_test``.
    """
    # Take the device from the model itself rather than from a notebook global.
    model_device = next(model.parameters()).device
    # Normalise by the number of *test* batches, not the training loader's.
    n_batches = len(dataloader_test)

    model.eval()

    total_loss = 0.0
    # No gradients are needed for evaluation; this saves memory and time.
    with torch.no_grad():
        for batch in tqdm(dataloader_test, total=n_batches):

            outputs = model(
                input_ids=batch['input_ids'].to(model_device),
                attention_mask=batch['input_mask'].to(model_device),
                decoder_input_ids=batch['target_ids'].to(model_device),
                decoder_attention_mask=batch['target_mask'].to(model_device),
                labels=batch['labels'].to(model_device),
            )

            total_loss += outputs[0].item() / n_batches

    # output stats
    print(f"Validation loss {total_loss}")
    log(output_dir, "Validation loss " + str(total_loss), log_file)

    return total_loss
    

In [63]:
# Run fine-tuning for n_epochs; train() prints each epoch's loss and appends it
# to log_file inside output_dir.
train(model, dataloader_train, n_epochs, output_dir, log_file)

  0%|                                                                                            | 0/18 [00:00<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 36.62 GiB (GPU 0; 7.43 GiB total capacity; 3.51 GiB already allocated; 3.51 GiB free; 3.85 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF