In [None]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install sentence_transformers

In [1]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model
from sentence_transformers import SentenceTransformer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [2]:
# file dependency
from utils import *

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### load model

In [3]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = 't5-base'

In [4]:
# load tokenizer and model
processer = get_tokenizer(checkpoint)
model = get_model(checkpoint, device, processer).to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
# IR encoder -> T-5 sentence dense embeddings
encoder_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

### Data processing pipeline for fine-tuning
1. check if feature file exists
2. exists: load dataset from FeatureData
3. doesn't exist: load raw dataset from CustomData, call prepare_features, then do 2.  

In [6]:
# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
feature_cache_path = 'Question_Answer_Dataset_v1.2/features_questions'

In [1]:
# prepare feature data if not yet exist 
if not (os.path.exists(feature_cache_path) and os.path.isfile(feature_cache_path)):
    # use the encoder to get the raw dataset (context are extracted by IR with the K-NN sentence to the QA pair)
    print("processing raw dataset... ")
    raw_dataset = CustomData(data_path, encoder_model, k=1)
    print("computing features...")
    # tokenize
    prepare_features_q(raw_dataset, feature_cache_path, processer, max_len_inp=512,max_len_out=512)
else:
    print("features exists")

NameError: name 'os' is not defined

In [8]:
# feature dataset
# leave 425 points for testing
test_points = 425
train_dataset = FeatureData(feature_cache_path, 'train', test_points)
test_dataset = FeatureData(feature_cache_path, 'test', test_points) 

length of feature train set:  2259
length of feature test set:  425


In [9]:
# check what's in the dataset
input_dict = train_dataset[0]
print("input ids shape: ", input_dict['input_ids'].size())
print("question ids shape: ", input_dict['target_ids'].size())

input ids shape:  torch.Size([512])
question ids shape:  torch.Size([96])


In [10]:
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=128)

Loaded train feature data with 18 batches
Loaded test feature data with 4 batches


In [17]:
raw_dataset[110] # this is a hard question -> chain of thoughts/verification might be useful

('Is Calvin Jr. older than John Coolidge?',
 'No',
 'They had two sons; John Coolidge, born in 1906, and Calvin Jr., born in 1908.')

### Training Pipeline

In [18]:
def get_optimizer(model, opt_name, lr, eps): 
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps)
    elif opt_name == 'SGD':
        return SGD(model.parameters(), lr=lr, eps=eps)

In [19]:
def get_scheduler(model, n_batches, n_epochs, warmup_portion=0.1):
    train_steps = n_epochs*n_batches
    warm_step = int(train_steps*warmup_portion)
    return get_linear_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)

#### configure hyperparameters
n_epoch 

lr

weight_decay

optimizer 

warmup_steps

lr_scheduler

In [20]:
# hyperparameters
n_epochs = 10
lr = 1e-5
weight_decay = 5e-5

# model optimizer
model.optimizer = get_optimizer(model, "Adam", lr, weight_decay)

# learning rate scheduler
model.scheduler = get_scheduler(model, len(dataloader_train), n_epochs)

# log file
output_dir = "logs"
if not os.path.isdir(output_dir):
    os.mkdir(output_dir)
log_file = "trial.log"

In [21]:
def log(output_dir, log_str, file_name=None):
    if file_name is None:
        file_name = "all.log"
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'a') as f:
        f.write(log_str + '\n')

In [22]:
# training loop
def train(model, dataloader_train, n_epochs, output_dir, log_file):
    
    model.train()
    
    for e in range(n_epochs):

        losses = 0
        for step, batch in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['input_mask'].to(device),
                decoder_input_ids=batch['target_ids'].to(device),
                decoder_attention_mask=batch['target_mask'].to(device),
                labels=batch['labels']
            )

            loss = outputs[0]

            model.optimizer.zero_grad() # clear loss
            loss.backward()
            model.optimizer.step()  # backprop to update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule 

            # log losses
            loss /= len(train_dataloader)
            losses += loss.item()
        
        # output stats
        print(f"Epoch {e}; loss {losses}")
        log(output_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
    

def test(model, dataloader_test, output_dir, log_file):
    
    model.eval()
    
    losses = 0
    for step, batch in tqdm(enumerate(dataloader_test), total=len(dataloader_test)):

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['input_mask'],
                decoder_input_ids=batch['target_ids'],
                decoder_attention_mask=batch['target_mask'],
                labels=batch['labels']
            )

            loss = outputs[0]

            # log losses
            loss /= len(train_dataloader)
            losses += loss.item()
        
    # output stats
    print(f"Validation loss {losses}")
    log(output_dir, "Validation loss " + str(losses), log_file)
    

In [23]:
train(model, dataloader_train, n_epochs, output_dir, log_file)

  0%|                                                                                            | 0/18 [00:01<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 1.50 GiB. GPU 0 has a total capacty of 22.19 GiB of which 791.50 MiB is free. Including non-PyTorch memory, this process has 21.41 GiB memory in use. Of the allocated memory 20.99 GiB is allocated by PyTorch, and 132.51 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF