In [6]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install sentence_transformers

In [1]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaModel, T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model
from sentence_transformers import SentenceTransformer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [2]:
from utils import *

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = 't5-base'

In [4]:
# IR encoder -> T-5 sentence dense embeddings
encoder_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [5]:
# model = AutoModelForQuestionAnswering.from_pretrained(checkpoint).to(device)
# processer = AutoTokenizer.from_pretrained(checkpoint)

In [6]:
# load tokenizer and model
processer = get_tokenizer(checkpoint)
model = get_model(checkpoint, device, processer)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


### Data Preparation

In [7]:
# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
feature_cache_path = 'Question_Answer_Dataset_v1.2/features_answers'

In [9]:
# prepare feature data if not yet exist 
if not (os.path.exists(feature_cache_path) and os.path.isfile(feature_cache_path)):
    # use the encoder to get the raw dataset (context are extracted by IR with the K-NN sentence to the QA pair)
    print("processing raw dataset... ")
    raw_dataset = CustomData(data_path, encoder_model, k=1)
    print("computing features...")
    # tokenize
    prepare_features_a(raw_dataset, feature_cache_path, processer, max_len_inp=512,max_len_out=512)
else:
    print("features exists")

processing raw dataset... 


100%|███████████████████████████████████████████████████████████████████████████████| 1715/1715 [01:35<00:00, 17.87it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 826/826 [01:00<00:00, 13.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 1459/1459 [01:31<00:00, 15.86it/s]


length of dataset:  2684
computing features...


100%|█████████████████████████████████████████████████████████████████████████████| 2684/2684 [00:01<00:00, 1481.00it/s]


In [8]:
# feature dataset
# leave 425 points for testing
test_points = 425
train_dataset = FeatureData(feature_cache_path, 'train', test_points)
test_dataset = FeatureData(feature_cache_path, 'test', test_points) 

length of feature train set:  2259
length of feature test set:  425


In [9]:
# check what's in the dataset
input_dict = train_dataset[0]
print("input ids shape: ", input_dict['input_ids'].size())
print("question ids shape: ", input_dict['target_ids'].size())

input ids shape:  torch.Size([512])
question ids shape:  torch.Size([512])


In [13]:
raw_dataset[110] # this is a hard question -> chain of thoughts/verification might be useful

In [11]:
train_dataset[110].keys()

dict_keys(['input_ids', 'input_mask', 'target_ids', 'target_mask', 'labels'])

In [12]:
train_dataset[110]['input_ids'].shape

torch.Size([512])

### Train pipeline

In [9]:
def __save_model(model_dir, model, model_type='latest'):

    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    root_model_path = os.path.join(model_dir, saved_name)
    state_dict = {'weights': model.state_dict(), 
                  'optimizer': model.optimizer.state_dict(), 
                  'scheduler': model.scheduler.state_dict()}
    torch.save(state_dict, root_model_path)
        

# Loads the experiment data if exists to resume training from last saved checkpoint.
def __load_experiment(model_dir, model, model_type='latest'):
    
    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    if os.path.exists(os.path.join(model_dir, 'train.log')):
        # get current epoch
        current_epoch = 0
        with open(os.path.join(model_dir, 'train.log')) as f:
            for line in f:
                current_epoch += 1
        # get the latest model
        state_dict = torch.load(os.path.join(model_dir, saved_name), map_location=device.type)
        model.load_state_dict(state_dict['weights'])
        model.optimizer.load_state_dict(state_dict['optimizer'])
        model.scheduler.load_state_dict(state_dict['scheduler'])
    else:
        current_epoch = 0

    return model, current_epoch


def log(output_dir, log_str, file_name=None):
    if file_name is None:
        file_name = "all.log"
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'a') as f:
        f.write(log_str + '\n')

In [21]:
# training loop
def train(model, dataloader_train, n_epochs, model_dir, log_file):

    model.train() # put to train mode
    
    # load current model if exist
    model, current_epoch = __load_experiment(model_dir, model)
    
    all_losses = []
    
    for e in range(current_epoch, n_epochs):

        losses = 0
        for step, batch in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):
            
            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['input_mask'].to(device),
                decoder_input_ids=batch['target_ids'].to(device),
                decoder_attention_mask=batch['target_mask'].to(device), 
                labels = batch['labels'].to(device)
            )

            loss = outputs[0]

            model.optimizer.zero_grad() # clear loss
            loss.backward()
            model.optimizer.step()  # backprop to update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule 

            # log losses
            loss /= len(dataloader_train)
            losses += loss.item()
            
        # output stats
        print(f"Epoch {e}; loss {losses}")
        log(model_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
        all_losses.append(losses)
        # save model
        __save_model(model_dir, model) # save latest
        if (e > current_epoch and losses < all_losses[-1]):
            __save_model(model_dir, model, model_type='best') # save best model        
        
        

def test(model, dataloader_test, model_dir, log_file):
    
    model, e = __load_experiment(model_dir, model, model_type='latest')
    
    model.eval()
    
    losses = 0
    for step, batch in tqdm(enumerate(dataloader_test), total=len(dataloader_test)):

        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['input_mask'].to(device),
            decoder_input_ids=batch['target_ids'].to(device),
            decoder_attention_mask=batch['target_mask'].to(device),
            labels = batch['labels'].to(device)
        )

        loss = outputs[0]

        # log losses
        loss /= len(dataloader_test)
        losses += loss.item()
        
    # output stats
    print(f"Validation loss {losses}")
    log(model_dir, "Validation loss " + str(losses), log_file)

In [15]:
def get_optimizer(model, opt_name, lr, eps): 
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps)
    elif opt_name == 'SGD':
        return SGD(model.parameters(), lr=lr, eps=eps)
    
def get_scheduler(model, scheduler, n_batches, n_epochs, warmup_portion=0.1):
    train_steps = n_epochs*n_batches
    warm_step = int(train_steps*warmup_portion)
    if scheduler == "linear": 
        return get_linear_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)
    elif scheduler == "cosine":
        return get_cosine_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)

In [16]:
# hyperparameters
n_epochs = 10
lr = 1e-5
weight_decay = 5e-5
batch_size = 4

# dataloaders
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=batch_size)

# model optimizer
model.optimizer = get_optimizer(model, "Adam", lr, weight_decay)

# learning rate scheduler
model.scheduler = get_scheduler(model, "linear", len(dataloader_train), n_epochs)

# model state_dict
model_dir = f"{checkpoint}_e{n_epochs}_lr{lr}_eps{weight_decay}_Adam_linearS_batch{batch_size}"
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)

# log file
log_file = "train.log"

Loaded train feature data with 565 batches
Loaded test feature data with 107 batches


In [17]:
train(model, dataloader_train, n_epochs, model_dir, log_file)

100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 2; loss 1.0490192598044814


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 3; loss 0.671609889274805


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 4; loss 0.4829345119524078


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 5; loss 0.37002997546960614


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 6; loss 0.30222941334181996


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 7; loss 0.26097374822256825


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 8; loss 0.22786905645398292


100%|█████████████████████████████████████████████████████████████████████████████████| 565/565 [03:34<00:00,  2.63it/s]


Epoch 9; loss 0.21599957597402408


In [22]:
test(model, dataloader_test, model_dir, log_file)

100%|█████████████████████████████████████████████████████████████████████████████████| 107/107 [00:12<00:00,  8.36it/s]

Validation loss 0.06734531256876153





In [23]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 3.797605GB
torch.cuda.memory_reserved: 15.416016GB
torch.cuda.max_memory_reserved: 15.416016GB
