In [79]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install sentence_transformers
!pip install gdown


In [80]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline, RobertaModel, T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model
from sentence_transformers import SentenceTransformer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [81]:
# file dependency (don't forget to upload utils.py)
from utils import *

In [82]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = 't5-base'

In [83]:
# load tokenizer and model
processer = get_tokenizer(checkpoint)
model = get_model(checkpoint, device, processer).to(device)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [84]:
# IR encoder -> T-5 sentence dense embeddings
encoder_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

### Data processing pipeline for fine-tuning
1. check if feature file exists
2. exists: load dataset from FeatureData
3. doesn't exist: load raw dataset from CustomData, call prepare_features, then do 2.  

In [85]:
# load dataset zip file to current directory and unzip
!unzip Question_Answer_Dataset_v1.2.zip

# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
feature_cache_path = 'Question_Answer_Dataset_v1.2/features_questions'
raw_path = os.path.join(data_path, "raw")

Archive:  Question_Answer_Dataset_v1.2.zip
replace __MACOSX/._Question_Answer_Dataset_v1.2? [y]es, [n]o, [A]ll, [N]one, [r]ename: A
  inflating: __MACOSX/._Question_Answer_Dataset_v1.2  
  inflating: Question_Answer_Dataset_v1.2/.DS_Store  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._.DS_Store  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._S10  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._S08  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._S09  
  inflating: Question_Answer_Dataset_v1.2/LICENSE-S08,S09  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._LICENSE-S08,S09  
  inflating: Question_Answer_Dataset_v1.2/README.v1.2  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/._README.v1.2  
  inflating: Question_Answer_Dataset_v1.2/S10/question_answer_pairs.txt  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/S10/._question_answer_pairs.txt  
  inflating: __MACOSX/Question_Answer_Dataset_v1.2/S10/._data  
  inflating: Question_Answer_Dataset_v1.

In [86]:
!rm -rf $feature_cache_path

In [87]:
ls $data_path

LICENSE-S08,S09  raw  [0m[01;32mREADME.v1.2[0m*  [01;34mS08[0m/  [01;34mS09[0m/  [01;34mS10[0m/


In [88]:
# prepare feature data if not yet exist
if not (os.path.exists(feature_cache_path) and os.path.isfile(feature_cache_path)):
    # use the encoder to get the raw dataset (context are extracted by IR with the K-NN sentence to the QA pair)
    print("processing raw dataset... ")
    raw_dataset = CustomData(data_path, encoder_model, k=1)
    print("computing features...")
    # tokenize
    prepare_features_q(raw_dataset, feature_cache_path, processer, max_len_inp=512,max_len_out=512)
else:
    print("features exists")


processing raw dataset... 


100%|██████████| 1715/1715 [02:36<00:00, 10.96it/s]
100%|██████████| 826/826 [01:33<00:00,  8.87it/s]
100%|██████████| 1459/1459 [02:24<00:00, 10.09it/s]


length of dataset:  2684
computing features...


100%|██████████| 2684/2684 [00:02<00:00, 932.98it/s] 


In [89]:
# feature dataset
# leave 425 points for testing
test_points = 425
train_dataset = FeatureData(feature_cache_path, 'train', test_points)
test_dataset = FeatureData(feature_cache_path, 'test', test_points)

length of feature train set:  2259
length of feature test set:  425


In [90]:
# check what's in the dataset
input_dict = train_dataset[0]
print("input ids shape: ", input_dict['input_ids'].size())
print("question ids shape: ", input_dict['target_ids'].size())

input ids shape:  torch.Size([512])
question ids shape:  torch.Size([512])


In [91]:
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=1)

Loaded train feature data with 2259 batches
Loaded test feature data with 425 batches


In [92]:
raw_dataset[110] # this is a hard question -> chain of thoughts/verification might be useful

('Is Calvin Jr. older than John Coolidge?',
 'No',
 'They had two sons; John Coolidge, born in 1906, and Calvin Jr., born in 1908.')

In [93]:
train_dataset[110].keys()

dict_keys(['input_ids', 'input_mask', 'target_ids', 'target_mask', 'labels'])

### Training Pipeline

In [94]:
# saves trained model
def __save_model(model_dir, model, model_type='latest'):

    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    root_model_path = os.path.join(model_dir, saved_name)
    state_dict = {'weights': model.state_dict(),
                  'optimizer': model.optimizer.state_dict(),
                  'scheduler': model.scheduler.state_dict()}
    torch.save(state_dict, root_model_path)

In [98]:
# Loads the experiment data if exists to resume training from last saved checkpoint.
def __load_experiment(model_dir, model, model_type='latest'):

    if model_type == "latest":
        saved_name = 'latest_model.pt'
    else:
        saved_name = 'best_model.pt'

    if os.path.exists(os.path.join(model_dir, 'train.log')):
        # get current epoch
        current_epoch = 0
        with open(os.path.join(model_dir, 'train.log')) as f:
            for line in f:
                current_epoch += 1
        # get the latest model
        state_dict = torch.load(os.path.join(model_dir, saved_name), map_location=device.type)
        model.load_state_dict(state_dict['weights'])
        model.optimizer.load_state_dict(state_dict['optimizer'])
        model.scheduler.load_state_dict(state_dict['scheduler'])
    else:
        current_epoch = 0

    return model, current_epoch


In [99]:
# creates log file with loss at each epoch
def log(output_dir, log_str, file_name=None):
    if file_name is None:
        file_name = "all.log"
    output_file = os.path.join(output_dir, file_name)
    with open(output_file, 'a') as f:
        f.write(log_str + '\n')

In [100]:
# training loop
def train(model, dataloader_train, n_epochs, model_dir, log_file):

    model.train() # put to train mode

    # load current model if exist
    model, current_epoch = __load_experiment(model_dir, model)

    all_losses = []

    for e in range(current_epoch, n_epochs):

        losses = 0
        for step, batch in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['input_mask'].to(device),
                decoder_input_ids=batch['target_ids'].to(device),
                decoder_attention_mask=batch['target_mask'].to(device),
                labels = batch['labels'].to(device)
            )

            loss = outputs[0]

            model.optimizer.zero_grad() # clear loss
            loss.backward()
            model.optimizer.step()  # backprop to update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule

            # log losses
            loss /= len(dataloader_train) # already average across batch in nn.CrossEntropy
            losses += loss.item()

        # output stats
        print(f"Epoch {e}; loss {losses}")
        log(model_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
        all_losses.append(losses)
        # save model
        __save_model(model_dir, model) # save latest
        if (e > current_epoch and losses < all_losses[-1]):
            __save_model(model_dir, model, model_type='best') # save best model



def test(model, dataloader_test, model_dir, log_file):

    model, e = __load_experiment(model_dir, model, model_type='latest')

    model.eval()

    losses = 0
    for step, batch in tqdm(enumerate(dataloader_test), total=len(dataloader_test)):

        outputs = model(
            input_ids=batch['input_ids'].to(device),
            attention_mask=batch['input_mask'].to(device),
            decoder_input_ids=batch['target_ids'].to(device),
            decoder_attention_mask=batch['target_mask'].to(device),
            labels = batch['labels'].to(device)
        )

        loss = outputs[0]

        # log losses
        loss /= len(dataloader_test)
        losses += loss.item()

    # output stats
    print(f"Validation loss {losses}")
    log(model_dir, "Validation loss " + str(losses), log_file)

In [101]:
def get_optimizer(model, opt_name, lr, eps):
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps)
    elif opt_name == 'SGD':
        return SGD(model.parameters(), lr=lr, eps=eps)

In [102]:
def get_scheduler(model, scheduler, n_batches, n_epochs, warmup_portion=0.1):
    train_steps = n_epochs*n_batches
    warm_step = int(train_steps*warmup_portion)
    if scheduler == "linear":
        return get_linear_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)
    elif scheduler == "cosine":
        return get_cosine_schedule_with_warmup(model.optimizer, num_warmup_steps=warm_step,num_training_steps=train_steps)

#### configure hyperparameters
n_epoch

lr

weight_decay

optimizer

warmup_steps

lr_scheduler

In [104]:
# hyperparameters
n_epochs = 10
lr = 1e-5
weight_decay = 5e-5
batch_size = 1

# dataloaders
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=batch_size)

# model optimizer
model.optimizer = get_optimizer(model, "Adam", lr, weight_decay)

# learning rate scheduler
# i.e. there are two types of scheduler: linear and cosine. I was confused what
# to choose so I chose linear
model.scheduler = get_scheduler(model, "linear", len(dataloader_train), n_epochs)

name = checkpoint.split('/')[-1]
print(name)

# model state_dict
model_dir = f"{name}_e{n_epochs}_lr{lr}_eps{weight_decay}_Adam_linearS_batch{batch_size}"
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)

# log file
log_file = "train.log"

Loaded train feature data with 2259 batches
Loaded test feature data with 425 batches
t5-base


In [105]:
train(model, dataloader_train, n_epochs, model_dir, log_file)

100%|██████████| 2259/2259 [13:03<00:00,  2.88it/s]


Epoch 0; loss 4.403209098178195


100%|██████████| 2259/2259 [13:03<00:00,  2.88it/s]


Epoch 1; loss 0.7320343435912946


100%|██████████| 2259/2259 [13:03<00:00,  2.88it/s]


Epoch 2; loss 0.22169073486747948


100%|██████████| 2259/2259 [13:03<00:00,  2.88it/s]


Epoch 3; loss 0.10549498569642424


100%|██████████| 2259/2259 [13:02<00:00,  2.89it/s]


Epoch 4; loss 0.06360594212611659


100%|██████████| 2259/2259 [13:03<00:00,  2.88it/s]


Epoch 5; loss 0.04484260515624783


100%|██████████| 2259/2259 [13:02<00:00,  2.89it/s]


Epoch 6; loss 0.033844294625680504


100%|██████████| 2259/2259 [13:02<00:00,  2.89it/s]


Epoch 7; loss 0.02794611336623376


100%|██████████| 2259/2259 [13:02<00:00,  2.89it/s]


Epoch 8; loss 0.024307027566472073


100%|██████████| 2259/2259 [13:02<00:00,  2.89it/s]


Epoch 9; loss 0.023297698944418244


###load trained model###

In [109]:
model, current_epoch = __load_experiment(model_dir, model, model_type='latest')

AttributeError: ignored

In [110]:
model[0].eval()

T5ForConditionalGeneration(
  (shared): Embedding(32100, 768)
  (encoder): T5Stack(
    (embed_tokens): Embedding(32100, 768)
    (block): ModuleList(
      (0): T5Block(
        (layer): ModuleList(
          (0): T5LayerSelfAttention(
            (SelfAttention): T5Attention(
              (q): Linear(in_features=768, out_features=768, bias=False)
              (k): Linear(in_features=768, out_features=768, bias=False)
              (v): Linear(in_features=768, out_features=768, bias=False)
              (o): Linear(in_features=768, out_features=768, bias=False)
              (relative_attention_bias): Embedding(32, 12)
            )
            (layer_norm): T5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): T5LayerFF(
            (DenseReluDense): T5DenseActDense(
              (wi): Linear(in_features=768, out_features=3072, bias=False)
              (wo): Linear(in_features=3072, out_features=768, bias=False)
              (dropout): Dro

In [128]:
context = r"""Giraffes can inhabit savannas, grasslands, or open woodlands. They prefer areas enriched with acacia growth. They drink large quantities of water and, as a result, they can spend long periods of time in dry, arid areas. When searching for more food they will venture into areas with denser foliage.""".replace('\n', ' ')

In [129]:
inputs = processer(context, return_tensors='pt')

In [130]:
inputs.keys()

dict_keys(['input_ids', 'attention_mask'])

In [131]:
input_ids = inputs['input_ids'].to(device)

In [132]:
outputs = model[0].generate(input_ids, max_new_tokens=512)
print(outputs)

tensor([[    0, 32099,   328,  3281,   508,  6201,    13,   387,    11,     6,
          2459,     6,    79,    54,  1492,   307,  8811,    13,    97,    16,
          2192,     6,     3,     9,  4055,   844,     5,   366,  9601,    21,
           542,    79,    56,  6086,   139,   844,    28, 13809, 25840,     5,
             1]], device='cuda:0')


In [136]:
question_answer = processer.decode(outputs[0], skip_special_tokens=False)
print(question_answer)
question_answer = question_answer.replace(processer.pad_token, "").replace(processer.eos_token, "")
print(question_answer)
print(question_answer.split(processer.sep_token))


<pad> <extra_id_0> They drink large amounts of water and, therefore, they can spend long periods of time in dry, arid areas. When hunting for food they will venture into areas with dense foliage.</s>
 <extra_id_0> They drink large amounts of water and, therefore, they can spend long periods of time in dry, arid areas. When hunting for food they will venture into areas with dense foliage.
['<extra_id_0>', 'They', 'drink', 'large', 'amounts', 'of', 'water', 'and,', 'therefore,', 'they', 'can', 'spend', 'long', 'periods', 'of', 'time', 'in', 'dry,', 'arid', 'areas.', 'When', 'hunting', 'for', 'food', 'they', 'will', 'venture', 'into', 'areas', 'with', 'dense', 'foliage.']
