In [6]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install sentence_transformers

In [1]:
import os
import numpy as np
import pickle
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaModel, T5Config, T5ForConditionalGeneration, T5Tokenizer, T5Model
from sentence_transformers import SentenceTransformer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [2]:
from utils import *

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = "deepset/roberta-base-squad2" #"t5-large"

In [4]:
# IR encoder -> T-5 sentence dense embeddings
encoder_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [5]:
processer = get_tokenizer(checkpoint)

In [6]:
model = get_model(checkpoint, device, processer)

### Data Preparation

In [7]:
# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
raw_path = os.path.join(data_path, "raw")
feature_cache_path = 'Question_Answer_Dataset_v1.2/features_answers'

In [8]:
# Removes any previously computed features at feature_cache_path. Because this runs
# before the data-preparation cell, that cell's "features exists" branch is not
# reached on a top-to-bottom run; skip this cell to reuse cached features.
!rm -rf $feature_cache_path

In [9]:
ls $data_path

LICENSE-S08,S09  [0m[01;32mREADME.v1.2[0m*  [01;34mS08[0m/  [01;34mS09[0m/  [01;34mS10[0m/  raw


In [10]:
# Build the raw dataset once and cache it as a pickle at raw_path.
# CustomData is given the sentence encoder and k=1; per the original note, the
# context of each QA pair is the nearest-neighbour sentence retrieved by the encoder.
print("processing raw dataset... ")
if os.path.isfile(raw_path):
    # Cached copy found: deserialize it instead of rebuilding the dataset.
    # NOTE(review): pickle.load can execute arbitrary code; only load caches you created.
    with open(raw_path, 'rb') as f:
        raw_dataset = pickle.load(f)
else:
    raw_dataset = CustomData(data_path, encoder_model, k=1)
    with open(raw_path, 'wb') as f:
        pickle.dump(raw_dataset, f)

print("computing features...")
# Features are only computed when no cache file is present
# (os.path.isfile already implies that the path exists).
if os.path.isfile(feature_cache_path):
    print("features exists")
else:
    prepare_features_a(raw_dataset, feature_cache_path, processer, max_len_inp=512, max_len_out=512)

processing raw dataset... 
computing features...


100%|█████████████████████████████████████████████████████████████████████████████| 2684/2684 [00:01<00:00, 1556.41it/s]


In [16]:
# feature dataset
# leave 425 points for testing
test_points = 425
train_dataset = FeatureData_A(feature_cache_path, 'train', test_points)
test_dataset = FeatureData_A(feature_cache_path, 'test', test_points) 

length of feature train set:  2259
length of feature test set:  425


In [17]:
# Sanity check: look at the tensor sizes of the first training example.
# NOTE(review): the second line is labelled "question ids" but prints the size of
# 'target_ids'; the label is left unchanged so it keeps matching the recorded output.
input_dict = train_dataset[0]
print("input ids shape: ", input_dict['input_ids'].size())
print("question ids shape: ", input_dict['target_ids'].size())

input ids shape:  torch.Size([512])
question ids shape:  torch.Size([512])


In [18]:
raw_dataset[110] # this is a hard question -> chain of thoughts/verification might be useful

('Is Calvin Jr. older than John Coolidge?',
 'No',
 'They had two sons; John Coolidge, born in 1906, and Calvin Jr., born in 1908.')

In [19]:
train_dataset[110].keys()

dict_keys(['input_ids', 'input_mask', 'target_ids', 'target_mask', 'start', 'end', 'labels'])

In [20]:
train_dataset[110]['input_ids'].shape

torch.Size([512])

### Train pipeline

In [21]:
def __save_model(model_dir, model, model_type='latest'):
    """Write the model's training state to a checkpoint file inside ``model_dir``.

    The checkpoint is a dict with the keys ``'weights'``, ``'optimizer'`` and
    ``'scheduler'`` holding the respective ``state_dict()`` results.

    Args:
        model_dir: Existing directory the checkpoint is written into.
        model: Object exposing ``state_dict()`` and an ``optimizer`` attribute, and
            optionally a ``scheduler`` attribute (may be missing or ``None``).
        model_type: ``"latest"`` writes ``latest_model.pt``; any other value writes
            ``best_model.pt``.
    """
    saved_name = 'latest_model.pt' if model_type == "latest" else 'best_model.pt'
    root_model_path = os.path.join(model_dir, saved_name)

    # The training loop treats model.scheduler as optional (it checks for None before
    # stepping), so saving must not fail when no scheduler is attached.
    scheduler = getattr(model, 'scheduler', None)
    state_dict = {'weights': model.state_dict(),
                  'optimizer': model.optimizer.state_dict(),
                  'scheduler': scheduler.state_dict() if scheduler is not None else None}
    torch.save(state_dict, root_model_path)
        

# Loads the experiment data if it exists, to resume training from the last saved checkpoint.
def __load_experiment(model_dir, model, model_type='latest'):
    """Restore model/optimizer/scheduler state and report how many epochs are done.

    The number of completed epochs is read from ``train.log`` inside ``model_dir``:
    the training loop writes one ``"Epoch <n>; loss ..."`` line per finished epoch,
    and the ``<n>`` of the last such line is returned. Other lines (for example the
    ``"Validation loss ..."`` line written by evaluation) are ignored, so they no
    longer inflate the resume epoch.

    Args:
        model_dir: Directory containing ``train.log`` and the checkpoint files.
        model: Object exposing ``load_state_dict`` and an ``optimizer`` attribute,
            and optionally a ``scheduler`` attribute (may be missing or ``None``).
        model_type: ``"latest"`` loads ``latest_model.pt``; any other value loads
            ``best_model.pt``.

    Returns:
        ``(model, current_epoch)``. ``current_epoch`` is 0 when there is no log or
        no checkpoint file, in which case ``model`` is returned untouched.
    """
    saved_name = 'latest_model.pt' if model_type == "latest" else 'best_model.pt'
    log_path = os.path.join(model_dir, 'train.log')
    checkpoint_path = os.path.join(model_dir, saved_name)

    if not os.path.exists(log_path):
        return model, 0

    if not os.path.exists(checkpoint_path):
        # A log without a matching checkpoint means no weights can be restored,
        # so report a fresh start instead of failing inside torch.load.
        print(f"checkpoint {checkpoint_path} not found; keeping the current weights")
        return model, 0

    # get current epoch: number recorded on the last "Epoch <n>; ..." line
    current_epoch = 0
    with open(log_path) as f:
        for line in f:
            if not line.startswith("Epoch "):
                continue
            epoch_field = line[len("Epoch "):].split(";", 1)[0].strip()
            try:
                current_epoch = int(epoch_field)
            except ValueError:
                # Not an epoch record in the expected format; skip it.
                continue

    # get the saved model
    state_dict = torch.load(checkpoint_path, map_location=device.type)
    model.load_state_dict(state_dict['weights'])
    model.optimizer.load_state_dict(state_dict['optimizer'])
    scheduler = getattr(model, 'scheduler', None)
    if scheduler is not None and state_dict.get('scheduler') is not None:
        scheduler.load_state_dict(state_dict['scheduler'])

    return model, current_epoch


def log(output_dir, log_str, file_name=None):
    """Append ``log_str`` as one line to a log file inside ``output_dir``.

    Args:
        output_dir: Directory that holds the log file.
        log_str: Text to record; a trailing newline is added.
        file_name: Name of the log file; ``None`` selects ``"all.log"``.
    """
    target_name = "all.log" if file_name is None else file_name
    with open(os.path.join(output_dir, target_name), 'a') as handle:
        handle.write(log_str + '\n')

In [35]:
# training loop
def train(model, dataloader_train, n_epochs, model_dir, log_file):
    """Fine-tune ``model`` for up to ``n_epochs`` epochs, resuming if a checkpoint exists.

    Each batch is expected to be a dict providing ``'input_ids'``, ``'input_mask'``,
    ``'start'`` and ``'end'`` tensors; the first element of the model output is used
    as the loss. After every epoch the mean batch loss is printed and logged, the
    latest checkpoint is written, and the best checkpoint is refreshed whenever the
    epoch loss improves on the best loss seen during this call.

    Args:
        model: Model with ``optimizer`` and ``scheduler`` attributes attached
            (``scheduler`` may be ``None``).
        dataloader_train: Iterable of training batches supporting ``len()``.
        n_epochs: Total number of epochs; epochs already completed according to
            the restored experiment are skipped.
        model_dir: Directory used for checkpoints and the log file.
        log_file: Name of the log file inside ``model_dir``.

    Returns:
        List with the mean loss of every epoch run by this call.
    """
    model.train()  # put to train mode

    # load current model if exist
    model, current_epoch = __load_experiment(model_dir, model)

    n_batches = len(dataloader_train)
    all_losses = []
    # Losses of earlier runs are not restored, so "best" is relative to this call.
    best_loss = float('inf')

    for e in range(current_epoch, n_epochs):

        losses = 0
        for batch in tqdm(dataloader_train, total=n_batches):

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['input_mask'].to(device),
                start_positions=batch['start'].to(device),
                end_positions=batch['end'].to(device)
            )

            loss = outputs[0]

            model.optimizer.zero_grad()  # clear gradients left over from the previous step
            loss.backward()              # backprop to compute gradients
            model.optimizer.step()       # update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule

            # accumulate the mean batch loss without mutating the loss tensor
            losses += loss.item() / n_batches

        # output stats (the printed epoch is 0-based, the logged one is 1-based)
        print(f"Epoch {e}; loss {losses}")
        log(model_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
        all_losses.append(losses)

        # save model
        __save_model(model_dir, model)  # save latest
        # Compare against the best loss so far. The previous check compared the
        # epoch loss with itself (the value just appended), so it never fired.
        if losses < best_loss:
            best_loss = losses
            __save_model(model_dir, model, model_type='best')  # save best model

    return all_losses
        
        

def test(model, dataloader_test, model_dir, log_file):
    """Evaluate the latest saved checkpoint and report the mean batch loss.

    Each batch is expected to be a dict providing ``'input_ids'``, ``'input_mask'``,
    ``'start'`` and ``'end'`` tensors; the first element of the model output is used
    as the loss.

    Note: this appends a ``"Validation loss ..."`` line to ``log_file``; avoid
    pointing it at a file whose lines are used for epoch bookkeeping.

    Args:
        model: Model to evaluate; the ``'latest'`` checkpoint is loaded into it.
        dataloader_test: Iterable of evaluation batches supporting ``len()``.
        model_dir: Directory holding the checkpoint and the log file.
        log_file: Name of the log file inside ``model_dir``.

    Returns:
        The mean loss over all evaluation batches.
    """
    model, _ = __load_experiment(model_dir, model, model_type='latest')

    model.eval()

    n_batches = len(dataloader_test)
    losses = 0
    # Evaluation needs no gradients; disabling autograd avoids building a graph
    # (and keeping activations alive) for every batch.
    with torch.no_grad():
        for batch in tqdm(dataloader_test, total=n_batches):

            outputs = model(
                input_ids=batch['input_ids'].to(device),
                attention_mask=batch['input_mask'].to(device),
                start_positions=batch['start'].to(device),
                end_positions=batch['end'].to(device)
            )

            # accumulate the mean batch loss
            losses += outputs[0].item() / n_batches

    # output stats
    print(f"Validation loss {losses}")
    log(model_dir, "Validation loss " + str(losses), log_file)

    return losses

In [36]:
def get_optimizer(model, opt_name, lr, eps):
    """Create the optimizer for ``model``'s parameters.

    Args:
        model: Object exposing ``parameters()``.
        opt_name: ``'Adam'`` (builds ``AdamW``) or ``'SGD'``.
        lr: Learning rate.
        eps: Numerical-stability epsilon forwarded to ``AdamW``; ignored for
            ``'SGD'``, whose constructor has no such argument.

    Returns:
        The constructed optimizer.

    Raises:
        ValueError: If ``opt_name`` is not one of the supported names.
    """
    if opt_name == 'Adam':
        return AdamW(model.parameters(), lr=lr, eps=eps)
    if opt_name == 'SGD':
        # torch.optim.SGD does not accept eps; passing it raised a TypeError.
        return SGD(model.parameters(), lr=lr)
    # Fail here rather than returning None and crashing later in the training loop.
    raise ValueError(f"unsupported optimizer {opt_name!r}; expected 'Adam' or 'SGD'")
    
def get_scheduler(model, scheduler, n_batches, n_epochs, warmup_portion=0.1):
    """Build a warmup learning-rate schedule for ``model.optimizer``.

    Args:
        model: Object whose ``optimizer`` attribute is scheduled.
        scheduler: ``"linear"`` or ``"cosine"``; any other value yields ``None``.
        n_batches: Number of optimizer steps per epoch.
        n_epochs: Number of epochs.
        warmup_portion: Fraction of all steps spent warming up.

    Returns:
        The schedule object, or ``None`` when ``scheduler`` is not recognised.
    """
    total_steps = n_epochs*n_batches
    warmup_steps = int(total_steps*warmup_portion)

    if scheduler not in ("linear", "cosine"):
        return None

    if scheduler == "linear":
        build_schedule = get_linear_schedule_with_warmup
    else:
        build_schedule = get_cosine_schedule_with_warmup
    return build_schedule(model.optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

In [37]:
# hyperparameters
n_epochs = 10
lr = 1e-5
weight_decay = 5e-5
batch_size = 1

# dataloaders
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=batch_size)

# model optimizer
model.optimizer = get_optimizer(model, "Adam", lr, weight_decay)

# learning rate scheduler
model.scheduler = get_scheduler(model, "linear", len(dataloader_train), n_epochs)

name = checkpoint.split('/')[-1]
print(name)

# model state_dict
model_dir = f"{name}_e{n_epochs}_lr{lr}_eps{weight_decay}_Adam_linearS_batch{batch_size}"
if not os.path.isdir(model_dir):
    os.mkdir(model_dir)

# log file
log_file = "train.log"

Loaded train feature data with 2259 batches
Loaded test feature data with 425 batches
roberta-base-squad2


In [38]:
# Peek at the first raw sample. Judging by the raw_dataset[110] output shown earlier,
# items look like (question, answer, context), so index 1 -- stored in `text` here --
# is the answer rather than the context passage.
question = raw_dataset[0][0]
text = raw_dataset[0][1]
print(question)
print(text)

Was Abraham Lincoln the sixteenth President of the United States?
yes


In [39]:
train(model, dataloader_train, n_epochs, model_dir, log_file)

100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.60it/s]


Epoch 0; loss 0.002366676836057735


100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.57it/s]


Epoch 1; loss 5.1048086130656145e-05


100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.61it/s]


Epoch 2; loss 1.8201642551174402e-05


100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.61it/s]


Epoch 3; loss 8.83299624954148e-06


100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.61it/s]


Epoch 4; loss 4.7231861232122085e-06


100%|███████████████████████████████████████████████████████████████████████████████| 2259/2259 [02:01<00:00, 18.61it/s]


Epoch 5; loss 2.9641582992567805e-06


 64%|██████████████████████████████████████████████████▉                            | 1455/2259 [01:18<00:43, 18.59it/s]


KeyboardInterrupt: 

In [40]:
test(model, dataloader_test, model_dir, log_file)

100%|█████████████████████████████████████████████████████████████████████████████████| 425/425 [00:04<00:00, 88.53it/s]

Validation loss 6.215710557366805e-07





In [23]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 3.797605GB
torch.cuda.memory_reserved: 15.416016GB
torch.cuda.max_memory_reserved: 15.416016GB


### Inference pipeline

In [14]:
# Inference-time loader: restores only the model weights from a saved checkpoint.
# NOTE: this reuses the name of the training-time __load_experiment defined earlier,
# which returns (model, current_epoch). Once this cell has run, train()/test() fail
# when unpacking the result until the earlier definition is executed again.
def __load_experiment(model_dir, model, model_type='latest'):
    """Load saved weights into ``model`` if the checkpoint file exists.

    Args:
        model_dir: Directory containing the checkpoint files.
        model: Object exposing ``load_state_dict``.
        model_type: ``"latest"`` loads ``latest_model.pt``; any other value loads
            ``best_model.pt``.

    Returns:
        ``model`` -- with the checkpoint's ``'weights'`` loaded when the file exists,
        otherwise unchanged (a message is printed).
    """
    saved_name = 'latest_model.pt' if model_type == "latest" else 'best_model.pt'
    checkpoint_path = os.path.join(model_dir, saved_name)

    # Check for the file that is actually loaded. The previous guard tested for
    # train.log, which says nothing about whether the checkpoint is present.
    if os.path.exists(checkpoint_path):
        state_dict = torch.load(checkpoint_path, map_location=device.type)
        model.load_state_dict(state_dict['weights'])
    else:
        print("model state dict doesn't exist")

    return model

In [35]:
# load model
model_dir = 'roberta-base-squad2_e10_lr1e-05_eps5e-05_Adam_linearS_batch1'
model = __load_experiment(model_dir, model, model_type='latest')

In [36]:
model.eval()

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [37]:
model.to('cpu')

RobertaForQuestionAnswering(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (Lay

In [38]:
question_answerer = pipeline("question-answering", model=model, tokenizer=processer)

In [39]:
context = r"""Giraffes can inhabit savannas, grasslands, or open woodlands. They prefer areas enriched with acacia growth. They drink large quantities of water and, as a result, they can spend long periods of time in dry, arid areas. When searching for more food they will venture into areas with denser foliage.""".replace('\n', ' ')

In [44]:
question = 'What type of vegetation do giraffes prefer?'
answer = 'acacia' # gt

In [46]:
question = "Is the bear white?"
context = "a white bear running to a tree"

In [47]:
model.device

device(type='cpu')

In [48]:
question_answerer(question=question, context=context)

{'score': 3.3720368924699488e-15,
 'start': 2,
 'end': 12,
 'answer': 'white bear'}

In [25]:
# Comparison pipeline built directly from the 't5-base' checkpoint.
# The load warning printed below reports that the QA head (qa_outputs.*) is newly
# initialized, so answers from this pipeline come from an untrained head.
question_answerer2 = pipeline("question-answering", model='t5-base')

Some weights of T5ForQuestionAnswering were not initialized from the model checkpoint at t5-base and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [50]:
question_answerer2(question=question, context=context)

{'score': 0.023177172988653183,
 'start': 0,
 'end': 25,
 'answer': 'a white bear running to a'}

In [27]:
question_answerer3 = pipeline("question-answering", model=checkpoint)

In [49]:
question_answerer3(question=question, context=context)

{'score': 0.1231619194149971, 'start': 0, 'end': 7, 'answer': 'a white'}