In [6]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets
!pip install sentence_transformers

In [7]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaModel
from sentence_transformers import SentenceTransformer

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [8]:
from utils import *

In [9]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = 'deepset/roberta-base-squad2'

In [10]:
# IR encoder -> T-5 sentence dense embeddings
encoder_model = SentenceTransformer('sentence-transformers/sentence-t5-base')

In [11]:
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
processer = AutoTokenizer.from_pretrained(checkpoint)

### Data Preparation

In [12]:
# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
feature_cache_path = 'Question_Answer_Dataset_v1.2/features_answers'

In [13]:
# prepare feature data if not yet exist 
if not (os.path.exists(feature_cache_path) and os.path.isfile(feature_cache_path)):
    # use the encoder to get the raw dataset (context are extracted by IR with the K-NN sentence to the QA pair)
    print("processing raw dataset... ")
    raw_dataset = CustomData(data_path, encoder_model, k=1)
    print("computing features...")
    # tokenize
    prepare_features_a(raw_dataset, feature_cache_path, processer, max_len_inp=512,max_len_out=96)
else:
    print("features exists")

features exists


In [14]:
# feature dataset
# leave 425 points for testing
test_points = 425
train_dataset = FeatureData(feature_cache_path, 'train', test_points)
test_dataset = FeatureData(feature_cache_path, 'test', test_points) 

EOFError: Ran out of input

In [None]:
# check what's in the dataset
input_dict = train_dataset[0]
print("input ids shape: ", input_dict['input_ids'].size())
print("question ids shape: ", input_dict['target_ids'].size())

In [None]:
# default split point: 425 -> samples after the split point will be in the test set
dataloader_train, dataloader_test = get_dataloaders(train_dataset, test_dataset, batch_size=128)

In [None]:
raw_dataset[110] # this is a hard question -> chain of thoughts/verification might be useful

In [None]:
# training loop
def train(model, dataloader_train, n_epochs, output_dir, log_file):

    model.train() # put to train mode
    
    for e in range(n_epochs):

        losses = 0
        for step, batch in tqdm(enumerate(dataloader_train), total=len(dataloader_train)):

            batch = {key: tensor.to(device) for key, tensor in batch.items()} # to cuda

            outputs = model(
                input_ids=batch['input_ids'],
                attention_mask=batch['input_mask'],
                decoder_input_ids=batch['target_ids'],
                decoder_attention_mask=batch['target_mask'],
                labels=batch['labels']
            )

            loss = outputs[0]

            model.optimizer.zero_grad() # clear loss
            loss.backward()
            model.optimizer.step()  # backprop to update the weights

            if model.scheduler is not None:
                model.scheduler.step()  # update learning rate schedule 

            # log losses
            loss /= len(train_dataloader)
            losses += loss.item()
        
        # output stats
        print(f"Epoch {e}; loss {losses}")
        log(output_dir, "Epoch " + str(e+1) + "; loss " + str(losses), log_file)
    

def test(model, dataloader_test, output_dir, log_file):
    
    model.eval()
    
    losses = 0
    for step, batch in tqdm(enumerate(dataloader_test), total=len(dataloader_test)):

        batch = {key: tensor.to(device) for key, tensor in batch.items()} # to cuda 

        outputs = model(
            input_ids=batch['input_ids'],
            attention_mask=batch['input_mask'],
            decoder_input_ids=batch['target_ids'],
            decoder_attention_mask=batch['target_mask'],
            labels=batch['labels']
        )

        loss = outputs[0]

        # log losses
        loss /= len(train_dataloader)
        losses += loss.item()
        
    # output stats
    print(f"Validation loss {losses}")
    log(output_dir, "Validation loss " + str(losses), log_file)

In [8]:
print("torch.cuda.memory_allocated: %fGB"%(torch.cuda.memory_allocated(0)/1024/1024/1024))
print("torch.cuda.memory_reserved: %fGB"%(torch.cuda.memory_reserved(0)/1024/1024/1024))
print("torch.cuda.max_memory_reserved: %fGB"%(torch.cuda.max_memory_reserved(0)/1024/1024/1024))

torch.cuda.memory_allocated: 0.000000GB
torch.cuda.memory_reserved: 0.000000GB
torch.cuda.max_memory_reserved: 0.000000GB
