In [1]:
%%capture
!pip install transformers
!pip install sentencepiece
!pip install datasets

In [17]:
import os
import numpy as np
import pickle as pkl
from tqdm import tqdm

# pytorch
import torch
from torch.utils.data import DataLoader, Dataset

# model config
from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, RobertaModel

# model optim
from torch.optim import AdamW, SGD

# lr schedulers
from transformers import get_cosine_schedule_with_warmup, get_linear_schedule_with_warmup, \
    get_cosine_with_hard_restarts_schedule_with_warmup

In [7]:
from utils import *

[nltk_data] Downloading package punkt to /home/ubuntu/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# specify device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# checkpoint -> pretrained model
checkpoint = 'deepset/roberta-base-squad2'

In [18]:
processor = AutoTokenizer.from_pretrained("roberta-base")
encoder = RobertaModel.from_pretrained("roberta-base")

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/496M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/79.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

### Data Preparation

In [5]:
# define data_path for raw input and feature_path for feature input
data_path = 'Question_Answer_Dataset_v1.2'
feature_cache_path = 'Question_Answer_Dataset_v1.2/features'

In [15]:
# prepare data if not yet exist
class CustomAData(Dataset):
    '''
    Process raw data
    '''
    def __init__(self, file_dir, model, tokenzier, k=1):
        self.file = file_dir
        self.article_name = []
        self.questions = []
        self.answers = []
        self.q_diffi = []
        self.a_diffi = []
        self.article_path = []
        self.context_nn = {}
        self.context = {} # only fill when load the dataset
        self.context_embed = {}

        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = model.to(device)
        self.tok = tokenzier
        model.eval()
        
        # get question answer pairs
        for div in ['S08', 'S09', 'S10']:
            skip = True
            qa_path = os.path.join(self.file, div, "question_answer_pairs.txt")
            num_lines = sum(1 for line in open(qa_path,'rb'))
            with open(qa_path, 'rb') as f:

                for line in tqdm(f, total=num_lines):

                    if skip:
                        skip = False # skip the first line
                        continue

                    try: # only continue if the decoding is valid for utf-8
                        row = line.decode().split('\t')
                    except:
                        continue

                    if "NULL" in row:
                        continue # if any feature does not exist -> skip

                    context_file = self.file + "/" + div + "/"+ row[5][:-1] + ".txt" # path to the context file
                    if not (os.path.exists(context_file) and os.path.isfile(context_file)): # otherwise context doesn't exist: invalid
                        continue

                    # only process document embedding when needed (article first found)
                    if row[0] not in self.context_embed.keys():
                        # check if context could be extracted
                        try:
                            with open(context_file, 'rb') as f:
                                curr_context = f.read().decode() # could be decoded, otherwise skip
                        except:
                            continue

                        curr_context = curr_context.split('Related Wikipedia Articles')[0] # ignore everything after Related articles
                        curr_context = curr_context.replace('\n',' ')
                        self.context[row[0]] = tokenize.sent_tokenize(curr_context)
                        # encode context and add to corresponding files
                        c_embed = []
                        for context in self.context[row[0]]:
                            enc = self.tok(context, max_length=512, padding='max_length', return_tensors="pt")
                            enc.to(device)
                            output = self.model(**enc)
                            c_embed.append(output.last_hidden_state)
                            self.context_embed[row[0]] = torch.cat(c_embed)

                    # get top-1 similar context
                    qa_enc = self.tok(context, max_length=512, padding='max_length', return_tensors="pt")
                    qa_enc.to(device)
                    qa_output = self.model.encoder(**qa_enc)
                    encoded_qa = qa_output.last_hidden_state # question and answer embedding
                    c_embed = self.context_embed[row[0]] # load the context embeddings
                    # compute knn score: dot product
                    scores = c_embed.matmul(encoded_qa)
                    nn = torch.argmax(scores)
                    # the text of the closest neighbor
                    self.context_nn[row[0]] = self.context[row[0]][nn]

                    # other info
                    self.article_name.append(row[0])
                    self.questions.append(row[1])
                    self.answers.append(row[2])
                    self.q_diffi.append(row[3]) # difficulty
                    self.a_diffi.append(row[4])
                    self.article_path.append(div + "/"+ row[5][:-1]) # get rid of '\n

        print("length of dataset: ", len(self.questions))

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        return self.questions[idx], self.answers[idx], self.context[self.article_name[idx]]

In [19]:
raw_dataset = CustomAData(data_path, encoder, processor)

  0%|                                                                                | 1/1715 [00:02<1:23:28,  2.92s/it]


OutOfMemoryError: CUDA out of memory. Tried to allocate 12.00 MiB. GPU 0 has a total capacty of 7.43 GiB of which 8.44 MiB is free. Including non-PyTorch memory, this process has 7.42 GiB memory in use. Of the allocated memory 7.33 GiB is allocated by PyTorch, and 12.09 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF