In [1]:
import numpy as np
import pandas as pd

import os
import re
import torch 
import transformers
import gzip

from collections import Counter, defaultdict
from itertools import islice
from transformers import BertTokenizer, BertModel, BertForQuestionAnswering
from models.QAheads import *
from models.utils import *
from utils import *

GPU is available


## Load data into memory

In [2]:
# load SubjQA_data into memory
subjqa_data_train = get_data(source='/SubjQA/', split='/train', domain='all')
subjqa_data_dev = get_data(source='/SubjQA/', split='/dev', domain='all')

In [3]:
# load SQuAD_data into memory
squad_data_train = get_data(source='/SQuAD/', split='train')
squad_data_dev = get_data(source='/SQuAD/', split='dev')

# create question, answer, support pairs for train and dev set
squad_q_train, squad_a_train, squad_s_train, _ = create_pairs(squad_data_train, bert=True)
squad_q_dev, squad_a_dev, squad_s_dev, _ = create_pairs(squad_data_dev, bert=True)

In [None]:
def tokenize_qas(questions:list, answers:list, contexts:list):
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    input_ids_all, token_type_ids_all = [], []
    for question, context in zip(questions, contexts):
        input_ids_current = tokenizer.encode(question, context)
        # TODO: figure out why line below is necessary
        token_type_ids_current = [0 if i <= input_ids_current.index(102) else 1 for i in range(len(input_ids_current))]
        input_ids_all.append(input_ids_current)
        token_type_ids_all.append(token_type_ids_current)

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_encoder = BertModel.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
linear_head = LinearQAHead()

In [3]:
question, text = "[CLS] Who was Jim Henson? [SEP]", "[CLS] Jim Henson was a nice puppet [SEP]"

In [4]:
question = '[CLS] ' + question + ' [SEP]'
text = '[CLS] ' + text + ' [SEP]'

In [5]:
input_ids = tokenizer.encode(question, text)
token_type_ids = [0 if i <= input_ids.index(102) else 1 for i in range(len(input_ids))]
bert_outputs = bert_encoder(torch.tensor([input_ids]), token_type_ids=torch.tensor([token_type_ids]))
start_scores, end_scores = linear_head(bert_outputs)

all_tokens = tokenizer.convert_ids_to_tokens(input_ids)
answer = ' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1])

#assert answer == "a nice puppet"

Bert out: (tensor([[[ 0.5671,  0.0449, -0.7326,  ..., -0.1442,  0.7058, -0.2816],
         [ 0.5673,  0.0451, -0.7315,  ..., -0.1434,  0.7059, -0.2817],
         [-0.1931,  0.0144,  0.4119,  ...,  0.6692,  0.5947,  0.0778],
         ...,
         [ 0.9417, -0.1896, -0.8671,  ..., -0.6603,  0.5811,  0.2788],
         [ 0.5672,  0.0470, -0.7302,  ..., -0.1424,  0.7063, -0.2825],
         [ 0.5667,  0.0504, -0.7241,  ..., -0.1377,  0.7082, -0.2846]]],
       grad_fn=<NativeLayerNormBackward>), tensor([[ 0.1509, -0.9996, -0.7013,  ...,  0.9989, -0.9997, -0.1417]],
       grad_fn=<TanhBackward>))

0 out shape: torch.Size([1, 19, 1024])
1 out shape: torch.Size([1, 1024])

Seq out: tensor([[[ 0.5671,  0.0449, -0.7326,  ..., -0.1442,  0.7058, -0.2816],
         [ 0.5673,  0.0451, -0.7315,  ..., -0.1434,  0.7059, -0.2817],
         [-0.1931,  0.0144,  0.4119,  ...,  0.6692,  0.5947,  0.0778],
         ...,
         [ 0.9417, -0.1896, -0.8671,  ..., -0.6603,  0.5811,  0.2788],
         [ 0.5672,

In [6]:
print(answer)

was a nice puppet [SEP] [SEP]


In [7]:
some_tensor = torch.ones(3,4,10)
print(some_tensor.shape)
print(some_tensor.view(-1, some_tensor.shape[-1]).shape)

torch.Size([3, 4, 10])
torch.Size([12, 10])
