In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Read

In [2]:
import string
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [3]:
df_test = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/Ultramarine_NLP_project/Datasets/podcasts_edits.csv')

In [4]:
df_test.head()

Unnamed: 0,questions,answers,context
0,How much time before there's regular travel ba...,I think it's going to take a while to build a ...,Well I think it's going to take a while to bui...
1,There's you ever see the New York Times articl...,I don't know.,"I don't know. Yeah, there was a New York Times..."
2,Do you think that they would want us to know o...,I don't know,I don't know of any real civilization. They su...
3,Even though you're thinking about interplaneta...,"No I mean if they show up I'm like, great, OK,...","No I mean if they show up I'm like, great, OK,..."


In [5]:
answers = df_test.answers
contexts = df_test.context
questions = df_test.questions

## Prepare

In [6]:
def add_idx(answers, contexts):
    # loop through each answer-context pair
    start_positions = []
    end_positions = []
    for answer, context in zip(answers, contexts):
        # gold_text refers to the answer we are expecting to find in context
        gold_text = answer
        start_idx = context.find(gold_text)
        # and ideally this would be the end index...
        end_idx = start_idx + len(gold_text)

        start_positions.append(start_idx)
        end_positions.append(end_idx)
    
    return start_positions, end_positions

In [7]:
start_positions, end_positions = add_idx(answers, contexts)

In [8]:
start_positions, end_positions

([5, 0, 0, 0], [68, 13, 12, 75])

## Encode

In [9]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 6.9 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 34.5 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 94.8 MB/s 
[?25hCollecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 84.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.2.0-py3-none-any.whl (61 kB)
[K     |████████████████████████████████| 61 kB 648 kB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transfor

In [10]:
from transformers import BertTokenizerFast
tokenizer = BertTokenizerFast.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

test_encodings = tokenizer(contexts.to_list(), questions.to_list(), truncation=True, padding=True)

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/443 [00:00<?, ?B/s]

In [11]:
def add_token_positions(encodings, answers, start_chars=start_positions, end_chars=end_positions):
    # initialize lists to contain the token indices of answer start/end
    start_positions = []
    end_positions = []
    input_offsets = []
    for i in range(len(answers)):
        # append start/end token position using char_to_token method
        start_positions.append(encodings.char_to_token(i, start_chars[i]))
        end_positions.append(encodings.char_to_token(i, end_chars[i]))

        # if start position is None, the answer passage has been truncated
        if start_positions[-1] is None:
            start_positions[-1] = tokenizer.model_max_length
        # end position cannot be found, char_to_token found space, so shift one token forward
        go_back = 1
        while end_positions[-1] is None:
            end_positions[-1] = encodings.char_to_token(i, end_chars[i]-go_back)
            go_back +=1
    # update our encodings object with the new token-based start/end positions
    encodings.update({'start_positions': start_positions, 'end_positions': end_positions})

# apply function to our data
add_token_positions(test_encodings, answers.to_list())

In [12]:
test_encodings

{'input_ids': [[101, 2092, 1045, 2228, 2009, 1005, 1055, 2183, 2000, 2202, 1037, 2096, 2000, 3857, 1037, 2613, 10585, 2008, 1996, 2613, 1997, 1996, 11207, 2008, 2428, 5609, 2003, 2065, 2057, 1005, 2128, 2893, 2627, 1996, 2307, 11307, 2003, 2079, 2057, 2031, 2438, 4219, 2006, 7733, 2107, 2008, 2065, 1996, 2065, 1996, 25516, 2015, 2013, 3011, 2644, 2746, 1010, 2017, 2071, 5788, 1029, 3398, 1012, 2061, 2008, 2064, 2069, 2022, 2074, 4394, 2028, 2210, 2518, 1012, 2017, 1005, 1040, 2022, 2066, 2017, 1005, 2128, 2006, 1037, 2146, 2712, 8774, 1998, 1996, 2069, 2518, 2017, 1005, 2128, 4394, 2003, 17663, 1039, 1012, 7910, 1010, 2009, 1005, 1055, 2069, 1037, 3043, 1997, 2051, 1010, 2017, 2113, 1012, 3398, 1012, 1998, 2216, 2064, 2022, 14694, 1012, 2061, 2017, 1005, 2310, 2288, 2000, 2031, 2035, 1996, 2477, 4072, 2000, 15770, 10585, 2006, 7733, 1012, 1998, 1996, 3114, 2008, 2216, 3719, 3030, 2746, 2071, 2022, 1012, 2088, 2162, 2093, 1010, 2030, 2009, 2071, 2022, 2349, 2000, 1037, 4030, 6689, 1997,

---

# PyTorch implementation

In [13]:
import torch

class SquadDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

test_dataset = SquadDataset(test_encodings)

In [14]:
from transformers import BertForQuestionAnswering
model = BertForQuestionAnswering.from_pretrained("bert-large-uncased-whole-word-masking-finetuned-squad")

Downloading:   0%|          | 0.00/1.25G [00:00<?, ?B/s]

In [15]:
from torch.utils.data import DataLoader
from transformers import AdamW
from tqdm import tqdm

# setup GPU/CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
# move model over to detected device
model.to(device)
# # activate training mode of model
# model.train()
# # initialize adam optimizer with weight decay (reduces chance of overfitting)
# optim = AdamW(model.parameters(), lr=5e-5)

# # initialize data loader for training data
# train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)

# for epoch in range(3):
#     # set model to train mode
#     model.train()
#     # setup loop (we use tqdm for the progress bar)
#     loop = tqdm(train_loader, leave=True)
#     for batch in loop:
#         # initialize calculated gradients (from prev step)
#         optim.zero_grad()
#         # pull all the tensor batches required for training
#         input_ids = batch['input_ids'].to(device)
#         attention_mask = batch['attention_mask'].to(device)
#         start_positions = batch['start_positions'].to(device)
#         end_positions = batch['end_positions'].to(device)
#         # train model on batch and return outputs (incl. loss)
#         outputs = model(input_ids, attention_mask=attention_mask,
#                         start_positions=start_positions,
#                         end_positions=end_positions)
#         # extract loss
#         loss = outputs[0]
#         # calculate loss for every parameter that needs grad update
#         loss.backward()
#         # update parameters
#         optim.step()
#         # print relevant info to progress bar
#         loop.set_description(f'Epoch {epoch}')
#         loop.set_postfix(loss=loss.item())

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,), eps=1e-12,

In [16]:
# switch model out of training mode
model.eval()

#val_sampler = SequentialSampler(val_dataset)
test_loader = DataLoader(test_dataset, batch_size=16)

acc = []

# initialize loop for progress bar
loop = tqdm(test_loader)
# loop through batches
for batch in loop:
    # we don't need to calculate gradients as we're not training
    with torch.no_grad():
        # pull batched items from loader
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_true = batch['start_positions'].to(device)
        end_true = batch['end_positions'].to(device)
        # make predictions
        outputs = model(input_ids, attention_mask=attention_mask)
        # pull preds out
        start_pred = torch.argmax(outputs['start_logits'], dim=1)
        end_pred = torch.argmax(outputs['end_logits'], dim=1)
        # calculate accuracy for both and append to accuracy list
        acc.append(((start_pred == start_true).sum()/len(start_pred)).item())
        acc.append(((end_pred == end_true).sum()/len(end_pred)).item())
# calculate average accuracy in total
acc = sum(acc)/len(acc)
acc

100%|██████████| 1/1 [00:00<00:00,  3.24it/s]


0.0

In [17]:
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	2	16
pred	0	0

true	1	6
pred	109	117

true	1	5
pred	115	116

true	1	22
pred	0	0



In [18]:
for i in range(len(start_true)):
  all_tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
  print('<------->')
  print(f'Q: {questions[i]}')
  print(f'Pred A: {" ".join(all_tokens[start_pred[i]:end_pred[i]+1])}')

<------->
Q: How much time before there's regular travel back and forth to Mars?
Pred A: [CLS]
<------->
Q: There's you ever see the New York Times article that came out in twenty seventeen about the stuff?
Pred A: i think i would know if there were aliens
<------->
Q: Do you think that they would want us to know or do you think they would just be observing and making sure we don't blow ourselves up, would we?
Pred A: very subtle
<------->
Q: Even though you're thinking about interplanetary travel, you don't really think about aliens?
Pred A: [CLS]


# Implementing weights based on location

In [19]:
answers = outputs['start_logits'].shape[0]
indexes = outputs['start_logits'].shape[1]

In [20]:
softmax = torch.nn.Softmax(dim=1)
start_soft = softmax(outputs['start_logits'])
end_soft = softmax(outputs['end_logits'])

In [21]:
weighted_indexes = np.zeros([answers, indexes, indexes])

In [22]:
lowest_weight = 0.5
weights = np.linspace(1, lowest_weight, num=indexes)

In [23]:
for i, positions in enumerate(zip(start_soft, end_soft)):
  start_pos = positions[0]
  end_pos = positions[1]
  for start_index, start_token in enumerate(start_pos):
    for end_index, end_token in enumerate(end_pos):
      if start_index <= end_index:
        weighted_indexes[i,start_index,end_index] = ((start_token.item() + end_token.item())/2) * weights[start_index]

In [24]:
start_pred = []
end_pred = []
for i in range(weighted_indexes.shape[0]):
  max_start = weighted_indexes[i].max(axis=1).argmax()
  max_end = weighted_indexes[i].argmax(axis=1)[max_start]
  start_pred.append(max_start)
  end_pred.append(max_end)

In [25]:
print("T/F\tstart\tend\n")
for i in range(len(start_true)):
    print(f"true\t{start_true[i]}\t{end_true[i]}\n"
          f"pred\t{start_pred[i]}\t{end_pred[i]}\n")

T/F	start	end

true	2	16
pred	0	0

true	1	6
pred	109	117

true	1	5
pred	115	116

true	1	22
pred	0	0



In [26]:
for i in range(len(start_true)):
  all_tokens = tokenizer.convert_ids_to_tokens(input_ids[i])
  print('<------->')
  print(f'Q: {questions[i]}')
  print(f'Pred A: {" ".join(all_tokens[start_pred[i]:end_pred[i]+1])}')

<------->
Q: How much time before there's regular travel back and forth to Mars?
Pred A: [CLS]
<------->
Q: There's you ever see the New York Times article that came out in twenty seventeen about the stuff?
Pred A: i think i would know if there were aliens
<------->
Q: Do you think that they would want us to know or do you think they would just be observing and making sure we don't blow ourselves up, would we?
Pred A: very subtle
<------->
Q: Even though you're thinking about interplanetary travel, you don't really think about aliens?
Pred A: [CLS]
