In [0]:
!pip install transformers
!pip install wget



In [0]:
import wget
import os
import json
import torch
import numpy as np
import pandas as pd
from transformers import BertModel, BertTokenizer, BertForQuestionAnswering
#from torch.utils.data import Dataset, DataLoader

In [0]:
def SQuAD_organize(SQuAD):

	"""
	takes the SQuAD json file and organizes the important parts into a list of lists
	impossible questions are "answered" with an empty string
	"""

	SQuAD_list_of_lists = []

	for i, subject in enumerate(SQuAD['data']):													#limited for testing purposes
		for paragraph in SQuAD['data'][i]['paragraphs'][0:2]:									#limited for testing purposes
			#find the context (paragraph)
			the_context = paragraph['context']
			for j, questions in enumerate(SQuAD['data'][i]['paragraphs']):	#limited for testing purposes
				for question in questions['qas'][0:2]:															#limited for testing purposes
					#find the question
					the_question = question['question']
					if question['answers'] != []:
						#find the answer (label)
						the_answer = question['answers'][0]['text']
					else:
						#or an empty string for impossible questions
						the_answer = ''
					row = [the_context.lower(), the_question.lower(), the_answer.lower()]
					SQuAD_list_of_lists.append(row)

	SQuAD_df = pd.DataFrame(SQuAD_list_of_lists, columns=['context', 'question', 'answer'])

	return SQuAD_df

In [0]:
#download the dataset from the github repository of the webinar
url_train = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json'
url_dev = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json'
if not os.path.exists('./train-v2.0.json'):
  wget.download(url_train, './train-v2.0.json')
if not os.path.exists('./dev-v2.0.json'):
  wget.download(url_dev, './dev-v2.0.json')

with open('train-v2.0.json', 'r') as json_train:
	SQuAD_train = json.load(json_train, encoding='utf-8')
with open('dev-v2.0.json', 'r') as json_val:
	SQuAD_val = json.load(json_val, encoding='utf-8')
 
SQuAD_train_df = SQuAD_organize(SQuAD_train)
SQuAD_val_df = SQuAD_organize(SQuAD_val)
SQuAD_train_df.head()

Unnamed: 0,context,question,answer
0,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,when did beyonce start becoming popular?,in the late 1990s
1,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,what areas did beyonce compete in when she was...,singing and dancing
2,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,"after her second solo album, what other entert...",acting
3,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,which artist did beyonce marry?,jay z
4,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,"in her music, what are some recurring elements...","love, relationships, and monogamy"


In [0]:
#customize tokenizer (this is just to confirm that it's possible)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
#extra_tokens = ['digital', 'virtual', 'blockchain', 'internet', 'software'] #this is just until we decide what the real list will be
#tokenizer.add_tokens(extra_tokens)

In [0]:
from torch.utils.data import Dataset, DataLoader

In [0]:
class LoadDataset(Dataset):

  def __init__(self, df, tokenizer, maxlen=128):
    self.df = df
    self.tokenizer = tokenizer
    self.maxlen = maxlen

  #dataset class is required to return the max length of any given string
  def __len__(self):
    return len(self.df)

  #preprocessing and returning weights by using the data loader
  def __getitem__(self, index):
    context = self.df.loc[index, 'context']
    question = self.df.loc[index, 'question']
    label = self.df.loc[index, 'answer']

    #tokenization, special tokens, padding, attention mask, token ids
    tokens_q = self.tokenizer.tokenize(question)                          #tokenization
    tokens_q = ['[CLS]'] + tokens_q + ['[SEP]']                           #special tokens
    tokens_c = self.tokenizer.tokenize(context)
    tokens_c = tokens_c + ['[SEP]']
    tokens = tokens_q + tokens_c
    if len(tokens) < self.maxlen:                                         #padding
      tokens = tokens + ['[PAD]' for _ in range(self.maxlen - len(tokens))]
    else:
      tokens = tokens[:self.maxlen-1] + ['[SEP]']
    token_ids = self.tokenizer.convert_tokens_to_ids(tokens)              #token ids
    token_ids = torch.tensor(token_ids)
    
    attn_masks = (token_ids != 0).long()                                   #attention mask (if token id is not 0, return true as integer (1))
 
    segment_ids = [0 for _ in range(len(tokens_q))] + [1 for _ in range(self.maxlen-len(tokens_q))]
    segment_ids = torch.tensor(segment_ids)                               #segment ids (to distinguish the first passage from the second)


    return token_ids, attn_masks, segment_ids, label #this would also include the position tag if a next-sentence model

In [0]:
train_set = LoadDataset(df=SQuAD_train_df, tokenizer=tokenizer, maxlen=128)
val_set = LoadDataset(df=SQuAD_val_df, tokenizer=tokenizer, maxlen=128)
print(train_set[50][0].shape, train_set[50][1].shape, train_set[50][2].shape, train_set[50][0])

torch.Size([128]) torch.Size([128]) torch.Size([128]) tensor([  101, 20773,  2207,  1996,  2299,  1000,  4195,  1000,  2006,  2029,
         3784,  2189,  2326,  1029,   102, 20773, 21025, 19358, 22815,  1011,
         5708,  1006,  1013, 12170, 23432, 29715,  3501, 29678, 12325, 29685,
         1013, 10506,  1011, 10930,  2078,  1011,  2360,  1007,  1006,  2141,
         2244,  1018,  1010,  3261,  1007,  2003,  2019,  2137,  3220,  1010,
         6009,  1010,  2501,  3135,  1998,  3883,  1012,  2141,  1998,  2992,
         1999,  5395,  1010,  3146,  1010,  2016,  2864,  1999,  2536,  4823,
         1998,  5613,  6479,  2004,  1037,  2775,  1010,  1998,  3123,  2000,
         4476,  1999,  1996,  2397,  4134,  2004,  2599,  3220,  1997,  1054,
         1004,  1038,  2611,  1011,  2177, 10461,  1005,  1055,  2775,  1012,
         3266,  2011,  2014,  2269,  1010, 25436, 22815,  1010,  1996,  2177,
         2150,  2028,  1997,  1996,  2088,  1005,  1055,  2190,  1011,  4855,
         2

In [0]:
train_loader = DataLoader(train_set, batch_size=32, num_workers=5)
val_loader = DataLoader(val_set, batch_size=32, num_workers=5)

In [0]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
#model.resize_token_embeddings(len(tokenizer))

In [0]:
if torch.cuda.is_available():
  device = 'cuda'
  print("Using the GPU: " + torch.cuda.get_device_name(0))
else:
  device = 'cpu'
  print("No GPU available. Using CPU instead.")
print(device)

Using the GPU: Tesla P100-PCIE-16GB
cuda


In [0]:
#confirm that the GPU memory is available (this must be done or else I get a "RuntimeError: CUDA out of memory." message and the model doesn't train)
!pip install gputil
import GPUtil
GPUtil.showUtilization()

Collecting gputil
  Downloading https://files.pythonhosted.org/packages/ed/0e/5c61eedde9f6c87713e89d794f01e378cfd9565847d4576fa627d758c554/GPUtil-1.4.0.tar.gz
Building wheels for collected packages: gputil
  Building wheel for gputil (setup.py) ... [?25l[?25hdone
  Created wheel for gputil: filename=GPUtil-1.4.0-cp36-none-any.whl size=7410 sha256=4eb5789a7be2bec10862295fa26acd73ec66c962230b2f165cd1d71d62c77306
  Stored in directory: /root/.cache/pip/wheels/3d/77/07/80562de4bb0786e5ea186911a2c831fdd0018bda69beab71fd
Successfully built gputil
Installing collected packages: gputil
Successfully installed gputil-1.4.0
| ID | GPU | MEM |
------------------
|  0 |  0% |  0% |


In [0]:
eg_question = "Does Walmart need a software development project?"
eg_context = """
  Retailer Walmart China launched its first Blockchain Traceability Platform on June 25, allowing consumers to acquire more knowledge on products -- from sources, logistics and testing results -- and empowering suppliers to better safeguard food safety.
  The first batch of 23 products underwent testing and was introduced into the platform. The company will scale up to more than 100 merchandises by the second half of this year, covering over 10 categories, including fresh meat, vegetable, seafood and its self-developed private brands.
  By the end of 2020, Walmart China's traceability system will be able to track about half of the total packaged fresh meat, 40 percent of packaged vegetables and 12.5 percent of seafood.
  The blockchain traceability system was developed by PwC and VeChain. By scanning the products, consumers can acquire large amount of information, which cannot be tampered with once the data are written.
"""
token_ids = tokenizer.encode(eg_question, eg_context)
attn_masks = [0 if i <= token_ids.index(102) else 1 for i in range(len(token_ids))]
print(token_ids, attn_masks)
start_scores, end_scores = model(torch.tensor([token_ids]), token_type_ids=torch.tensor([attn_masks]))
#print(start_scores.shape, end_scores.shape)
all_tokens = tokenizer.convert_ids_to_tokens(token_ids)
print(' '.join(all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores)+1]))

[101, 2515, 24547, 22345, 2342, 1037, 4007, 2458, 2622, 1029, 102, 20196, 24547, 22345, 2859, 3390, 2049, 2034, 3796, 24925, 2078, 7637, 8010, 4132, 2006, 2238, 2423, 1010, 4352, 10390, 2000, 9878, 2062, 3716, 2006, 3688, 1011, 1011, 2013, 4216, 1010, 12708, 1998, 5604, 3463, 1011, 1011, 1998, 7861, 23948, 20141, 2000, 2488, 28805, 2833, 3808, 1012, 1996, 2034, 14108, 1997, 2603, 3688, 9601, 5604, 1998, 2001, 3107, 2046, 1996, 4132, 1012, 1996, 2194, 2097, 4094, 2039, 2000, 2062, 2084, 2531, 16359, 2015, 2011, 1996, 2117, 2431, 1997, 2023, 2095, 1010, 5266, 2058, 2184, 7236, 1010, 2164, 4840, 6240, 1010, 15415, 1010, 23621, 1998, 2049, 2969, 1011, 2764, 2797, 9639, 1012, 2011, 1996, 2203, 1997, 12609, 1010, 24547, 22345, 2859, 1005, 1055, 7637, 8010, 2291, 2097, 2022, 2583, 2000, 2650, 2055, 2431, 1997, 1996, 2561, 21972, 4840, 6240, 1010, 2871, 3867, 1997, 21972, 11546, 1998, 2260, 1012, 1019, 3867, 1997, 23621, 1012, 1996, 3796, 24925, 2078, 7637, 8010, 2291, 2001, 2764, 2011, 1052, 