# Using BERT for Question Answering
### Credit to Google's Colab for providing code to help get started

In [1]:
import torch
import torchtext
from torchnlp import *
import transformers
from transformers import BertTokenizer, BertForQuestionAnswering, AdamW
from tqdm.notebook import tqdm

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [2]:
# Checkpoint to load. A model name is fetched from the online model repository;
# use this when you are running your own copy of the notebook.
bert_model = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# To load the model from a directory on disk instead (e.g. for the Microsoft Learn
# module, where all required files have been prepared for you), use:
# bert_model = './bert'

tokenizer = BertTokenizer.from_pretrained(bert_model)

model = BertForQuestionAnswering.from_pretrained(bert_model)
# The cells shown in this notebook only run inference, so make sure layers such as
# dropout are in evaluation mode.
model.eval()

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to what the transformers implementation used by
# default so the optimizer is configured the same way
# (NOTE(review): confirm the defaults against the installed transformers version).
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Kept as the last expression on purpose: moves the model to `device` and
# displays the model architecture as the cell output.
model.to(device)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [3]:
# Alternative question to try:
# question = "When did the British held American colonies declare their independence?"
question = "What did the American people declare their independence from?"

# Context passage the answer is extracted from. Adjacent string literals are
# joined by Python into a single string, so the value is one continuous paragraph.
answer_text = (
    "Independence Day, known colloquially as the Fourth of July, is a Federal Holiday in the "
    "United States which commemorates the ratification of the Declaration of Independence by "
    "the Second Continental Congress on July 4, 1776, establishing the United States of America. "
    "The Founding Father delegates of the Second Continental Congress declared that the Thirteen "
    "Colonies were no longer subject (and subordinate) to the monarch of Britain, King George III, "
    "and were now united, free, and independent states. "
    "The Congress voted to approve independence by passing the Lee Resolution on July 2 and "
    "adopted the Declaration of Independence two days later, on July 4."
)

# Encode question and context together as one sequence pair of token ids.
input_ids = tokenizer.encode(question, answer_text)

token_count = len(input_ids)
print('The input has a total of {:} tokens.'.format(token_count))

The input has a total of 130 tokens.


In [4]:
# Map every id back to its token string so the encoding can be inspected.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Print one "token  id" row per input position.
# The loop variable is called `token_id` (not `id`) so the builtin `id()` is not
# shadowed for the rest of the kernel session.
for token, token_id in zip(tokens, input_ids):
    row = '{:<12} {:>6,}'.format(token, token_id)

    # Surround every [SEP] row with blank lines to make it stand out.
    if token_id == tokenizer.sep_token_id:
        print('')
        print(row)
        print('')
    else:
        print(row)
    

[CLS]           101
what          2,054
did           2,106
the           1,996
american      2,137
people        2,111
declare      13,520
their         2,037
independence  4,336
from          2,013
?             1,029

[SEP]           102

independence  4,336
day           2,154
,             1,010
known         2,124
colloquially 23,992
as            2,004
the           1,996
fourth        2,959
of            1,997
july          2,251
,             1,010
is            2,003
a             1,037
federal       2,976
holiday       6,209
in            1,999
the           1,996
united        2,142
states        2,163
which         2,029
commemorates 25,530
the           1,996
ratification 27,369
of            1,997
the           1,996
declaration   8,170
of            1,997
independence  4,336
by            2,011
the           1,996
second        2,117
continental   6,803
congress      3,519
on            2,006
july          2,251
4             1,018
,             1,010
1776         13,96

In [5]:
# Position of the first `[SEP]` token: it closes segment A (the question).
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment A covers everything up to and including that [SEP] token itself;
# whatever follows belongs to segment B.
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# One segment id per token: 0 for segment A positions, 1 for segment B positions.
segment_ids = [0 if position <= sep_index else 1 for position in range(len(input_ids))]

# Sanity check: every input token must have a segment id.
assert len(segment_ids) == len(input_ids)

In [6]:
# Batch of one: the token ids and the matching segment ids (question vs. answer_text),
# created directly on the target device.
input_tensor = torch.tensor([input_ids], device=device)
segment_tensor = torch.tensor([segment_ids], device=device)

# Inference only: gradients are not needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(input_tensor,
                    token_type_ids=segment_tensor,
                    return_dict=True)

# Per-token scores for being the start / end of the answer span.
start_scores = outputs.start_logits
end_scores = outputs.end_logits


In [7]:
# Highest-scoring start and end positions, converted to plain Python ints for slicing.
# Start and end are chosen independently; if the end lands before the start the
# slice below is empty and the answer is an empty string.
answer_start = torch.argmax(start_scores).item()
answer_end = torch.argmax(end_scores).item()

# Let the tokenizer rebuild the text so WordPiece continuation pieces ("##...")
# are merged back into whole words instead of being printed as separate tokens.
answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end + 1])

print(f"Question: {question}")
print('Answer: "' + answer + '"')

Question: What did the American people declare their independence from?
Answer: "the monarch of britain"


# Trying other models / Trying larger text sizes
*Full disclosure: I am using the wiki of one of my favorite games, Project Zomboid, as the source for the large text input.*

In [8]:
# from transformers import LongformerTokenizer, LongformerForQuestionAnswering

# # Initialize Longformer tokenizer and model
# tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
# model = LongformerForQuestionAnswering.from_pretrained('allenai/longformer-base-4096')
# model.to(device)

In [9]:
# (Re)create the BERT question-answering tokenizer and model for this section.
# A model name is fetched from the online model repository.
bert_model = 'bert-large-uncased-whole-word-masking-finetuned-squad'

# To load the model from a directory on disk instead (e.g. for the Microsoft Learn
# module, where all required files have been prepared for you), use:
# bert_model = './bert'

tokenizer = BertTokenizer.from_pretrained(bert_model)

model = BertForQuestionAnswering.from_pretrained(bert_model)
# The cells shown in this notebook only run inference, so make sure layers such as
# dropout are in evaluation mode.
model.eval()

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# eps and weight_decay are pinned to what the transformers implementation used by
# default so the optimizer is configured the same way
# (NOTE(review): confirm the defaults against the installed transformers version).
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, eps=1e-6, weight_decay=0.0)

# Kept as the last expression on purpose: moves the model to `device` and
# displays the model architecture as the cell output.
model.to(device)

Some weights of the model checkpoint at bert-large-uncased-whole-word-masking-finetuned-squad were not used when initializing BertForQuestionAnswering: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 1024, padding_idx=0)
      (position_embeddings): Embedding(512, 1024)
      (token_type_embeddings): Embedding(2, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-23): 24 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
              (LayerNorm): LayerNorm((1024,)

In [10]:
# Own cell because it's a huge chunk of text. Testing for scalability
# (also it's my own built-in lore searcher for one of my favorite games).
question = "What do the zombies follow?"

# Context passage, written as adjacent string literals (one sentence each) that
# Python joins into a single string. A plain quoted literal cannot contain a raw
# line break, so the break that sat between "...stand up first." and
# "When the player dies..." is now an ordinary space.
# NOTE(review): whitespace-only differences should not change the tokenization;
# confirm the next cell still reports the same token count.
answer_text = (
    "A zombie is the player's main antagonist in Project Zomboid. "
    "These once human citizens of Knox Country roam the landscape in the thousands. "
    "They have an insatiable hunger for human flesh and will not hesitate to kill. "
    "The default zombies in Project Zomboid are inspired by George A. Romero's shambler zombies. "
    "The zombies can be heavily modified with the use of custom sandbox. "
    "Sandbox has many options for modifying the zombies, such as increasing or reducing their speed, primary senses, strength etc. "
    "Spawning: By default, zombies generally spawn in higher numbers at urban areas than rural areas. "
    "However, most areas won't have the exact same population. "
    "Some settlements will usually have larger zombie populations than others, such as Louisville. "
    "Downtown areas tend to have higher populations than suburban or outskirt areas. "
    "The population will increase and hit its peak after 30 days have passed. "
    "Some areas can also become more populated as time passes due to hordes migrating from other areas. "
    "Though sometimes the opposite may happen as zombies populating an area may wander away from the area too, such as from gunshots and other sounds from the metagame. "
    "Zombies can also spawn in enclosed spaces such as bathrooms or closets and ambush unsuspecting survivors. "
    "Behavior: Zombies feasting on an unfortunate victim. "
    "Zombies rely on their eyesight and hearing, they're especially drawn to noises such as radios, running vehicles and gunfire. "
    "If they hear a noise behind them, they will first look behind and turn around afterward. "
    "Zombies prioritize their pathfinding: the sight of prey attracts them first and foremost. "
    "With no human flesh in sight, noise is the next priority, regardless of whether it is man-made or not (e.g., thunder). "
    "Some zombies can be seen sitting against walls. "
    "This gives survivors an easier opportunity of killing that zombie or safely getting past, due to the zombie having to stand up first. "
    "When the player dies next to zombies, they will kneel down and begin to eat their corpse. "
    "Zombies can also occasionally be found feasting on a corpse, also giving survivors an easier chance of slipping past or killing the zombie(s). "
    "Hordes: Horde movement. "
    "Zombies tend to roam in hordes, with one of those zombies being the designated horde leader. "
    "The zombies in that horde will follow that leader around. "
    "Zombie hordes are the most dangerous, and are best avoided at any stage of the game."
)

In [11]:
# The model has a fixed number of position embeddings, so a longer sequence
# cannot be fed to it. Read that limit from the loaded model's config.
max_tokens = model.config.max_position_embeddings

# Encode question + context as one sequence pair. If the pair is too long, only
# the context (second sequence) is truncated so the question is always kept whole.
# Inputs that already fit are encoded exactly as before.
input_ids = tokenizer.encode(
    question,
    answer_text,
    truncation='only_second',
    max_length=max_tokens,
)

print('The input has a total of {:} tokens.'.format(len(input_ids)))

The input has a total of 506 tokens.


In [12]:
# Map every id back to its token string so the encoding can be inspected.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# Print one "token  id" row per input position.
# The loop variable is called `token_id` (not `id`) so the builtin `id()` is not
# shadowed for the rest of the kernel session.
for token, token_id in zip(tokens, input_ids):
    row = '{:<12} {:>6,}'.format(token, token_id)

    # Surround every [SEP] row with blank lines to make it stand out.
    if token_id == tokenizer.sep_token_id:
        print('')
        print(row)
        print('')
    else:
        print(row)

[CLS]           101
what          2,054
do            2,079
the           1,996
zombies      14,106
follow        3,582
?             1,029

[SEP]           102

a             1,037
zombie       11,798
is            2,003
the           1,996
player        2,447
'             1,005
s             1,055
main          2,364
antagonist   17,379
in            1,999
project       2,622
z             1,062
##om          5,358
##bo          5,092
##id          3,593
.             1,012
these         2,122
once          2,320
human         2,529
citizens      4,480
of            1,997
knox         11,994
country       2,406
roam         25,728
the           1,996
landscape     5,957
in            1,999
the           1,996
thousands     5,190
.             1,012
they          2,027
have          2,031
an            2,019
ins          16,021
##ati        10,450
##able        3,085
hunger        9,012
for           2,005
human         2,529
flesh         5,771
and           1,998
will          2,09

In [13]:
# Locate the first `[SEP]` token; it marks the end of segment A (the question).
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment A includes the [SEP] token itself, segment B is the remainder.
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# Build the 0/1 list position by position: 0 inside segment A, 1 inside segment B.
segment_ids = [int(position >= num_seg_a) for position, _ in enumerate(input_ids)]

# Sanity check: there must be exactly one segment id per input token.
assert len(segment_ids) == len(input_ids)

In [14]:
# Batch of one: the token ids and the matching segment ids (question vs. answer_text),
# created directly on the target device.
input_tensor = torch.tensor([input_ids], device=device)
segment_tensor = torch.tensor([segment_ids], device=device)

# Inference only: gradients are not needed, so skip autograd bookkeeping.
with torch.no_grad():
    outputs = model(input_tensor,
                    token_type_ids=segment_tensor,
                    return_dict=True)

# Per-token scores for being the start / end of the answer span.
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [15]:
# Highest-scoring start and end positions, converted to plain Python ints for slicing.
# Start and end are chosen independently; if the end lands before the start the
# slice below is empty and the answer is an empty string.
answer_start = torch.argmax(start_scores).item()
answer_end = torch.argmax(end_scores).item()

# Let the tokenizer rebuild the text: this context contains words that are split
# into WordPiece pieces (e.g. "z ##om ##bo ##id"), which a plain ' '.join would
# print verbatim instead of merging back into one word.
answer = tokenizer.convert_tokens_to_string(tokens[answer_start:answer_end + 1])

print(f"Question: {question}")
print('Answer: "' + answer + '"')

Question: What do the zombies follow?
Answer: "the designated horde leader"
