In [2]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer
import pandas as pd
import numpy as np

In [3]:
#load the question-answering dataset; later cells read its "question", "support" and "correct_answer" columns
train_path = 'train.json'
data = pd.read_json(train_path)

In [4]:
#the model and its tokenizer are loaded from the same checkpoint, so the name is written once
checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'
model = BertForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [5]:
#draw one random row number, then read that row's question, supporting text and reference answer
random_num = np.random.randint(0, len(data))
question, text, answer = (
    data[column][random_num]
    for column in ("question", "support", "correct_answer")
)

In [6]:
#encode the question and its supporting text together as a single sequence pair
input_ids = tokenizer.encode(question, text)
num_tokens = len(input_ids)
print("The input has a total of {} tokens.".format(num_tokens))

The input has a total of 125 tokens.


In [7]:
#string form of every input id, printed next to the id it came from
tokens = tokenizer.convert_ids_to_tokens(input_ids)
#the loop variable is deliberately not called `id`: at notebook level it would
#shadow the builtin id() for every cell that runs afterwards
for token, token_id in zip(tokens, input_ids):
    print('{:8}{:8,}'.format(token, token_id))

[CLS]        101
what       2,054
lengths   10,742
are        2,024
positive   3,893
for        2,005
con        9,530
##ver      6,299
##ging     4,726
lens      10,014
and        1,998
negative   4,997
for        2,005
diver     17,856
##ging     4,726
lens      10,014
?          1,029
[SEP]        102
-          1,011
for        2,005
lenses    15,072
,          1,010
the        1,996
distance   3,292
from       2,013
the        1,996
center     2,415
of         1,997
the        1,996
lens      10,014
to         2,000
the        1,996
focus      3,579
is         2,003
.          1,012
focal     15,918
lengths   10,742
are        2,024
positive   3,893
for        2,005
con        9,530
##ver      6,299
##ging     4,726
lens      10,014
and        1,998
negative   4,997
for        2,005
diver     17,856
##ging     4,726
lens      10,014
.          1,012
the        1,996
distance   3,292
from       2,013
the        1,996
center     2,415
of         1,997
the        1,996
lens      10,0

In [8]:
#first occurrence of the [SEP] token, which closes segment A (the question)
sep_idx = input_ids.index(tokenizer.sep_token_id)
#segment A includes that [SEP], so its length is the 0-based index plus one
num_seg_a = sep_idx + 1
#everything after the first [SEP] belongs to segment B (the text)
num_seg_b = len(input_ids) - num_seg_a

print("SEP token index: ", sep_idx)
print("Number of tokens in segment A: ", num_seg_a)
print("Number of tokens in segment B: ", num_seg_b)

#one segment id per token: 0 up to and including the first [SEP], 1 afterwards
segment_ids = [0 if position <= sep_idx else 1 for position in range(len(input_ids))]
#making sure that every input token has a segment id
assert len(segment_ids) == len(input_ids)

SEP token index:  17
Number of tokens in segment A:  18
Number of tokens in segment B:  107


In [9]:
#token input_ids to represent the input and token segment_ids to differentiate our segments - question and text
#this is inference only, so gradient tracking is switched off to avoid building an unused autograd graph
with torch.no_grad():
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

In [10]:
#tokens with highest start and end scores, converted to plain ints so they can be used as list indices
answer_start = int(torch.argmax(output.start_logits))
answer_end = int(torch.argmax(output.end_logits))

print("\nQuestion:\n{}".format(question.capitalize()))
if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end+1])
    print("\nAnswer:\n{}.".format(answer.capitalize()))
else:
    #an end position before the start position is not a usable span; report that and
    #print no "Answer:" section, because `answer` would still hold the dataset's
    #reference answer and would be shown as if the model had predicted it
    print("I am unable to find the answer to this question. Can you please ask another question?")


Question:
What lengths are positive for converging lens and negative for diverging lens?

Answer:
Focal lengths.


In [11]:
#rebuild the answer text from its tokens: a token starting with "##" continues the
#previous word and is appended without the marker; any other token starts a new word
answer = tokens[answer_start]
for i in range(answer_start + 1, answer_end + 1):
    piece = tokens[i]
    answer += piece[2:] if piece.startswith("##") else " " + piece

In [12]:
def question_answer(question, text):
    """Print the answer span that the question-answering model extracts from `text`.

    The question and the passage are encoded as one sequence pair and passed
    through the notebook-level `model`. The tokens between the highest-scoring
    start and end positions are then stitched back into readable text.

    Args:
        question: The question to ask about the passage.
        text: The passage in which to look for the answer.

    Returns:
        None. The predicted answer is printed. A fallback message is printed
        instead when the predicted end comes before the predicted start, or
        when the span begins at the [CLS] token.
    """
    fallback = "Unable to find the answer to your question."

    #tokenize question and text as a pair; only the text is shortened when the pair
    #is longer than the number of positions the model supports
    input_ids = tokenizer.encode(
        question,
        text,
        max_length=model.config.max_position_embeddings,
        truncation="only_second",
    )

    #string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    #segment IDs
    #first occurrence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)
    #number of tokens in segment A (question), including that [SEP]
    num_seg_a = sep_idx + 1
    #number of tokens in segment B (text)
    num_seg_b = len(input_ids) - num_seg_a

    #list of 0s and 1s for segment embeddings
    segment_ids = [0] * num_seg_a + [1] * num_seg_b
    assert len(segment_ids) == len(input_ids)

    #model output using input_ids and segment_ids; inference only, so no gradients are tracked
    with torch.no_grad():
        output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    #most likely start and end positions, as plain ints usable for indexing
    answer_start = int(torch.argmax(output.start_logits))
    answer_end = int(torch.argmax(output.end_logits))

    #start from the fallback so `answer` is always defined, even when the
    #predicted end precedes the predicted start
    answer = fallback
    if answer_end >= answer_start:
        #reconstructing the answer: "##" pieces continue the previous word
        answer = tokens[answer_start]
        for i in range(answer_start + 1, answer_end + 1):
            if tokens[i][0:2] == "##":
                answer += tokens[i][2:]
            else:
                answer += " " + tokens[i]

    #a span that begins at [CLS] is treated as "no answer found"
    if answer.startswith("[CLS]"):
        answer = fallback

    print("\nPredicted answer:\n{}".format(answer.capitalize()))

In [17]:
#ask for the passage first, then the question, and print the model's predicted answer
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
question_answer(question=question, text=text)



Predicted answer:
Brf and br2


In [14]:
#draw another random row and show its question, supporting text and reference answer
random_num = np.random.randint(0, len(data))
question, text, answer = (
    data[column][random_num]
    for column in ("question", "support", "correct_answer")
)
print(question, text, answer, sep="\n")

What are the two possible reduction products for brf3?
redox reaction will occur. The only question is whether lead will be oxidized to Pb(II) or Pb(IV). Because BrF3 is a powerful oxidant and fluorine is able to stabilize high oxidation states of other elements, it is likely that PbF4 will be the product. The two possible reduction products for BrF3 are BrF and Br2. The actual product will likely depend on the ratio of the reactants used. With excess BrF3, we expect the more oxidized product (BrF). With lower ratios of oxidant to lead, we would probably obtain Br2 as the product. Exercise Predict the products of each reaction and write a balanced chemical equation for each reaction.
brf and br2
