In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
SQuAD = pd.read_json("https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v2.0.json")
SQuAD.head()

Unnamed: 0,version,data
0,v2.0,"{'title': 'Beyoncé', 'paragraphs': [{'qas': [{..."
1,v2.0,"{'title': 'Frédéric_Chopin', 'paragraphs': [{'..."
2,v2.0,{'title': 'Sino-Tibetan_relations_during_the_M...
3,v2.0,"{'title': 'IPod', 'paragraphs': [{'qas': [{'qu..."
4,v2.0,{'title': 'The_Legend_of_Zelda:_Twilight_Princ...


In [3]:
# SQuAD.head(1).values

In [4]:
# del  SQuAD['version']
cols = ['text', 'question', 'answer']

comp_list = []
for index, row in SQuAD.iterrows():
    for i in range(len(row['data']['paragraphs'])):
        for j in (row['data']['paragraphs'][i]['qas']):
            temp_list = []
            temp_list.append(row["data"]["paragraphs"][i]["context"])
            temp_list.append(j["question"])
            if j["answers"]:
                temp_list.append(j["answers"][0]["text"])
            else:
                temp_list.append("")
        comp_list.append(temp_list)
new_df = pd.DataFrame(comp_list,columns=cols)

In [5]:
new_df.to_csv("SQuAD_data.csv", index=False)
data = pd.read_csv("SQuAD_data.csv")
data.head()

Unnamed: 0,text,question,answer
0,Beyoncé Giselle Knowles-Carter (/biːˈjɒnseɪ/ b...,What was the name of Beyoncé's first solo album?,Dangerously in Love
1,Following the disbandment of Destiny's Child i...,What is the name of Beyoncé's alter-ego?,Sasha Fierce
2,"A self-described ""modern-day feminist"", Beyonc...",What magazine named Beyoncé as the most powerf...,Forbes
3,"Beyoncé Giselle Knowles was born in Houston, T...",Beyoncé was raised in what religion?,Methodist
4,Beyoncé attended St. Mary's Elementary School ...,What choir did Beyoncé sing in for two years?,St. John's United Methodist Church


In [6]:
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

Downloading (…)lve/main/config.json: 100%|██████████| 443/443 [00:00<00:00, 69.4kB/s]
Downloading (…)"pytorch_model.bin";: 100%|██████████| 1.34G/1.34G [00:23<00:00, 57.7MB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 714kB/s]
Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 10.4kB/s]


In [7]:
random_num = np.random.randint(0, len(data))
question = data["question"][random_num]
text = data["text"][random_num]

In [8]:
!mkdir data
!mv SQuAD_data.csv data/

mkdir: cannot create directory ‘data’: File exists


In [9]:
question

'What type of wood can hold four times as much of a load when dried?'

In [10]:
text

'Drying produces a decided increase in the strength of wood, particularly in small specimens. An extreme example is the case of a completely dry spruce block 5 cm in section, which will sustain a permanent load four times as great as a green (undried) block of the same size will.'

In [11]:
input_ids = tokenizer.encode(question, text)
print("The input has a total of {} tokens.".format(len(input_ids)))

The input has a total of 76 tokens.


In [12]:
tokens = tokenizer.convert_ids_to_tokens(input_ids)

for token, id in zip(tokens, input_ids):
    print("{:8}{:8}".format(token, id))

[CLS]        101
what        2054
type        2828
of          1997
wood        3536
can         2064
hold        2907
four        2176
times       2335
as          2004
much        2172
of          1997
a           1037
load        7170
when        2043
dried       9550
?           1029
[SEP]        102
drying     17462
produces    7137
a           1037
decided     2787
increase    3623
in          1999
the         1996
strength    3997
of          1997
wood        3536
,           1010
particularly    3391
in          1999
small       2235
specimens    9908
.           1012
an          2019
extreme     6034
example     2742
is          2003
the         1996
case        2553
of          1997
a           1037
completely    3294
dry         4318
spruce     19893
block       3796
5           1019
cm          4642
in          1999
section     2930
,           1010
which       2029
will        2097
sustain    15770
a           1037
permanent    4568
load        7170
four        2176
times 

In [13]:
# first occurence of [SEP] token
sep_idx = input_ids.index(tokenizer.sep_token_id)
print(sep_idx)

# number of tokens in segment A - question
num_seg_a = sep_idx + 1
print(num_seg_a)

#number of tokens in segment B - text
num_seg_b = len(input_ids) - num_seg_a
print(num_seg_b)

segment_ids = [0]*num_seg_a + [1]*num_seg_b
print(segment_ids)

assert len(segment_ids) == len(input_ids)

17
18
58
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [14]:
output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))
print(output.start_logits, output.end_logits)

tensor([[-6.5570, -3.6428, -7.7892, -7.3022, -7.3378, -8.5929, -8.7354, -9.1374,
         -9.4038, -9.1775, -9.1694, -8.8319, -8.7167, -9.0042, -8.4514, -7.9351,
         -9.3174, -6.5569, -2.8911, -7.0641, -6.9028, -6.0249, -5.9022, -7.9937,
         -6.0126, -5.4645, -7.3554, -3.3017, -7.5972, -6.2685, -6.5429, -4.4986,
         -5.5158, -6.5569, -3.5121, -5.0184, -4.9290, -5.7100, -3.9372, -4.7136,
         -3.2312,  4.5708,  3.3033,  2.7333,  8.1562,  1.0806, -2.0258, -4.9256,
         -5.5844, -3.1432, -5.1337, -4.0815, -5.0982, -5.6637, -6.1958, -6.1446,
         -5.5061, -3.2329, -6.0126, -7.2608, -6.5485, -6.3367, -0.3333,  1.1320,
         -4.6226, -2.1368, -6.4142, -5.7904, -3.1184, -6.5893, -6.2802, -6.0601,
         -4.3446, -6.4494, -7.3229, -6.5571]], grad_fn=<CloneBackward0>) tensor([[-1.4395, -1.9131, -5.6148, -4.9678, -6.0605, -7.6445, -7.5396, -7.1709,
         -7.2522, -7.6118, -7.5136, -7.4072, -7.6763, -6.6284, -8.0881, -6.2145,
         -6.6987, -1.4394, -2.4799, 

In [15]:
answer_start = torch.argmax(output.start_logits)
answer_end = torch.argmax(output.end_logits)
print (answer_start, answer_end)

if answer_end >= answer_start:
    answer = " ".join(tokens[answer_start:answer_end + 1])
else:
    print("I am unable to answer your question? Please pardon")

print("Text: \n{}".format(text.capitalize()))
print("\nQuestion: \n{}".format(question.capitalize()))
print("\nAnswer: \n{}".format(answer.capitalize()))

tensor(44) tensor(44)
Text: 
Drying produces a decided increase in the strength of wood, particularly in small specimens. an extreme example is the case of a completely dry spruce block 5 cm in section, which will sustain a permanent load four times as great as a green (undried) block of the same size will.

Question: 
What type of wood can hold four times as much of a load when dried?

Answer: 
Spruce


In [16]:
answer = tokens[answer_start]

for i in range(answer_start+1, answer_end):
    if tokens[i][0:2] == "##":
        answer +=tokens[i][2:]
    else:
        answer += " " + tokens[i]

answer

'spruce'

In [17]:
def question_answer(question, text):

    # tokenize question and text in ids as a pair
    input_ids = tokenizer.encode(question, text)

    # string version of tokenized ids
    tokens = tokenizer.convert_ids_to_tokens(input_ids)

    # segment IDs
    # first occurence of [SEP] token
    sep_idx = input_ids.index(tokenizer.sep_token_id)

    # number of tokens in segment A - qustion 
    num_seg_a = sep_idx + 1

    # number of tokens in segment B - text
    num_seg_b = len(input_ids) - num_seg_a

    # list of 0s and 1s
    segment_ids =[0]*num_seg_a + [1]*num_seg_b

    assert len(segment_ids) == len(input_ids)

    # model output using input ids and sedment ids
    output = model(torch.tensor([input_ids]), token_type_ids=torch.tensor([segment_ids]))

    # reconstructing the answer
    answer_start = torch.argmax(output.start_logits)
    answer_end = torch.argmax(output.end_logits)

    if answer_end >= answer_start:
        answer = tokens[answer_start]
        for i in range(answer_start+1, answer_end+1):
            if tokens[i][0:2] == "##":
                answer = ""
            else:
                answer += " " + tokens[i]

    if answer.startswith("[CLS]"):
        answer = "Unable to find the answer to your question"
    
    print("\nAnswer: \n{}".format(answer.capitalize()))


In [23]:
text = """Shukran Dental Clinic is a leading provider of dental care services in Kenya, with locations in Eldoret and Nairobi. Our team of experienced and highly qualified dental professionals is dedicated to providing top-notch care to our patients in a warm, welcoming, and comfortable environment.

We offer a wide range of dental services, including routine cleanings, fillings, extractions, crowns and bridges, orthodontics, and cosmetic procedures such as teeth whitening and veneers. Our state-of-the-art facilities and equipment allow us to provide efficient and effective treatments for all of our patients.

The cost of our services varies depending on the specific procedure, but we strive to make our care as affordable as possible. Our staff will work with you to create a customized treatment plan that fits your budget, and we accept a variety of payment methods, including most major insurance plans.

We accept patients with insurance coverage from the following insurance agencies:
AAR Healthcare
Britam Insurance
Jubilee Insurance
Sanlam Kenya
CIC Insurance
At Shukran Dental Clinic, our goal is to help our patients achieve and maintain optimal oral health. We believe in educating our patients about the importance of oral health and how they can take care of their teeth and gums at home. Contact us today to schedule an appointment and learn more about our dental services.
"""

question = "What is the name of the clinic?"
 
question_answer(question, text)


Answer: 
 dental clinic


In [40]:
text = input("Please enter your text: \n")
question = input("\nPlease enter your question: \n")
 
while True:
   question_answer(question, text)
  
   flag = True
   flag_N = False
  
   while flag:
       response = input("\nDo you want to ask another question based on this text (Y/N)? ")
       if response[0] == "Y":
           question = input("\nPlease enter your question: \n")
           flag = False
       elif response[0] == "N":
           print("\nBye!")
           flag = False
           flag_N = True
          
   if flag_N == True:
       break


Answer: 
 town


IndexError: string index out of range