<a href="https://colab.research.google.com/github/HirenRupchandani/Question_Answer_Model/blob/main/QAModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install transformers



In [6]:
from transformers import BertForQuestionAnswering, BertTokenizer

# The QA model and its tokenizer are both loaded from the same pretrained
# checkpoint, so the checkpoint name is spelled out only once.
checkpoint = 'bert-large-uncased-whole-word-masking-finetuned-squad'

model = BertForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = BertTokenizer.from_pretrained(checkpoint)

In [3]:
# A simple example: one question plus a short passage (a single tweet) that
# contains the answer. Both strings are fed to the tokenizer as a text pair
# in the next cell.
question = "How many medical teams were deployed?"
# NOTE(review): the characters after "aeria" look like a UTF-8 ellipsis that
# was decoded with the wrong codec; they are tokenized as-is below.
answer_text = "4 Army columns, Two Medical teams, one Engineering Task Force deployed at Ringi village. Army helicopters on aeriaâ€¦ https://t.co/Y1tGCRIOeZ"

In [4]:
# Encode the question and the passage together as a single text pair.
input_ids = tokenizer.encode(question, answer_text)

# Report how many token IDs the encoded pair contains.
token_count = len(input_ids)
print('The input has a total of {:} tokens.'.format(token_count))

The input has a total of 50 tokens.


In [5]:
# BERT only needs the token IDs, but to inspect the tokenizer's behavior we
# also recover the token strings and print both side by side.
tokens = tokenizer.convert_ids_to_tokens(input_ids)

# The loop variable is `token_id` (not `id`) so the builtin `id()` is not
# shadowed in the notebook's global namespace.
for token, token_id in zip(tokens, input_ids):

    # [SEP] tokens get a blank line before and after to make them stand out.
    is_sep = token_id == tokenizer.sep_token_id

    if is_sep:
        print('')

    # Print the token string and its ID in two columns.
    print('{:<12} {:>6,}'.format(token, token_id))

    if is_sep:
        print('')

[CLS]           101
how           2,129
many          2,116
medical       2,966
teams         2,780
were          2,020
deployed      7,333
?             1,029

[SEP]           102

4             1,018
army          2,390
columns       7,753
,             1,010
two           2,048
medical       2,966
teams         2,780
,             1,010
one           2,028
engineering   3,330
task          4,708
force         2,486
deployed      7,333
at            2,012
ring          3,614
##i           2,072
village       2,352
.             1,012
army          2,390
helicopters  12,400
on            2,006
ae           29,347
##ria         4,360
##a           2,050
##€          30,102
##¦          29,649
https        16,770
:             1,024
/             1,013
/             1,013
t             1,056
.             1,012
co            2,522
/             1,013
y             1,061
##1           2,487
##t           2,102
##gc         18,195
##rio         9,488
##ez          9,351

[SEP]           1

In [7]:
# Locate the first `[SEP]` token; it closes segment A (the question).
sep_index = input_ids.index(tokenizer.sep_token_id)

# Segment A covers everything up to and including that [SEP] token itself;
# segment B is whatever is left.
num_seg_a = sep_index + 1
num_seg_b = len(input_ids) - num_seg_a

# One segment id per position: 0 through the first [SEP], 1 afterwards.
segment_ids = [0 if position <= sep_index else 1
               for position in range(len(input_ids))]

# There should be a segment_id for every input token.
assert len(segment_ids) == len(input_ids)

In [8]:
# Run our example through the model.
import torch

# This is pure inference, so disable autograd tracking: no graph is built
# and the returned logits do not require gradients.
with torch.no_grad():
    outputs = model(torch.tensor([input_ids]),  # The tokens representing our input text.
                    token_type_ids=torch.tensor([segment_ids]),  # The segment IDs to differentiate question from answer_text
                    return_dict=True)

# Per-token logits for "the answer starts here" / "the answer ends here".
start_scores = outputs.start_logits
end_scores = outputs.end_logits

In [9]:
# The answer span runs from the highest-scoring `start` position to the
# highest-scoring `end` position.
answer_start, answer_end = (torch.argmax(scores)
                            for scores in (start_scores, end_scores))

# Take the tokens inside the span (end inclusive) and join them with spaces.
answer_tokens = tokens[answer_start:answer_end+1]
answer = ' '.join(answer_tokens)

print('Answer: "' + answer + '"')

Answer: "two"


In [10]:
# Seed the answer with the first token of the span.
answer = tokens[answer_start]

# Append the rest of the span, gluing word-piece continuations ('##...')
# onto the previous token and separating whole tokens with a space.
for piece in tokens[answer_start + 1:answer_end + 1]:
    if piece.startswith('##'):
        answer += piece[2:]
    else:
        answer += ' ' + piece

print('Answer: "' + answer + '"')

Answer: "two"


In [11]:
# Detach both score tensors from PyTorch and flatten them into 1D numpy arrays.
s_scores, e_scores = (scores.detach().numpy().flatten()
                      for scores in (start_scores, end_scores))

# The tokens serve as x-axis labels, which must be unique, so each label is
# the token followed by its position in the sequence.
token_labels = ['{:} - {:>2}'.format(token, position)
                for position, token in enumerate(tokens)]

**I'll merge the Question-Answer pipeline above into a single function so I can try out actual examples from the Uttarakhand.csv dataset.**

**The pre-trained model used here can handle at most 512 tokens per input, but tokenizing the given dataset produces around 100k tokens. The only workaround I could find was to iterate through the dataset in small chunks using a for loop.**

In [13]:
def answer_question(question, answer_text, max_length=512):
    '''
    Takes a `question` string and an `answer_text` string (which contains the
    answer), and identifies the words within the `answer_text` that are the
    answer. Prints them out and also returns them.

    Parameters:
        question:    the question to ask.
        answer_text: the passage to search for the answer.
        max_length:  maximum number of tokens (question + passage + special
                     tokens) passed to the model; the passage is truncated
                     to fit. Defaults to 512.

    Returns:
        The answer string, or '' when no usable answer span was found
        (empty passage, span pointing outside the passage, or an end
        position that comes before the start position).
    '''
    # An empty passage cannot contain an answer; skip the model call.
    if not answer_text or not answer_text.strip():
        print('Answer: ""')
        return ''

    # ======== Tokenize ========
    # Apply the tokenizer to the input text, treating them as a text-pair.
    # Only the passage is truncated, so the question always stays intact and
    # the sequence never exceeds `max_length` tokens.
    input_ids = tokenizer.encode(question, answer_text,
                                 truncation='only_second',
                                 max_length=max_length)

    # ======== Set Segment IDs ========
    # Search the input_ids for the first instance of the `[SEP]` token.
    sep_index = input_ids.index(tokenizer.sep_token_id)

    # The number of segment A tokens includes the [SEP] token itself.
    num_seg_a = sep_index + 1

    # The remainder are segment B.
    num_seg_b = len(input_ids) - num_seg_a

    # Construct the list of 0s and 1s.
    segment_ids = [0]*num_seg_a + [1]*num_seg_b

    # There should be a segment_id for every input token.
    assert len(segment_ids) == len(input_ids)

    # ======== Evaluate ========
    # Run our example through the model. Inference only, so autograd is off.
    with torch.no_grad():
        outputs = model(torch.tensor([input_ids]), # The tokens representing our input text.
                        token_type_ids=torch.tensor([segment_ids]), # The segment IDs to differentiate question from answer_text
                        return_dict=True)

    start_scores = outputs.start_logits
    end_scores = outputs.end_logits

    # ======== Reconstruct Answer ========
    # Find the tokens with the highest `start` and `end` scores, as plain ints.
    answer_start = int(torch.argmax(start_scores))
    # The last position is the closing [SEP]; never let the span include it.
    answer_end = min(int(torch.argmax(end_scores)), len(input_ids) - 2)

    # A start position at or before the first [SEP] points at [CLS] or the
    # question rather than the passage, and an end before the start is not a
    # span at all. Both are reported as "no answer" instead of stripping
    # '[CLS]' out of the text afterwards.
    if answer_start <= sep_index or answer_end < answer_start:
        answer = ''
    else:
        # Get the string versions of the input tokens.
        tokens = tokenizer.convert_ids_to_tokens(input_ids)

        # Start with the first token.
        answer = tokens[answer_start]

        # Append the remaining tokens of the span.
        for token in tokens[answer_start + 1:answer_end + 1]:
            # If it's a subword token, then recombine it with the previous token.
            if token.startswith('##'):
                answer += token[2:]
            # Otherwise, add a space then the token.
            else:
                answer += ' ' + token

    print('Answer: "' + answer + '"')
    return answer

**Removing emojis in the next part.**

In [15]:
# Location of the tweet CSV that is read further down.
# NOTE(review): this looks like a Colab Google Drive path; presumably Drive
# has to be mounted before the file can be opened. Confirm.
path = '/content/drive/MyDrive/Uttarakhand.csv'
import pandas as pd
import re

#function to remove emojis and other possible icons
def deEmojify(text):
    """Return `text` with every run of characters from the listed emoji
    code-point ranges removed."""
    emoji_ranges = (
        "\U0001F600-\U0001F64F"  # emoticons
        "\U0001F300-\U0001F5FF"  # symbols & pictographs
        "\U0001F680-\U0001F6FF"  # transport & map symbols
        "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    )
    # One character class covering all ranges; each matching run is dropped.
    return re.sub("[" + emoji_ranges + "]+", '', text, flags=re.UNICODE)


**Read every tweet and concatenate them into a single string, which will serve as the input text alongside the questions.**

In [22]:
import csv

# Collect the cleaned tweets in a list and join once at the end instead of
# growing a string inside the loop.
pieces = []

# newline='' is how the csv module expects files to be opened (it keeps
# newlines embedded in quoted fields intact); the explicit encoding keeps the
# result independent of the platform's default codec.
with open(path, 'r', newline='', encoding='utf-8') as file:
    reader = csv.reader(file)
    for row in reader:
        # Blank lines are returned as empty lists; there is no tweet to read.
        if not row:
            continue
        # Only the first column of each row is used.
        deEmojifiedText = deEmojify(str(row[0]))
        pieces.append(deEmojifiedText + '. ')

# NOTE(review): the first row appears to be the CSV header and ends up at the
# start of `data`; it is kept so existing character offsets into `data` hold.
data = ''.join(pieces)

print(row)        # last row read from the file
print(len(data))  # total number of characters in the corpus
print(data[:100])
#print(len(data.split())) #no. of words in dataset(probable number, needs more cleaning for an accurate number) 

['Sorry guys, I will not be able to tweet today, saddened by the bursting of the glacier in #Uttarakhand.\nभगवान बद्री… https://t.co/osVaStOfuG']
838297
tweet. Horrible news out of #Uttarakhand. Prayer for #Uttarakhand . Ohhh God Please Save #Uttarakhan


In [21]:
# Wrap text to 80 characters, just to be able to read part of the data string.
import textwrap

wrapper = textwrap.TextWrapper(width=80)

# Preview a fixed character window of the corpus.
preview = data[1701:3300]
print(wrapper.fill(preview))

loods in Uttarakhand due to Glacial burst is so scary. I hope all are safe. I
pray for speedy return to normalcy a… https://t.co/1t01RaAYFL. Prayers for
#Uttarakhand  My State #Uttarakhand. Respected indians  Let pray for the  people
and provide all the support we can let's help the people around. #Uttarakhand.
Let's pray for Uttarakhand.  Eventually it will reach the people in #Chamoli
#Uttarakhand. Hope ppl in safe if you're stuck or anywhere near the affected
area of flash flood please contact Disaster Operatio… https://t.co/cywjzpswTh.
You know it...when you see these response so soon. @HMOIndia what is the
reason???? #Uttarakhand https://t.co/MvVT16zOW5. 4 Army columns, Two Medical
teams,  one Engineering Task Force deployed at Ringi village. Army helicopters
on aeria… https://t.co/Y1tGCRIOeZ. The more hydro-power plants u build, the more
entropy (floods) you'll receive..☺ #Uttarakhand #ClimateAction…
https://t.co/AbXRVMAlcI. My prayers are with the people of Uttarakhand May
Mahad

**So the workaround is to loop over the dataset and, whenever I find the correct answer, terminate the loop manually to save time, because there would otherwise be around 580 iterations per question. Some questions have multiple answers, so I'll display them before terminating the loop.**

In [121]:
question = "what happened in uttarakhand?"

# Walk the corpus in fixed-size character windows. The bounds come from
# len(data), so no empty trailing slices are sent to the model and a longer
# corpus is still covered in full.
chunk_size = 1500
for start in range(0, len(data), chunk_size):
    answer_question(question, data[start:start + chunk_size])

Answer: ""
Answer: "floods"


KeyboardInterrupt: ignored

In [117]:
question = "how much water level rose in rishikesh?"

# Walk the corpus in fixed-size character windows. The bounds come from
# len(data), so no empty trailing slices are sent to the model and a longer
# corpus is still covered in full.
chunk_size = 1500
for start in range(0, len(data), chunk_size):
    answer_question(question, data[start:start + chunk_size])

Answer: ""
Answer: ""
Answer: ""
Answer: "340 . 50 m"
Answer: ""
Answer: ""
Answer: ""


KeyboardInterrupt: ignored

In [118]:
question = "what is the emergency helpline number?"

# Walk the corpus in fixed-size character windows. The bounds come from
# len(data), so no empty trailing slices are sent to the model and a longer
# corpus is still covered in full.
chunk_size = 1500
for start in range(0, len(data), chunk_size):
    answer_question(question, data[start:start + chunk_size])

Answer: "1070 9557444486"
Answer: ""
Answer: "1070 9557444486"


KeyboardInterrupt: ignored

In [122]:
question = "How many people are affected?"

# Walk the corpus in fixed-size character windows (1600 characters for this
# question). The bounds come from len(data), so no empty trailing slices are
# sent to the model and a longer corpus is still covered in full.
chunk_size = 1600
for start in range(0, len(data), chunk_size):
    answer_question(question, data[start:start + chunk_size])

Answer: ""
Answer: ""
Answer: ""
Answer: ""
Answer: ""
Answer: "100 - 150"


KeyboardInterrupt: ignored

In [19]:
question = "How many army units were deployed?"

# Walk the corpus in fixed-size character windows. The bounds come from
# len(data), so no empty trailing slices are sent to the model and a longer
# corpus is still covered in full.
chunk_size = 1500
for start in range(0, len(data), chunk_size):
    answer_question(question, data[start:start + chunk_size])

Answer: ""
Answer: "4"


KeyboardInterrupt: ignored