In [None]:
!pip install torch

In [None]:
!pip install transformers

In [None]:
!pip install wikipedia

In [None]:
import wikipedia as wiki
import pprint as pp

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

In [None]:
from collections import OrderedDict

We use Wikipedia's search API to retrieve a list of candidate articles for each question, and then feed the text of the top-ranked article into our document reader (in this case, BERT fine-tuned on SQuAD 2.0).

Let's make our code easier to read and more self-contained by packaging the document reader into a class.

In [None]:
class DocumentReader:
    """Extractive question-answering reader built on a pretrained QA model.

    Typical use::

        reader = DocumentReader('deepset/bert-base-cased-squad2')
        reader.tokenize(question, text)
        answer = reader.get_answer()

    Inputs longer than the model's maximum sequence length are split into
    several model-sized chunks, each of which repeats the question tokens.
    """

    def __init__(self, pretrained_model_name='bert-base-cased-squad2'):
        """Load the tokenizer and question-answering model.

        Args:
            pretrained_model_name: Model name or path passed to
                ``from_pretrained`` for both the tokenizer and the model.
        """
        self.path = pretrained_model_name  # name (or path) of the model
        self.tokenizer = AutoTokenizer.from_pretrained(self.path)
        self.model = AutoModelForQuestionAnswering.from_pretrained(self.path)
        # Longest sequence (in tokens) the model's position embeddings support.
        self.max_len = self.model.config.max_position_embeddings
        self.chunked = False

    def tokenize(self, question, text):
        """Encode a (question, text) pair and chunk it if it is too long.

        Sets ``self.inputs`` (the encoding, or an ordered mapping of chunk
        index -> encoding when chunked), ``self.input_ids`` (flat list of the
        un-chunked token ids) and ``self.chunked``.
        """
        self.inputs = self.tokenizer.encode_plus(
            question, text, add_special_tokens=True, return_tensors="pt"
        )
        self.input_ids = self.inputs["input_ids"].tolist()[0]

        # Recompute the flag on every call: a reader that previously handled
        # a long document must not treat a following short one as chunked.
        self.chunked = len(self.input_ids) > self.max_len
        if self.chunked:
            self.inputs = self.chunkify()

    def chunkify(self):
        """Split ``self.inputs`` into chunks that fit within ``self.max_len``.

        Every chunk has the layout
        ``[CLS] question [SEP] <slice of context> [SEP]``.
        The final chunk re-uses the [SEP] that already terminates the original
        encoding; all other chunks get one appended.

        Returns:
            OrderedDict mapping chunk index -> dict of model inputs, each a
            tensor with a leading batch dimension of 1.

        Raises:
            ValueError: if the question alone leaves no room for context.
        """
        # token_type_ids == 0 marks the question segment ([CLS] ... [SEP]).
        question_mask = self.inputs['token_type_ids'].lt(1)
        question_len = int(question_mask.sum())
        # The "-1" reserves one position for the closing [SEP] token.
        chunk_size = self.max_len - question_len - 1
        if chunk_size < 1:
            raise ValueError(
                "Question is too long to leave room for any context tokens "
                f"(question tokens: {question_len}, max length: {self.max_len})."
            )

        # Prefer the tokenizer's own [SEP] id; 102 is the id that was
        # previously hard-coded here and is kept as a fallback.
        sep_id = getattr(self.tokenizer, 'sep_token_id', None)
        if sep_id is None:
            sep_id = 102

        chunked_input = OrderedDict()
        for key, tensor in self.inputs.items():
            question_part = torch.masked_select(tensor, question_mask)
            context_part = torch.masked_select(tensor, ~question_mask)
            chunks = torch.split(context_part, chunk_size)
            last_index = len(chunks) - 1

            for i, chunk in enumerate(chunks):
                piece = torch.cat((question_part, chunk))
                if i != last_index:
                    # input_ids get a [SEP]; token_type_ids and attention_mask
                    # get a 1 (context segment / attended position).
                    closing_value = sep_id if key == 'input_ids' else 1
                    closing = torch.tensor([closing_value], dtype=piece.dtype)
                    piece = torch.cat((piece, closing))

                # Restore the batch dimension expected by the model.
                chunked_input.setdefault(i, {})[key] = torch.unsqueeze(piece, dim=0)
        return chunked_input

    def convert_ids_to_string(self, input_ids):
        """Convert a sequence of token ids back into a plain string."""
        tokens = self.tokenizer.convert_ids_to_tokens(input_ids)
        return self.tokenizer.convert_tokens_to_string(tokens)

    def _predict_span(self, model_inputs):
        """Run the model on one encoding and decode the highest-scoring span."""
        with torch.no_grad():  # inference only; no gradients needed
            outputs = self.model(**model_inputs)
        # Index rather than unpack: this works whether the model returns a
        # plain tuple or a ModelOutput object.
        answer_start_scores, answer_end_scores = outputs[0], outputs[1]

        answer_start = torch.argmax(answer_start_scores)
        answer_end = torch.argmax(answer_end_scores) + 1  # slice end is exclusive
        return self.convert_ids_to_string(
            model_inputs['input_ids'][0][answer_start:answer_end]
        )

    def get_answer(self):
        """Return the answer for the most recently tokenized input.

        For chunked inputs, the best span of every chunk is collected and each
        one is followed by ``" / "``. For inputs that fit in a single pass the
        bare span is returned. A span that is empty or consists only of
        ``[CLS]`` is treated as "no answer" and skipped, so the result is an
        empty string when nothing was found.
        """
        if self.chunked:
            answer = ''
            for _, chunk in self.inputs.items():
                ans = self._predict_span(chunk)
                if ans and ans != '[CLS]':
                    answer += ans + " / "
            return answer

        ans = self._predict_span(self.inputs)
        return ans if ans != '[CLS]' else ''
        

    

In [None]:
# Build the document reader from the 'deepset/bert-base-cased-squad2' checkpoint.
# Constructing it loads both the tokenizer and the question-answering model.
reader = DocumentReader('deepset/bert-base-cased-squad2') # model name passed to from_pretrained

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=508.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=213450.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=152.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433294681.0, style=ProgressStyle(descri…




In [None]:
# Sample questions to run through the retrieve-then-read pipeline.
questions = [
    'When did "ustad hotel" release?',
    'Who is the wife of fahad fazil?',
    'What is the debut movie of Nivin pauly?',
    'Who is the music director of malayalam movie "yoddha"?',
    'Who is the Director of malayalam movie "Manichitrathazhu"?',
    'What is the genre of the malayalam movie "Memories"?',
    'Name some crime drama movie of mohanlal?',
    'What is the debut movie of Dulquer salman?'
]

# For each question: search Wikipedia, read the top-ranked article, and let
# the reader extract an answer from its text.
for question in questions:
    print(f"Question: {question}")
    results = wiki.search(question)

    # The search may return nothing; skip instead of failing on results[0].
    if not results:
        print("Answer: (no Wikipedia article found)")
        print()
        continue

    top_title = results[0]
    try:
        # The title comes straight from the search results, so disable
        # auto-suggest to avoid being redirected to a different article.
        page = wiki.page(top_title, auto_suggest=False)
    except (wiki.exceptions.DisambiguationError, wiki.exceptions.PageError) as err:
        # One unloadable page should not abort the remaining questions.
        print(f"Could not load wiki page {top_title!r}: {type(err).__name__}")
        print()
        continue
    print(f"Top wiki result: {page}")

    text = page.content

    reader.tokenize(question, text)

    print(f"Answer: {reader.get_answer()}")
    print()

Question: When did "ustad hotel" release?
Top wiki result: <WikipediaPage 'Dulquer Salmaan'>


Token indices sequence length is longer than the specified maximum sequence length for this model (9 > 512). Running this sequence through the model will result in indexing errors


Answer: 2012 / 

Question: Who is the wife of fahad fazil?
Top wiki result: <WikipediaPage 'Fahadh Faasil'>


Token indices sequence length is longer than the specified maximum sequence length for this model (12 > 512). Running this sequence through the model will result in indexing errors


Answer: Nazriya Nazim / Nazriya Nazim / Fahadh Faasil / 

Question: What is the debut movie of Nivin pauly?
Top wiki result: <WikipediaPage 'Nivin Pauly'>


Token indices sequence length is longer than the specified maximum sequence length for this model (12 > 512). Running this sequence through the model will result in indexing errors


Answer: Malarvaadi Arts Club / 

Question: Who is the music director of malayalam movie "yoddha"?
Top wiki result: <WikipediaPage 'Yoddha (1992 film)'>


Token indices sequence length is longer than the specified maximum sequence length for this model (17 > 512). Running this sequence through the model will result in indexing errors


Answer: A . R . Rahman / 

Question: Who is the Director of malayalam movie "Manichitrathazhu"?
Top wiki result: <WikipediaPage 'Manichitrathazhu'>


Token indices sequence length is longer than the specified maximum sequence length for this model (19 > 512). Running this sequence through the model will result in indexing errors


Answer: Fazil / 

Question: What is the genre of the malayalam movie "Memories"?
Top wiki result: <WikipediaPage 'Memories (2013 film)'>


Token indices sequence length is longer than the specified maximum sequence length for this model (15 > 512). Running this sequence through the model will result in indexing errors


Answer: crime thriller / 

Question: Name some crime drama movie of mohanlal?
Top wiki result: <WikipediaPage 'Mohanlal'>


Token indices sequence length is longer than the specified maximum sequence length for this model (11 > 512). Running this sequence through the model will result in indexing errors


Answer: Rajavinte Makan / Thiranottam / Poochakkoru Mookkuthi / Kireedam /  / Narasimham / Janakan / Pranayam , released in August , an off - beat romantic drama directed by Blessy . It was a love story between three aged characters played by Mohanlal , Anupam Kher and Jayapradha . The film was highly acclaimed by critics and Mohanlal ' s performance as Mathews was well appreciated . His next film Snehaveedu , was directed by Sathyan Anthikkadu , a family film in the background of a village in Palakkad . For the first time , yesteryear actress Sheela shared screen space with Mohanlal . She played his mother . Snehaveedu is credited as his 300th film . The film was a hit . His last movie that year was Oru Marubhoomikkadha , directed by Priyadarshan , which was an action - comedy thriller entirely set in the Middle East . It marked the return of the Priyadarshan - Mohanlal - Mukesh combo of the late 1980s and 90s . The film performed well at the Christmas - New Year ' s box office . In 2

Token indices sequence length is longer than the specified maximum sequence length for this model (12 > 512). Running this sequence through the model will result in indexing errors


Answer: Second Show / Second Show / 

