<a href="https://colab.research.google.com/github/Meenakshi72/PythonProjects/blob/main/Information_retrieval.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q tensorflow_text
!pip install -q simpleneighbors[annoy]
!pip install -q nltk
!pip install -q tqdm

In [None]:
import html
import json
import os
import pprint
import random
import urllib
import urllib.request

import nltk
import simpleneighbors
import tensorflow.compat.v2 as tf
import tensorflow_hub as hub
from IPython.display import HTML, display
from tensorflow_text import SentencepieceTokenizer
from tqdm.notebook import tqdm
# sent_tokenize needs the Punkt sentence models: recent NLTK releases look for
# the 'punkt_tab' resource, older ones for 'punkt', so fetch both.
for punkt_resource in ('punkt', 'punkt_tab'):
  nltk.download(punkt_resource)
print('TensorFlow version: ', tf.__version__)
print('TensorFlow Hub version: ', hub.__version__)

In [None]:
# Download and parse the SQuAD v1.1 dev set.
squad_url = 'https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json'
# The context manager closes the HTTP response as soon as parsing finishes
# instead of leaving the connection open until garbage collection.
with urllib.request.urlopen(squad_url) as response:
  squad_json = json.load(response)

In [None]:
# Summarise the top-level layout rather than echoing the whole dataset, which
# floods the notebook output and bloats the saved file.
{key: type(value).__name__ for key, value in squad_json.items()}

In [None]:
# Try the NLTK sentence splitter on a sample two-sentence paragraph.
nltk.tokenize.sent_tokenize(
    'Computational complexity theory is a branch of the theory of computation in theoretical computer science that focuses on classifying computational problems according to their inherent difficulty, and relating those classes to each other. A computational problem is understood to be a task that is in principle amenable to being solved by a computer, which is equivalent to stating that the problem may be solved by mechanical application of mathematical steps, such as an algorithm.'
)

In [None]:
def extract_sentences(squad):
  """Collect the unique (sentence, context) pairs from a SQuAD-format dict.

  Args:
    squad: Parsed SQuAD JSON. ``squad['data']`` is iterated and each entry's
      ``'paragraphs'`` list supplies dicts with a ``'context'`` string.

  Returns:
    A list of ``(sentence, context)`` tuples, one for every sentence that
    ``nltk.tokenize.sent_tokenize`` finds in each context. Duplicates are
    removed and pairs appear in first-occurrence order.
  """
  unique_pairs = {}
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      context = paragraph['context']
      for sentence in nltk.tokenize.sent_tokenize(context):
        # Dict keys de-duplicate like a set but keep insertion order, so the
        # result (and anything sampled from it) is the same on every run
        # instead of depending on string hash randomisation.
        unique_pairs[(sentence, context)] = None
  return list(unique_pairs)

In [None]:
# Unique (sentence, context) pairs extracted from the downloaded dataset.
sentences = extract_sentences(squad_json)

In [None]:
# Number of unique (sentence, context) pairs.
len(sentences)

In [None]:
# Peek at the first five (sentence, context) pairs.
sentences[:5]

In [None]:
def extract_questions_answers(squad):
  """Collect the unique (question, first answer text) pairs from SQuAD data.

  Args:
    squad: Parsed SQuAD JSON. Every paragraph under ``squad['data']`` is
      expected to carry a ``'qas'`` list whose items have a ``'question'``
      string and an ``'answers'`` list of dicts with a ``'text'`` field.

  Returns:
    A list of ``(question, answer_text)`` tuples built from the first answer
    of each question. Questions with an empty ``'answers'`` list are skipped,
    duplicates are removed, and pairs appear in first-occurrence order.
  """
  unique_pairs = {}
  for data in squad['data']:
    for paragraph in data['paragraphs']:
      for qas in paragraph['qas']:
        if qas['answers']:
          # Dict keys de-duplicate like a set but keep insertion order, so the
          # result is the same on every run instead of depending on string
          # hash randomisation.
          unique_pairs[(qas['question'], qas['answers'][0]['text'])] = None
  return list(unique_pairs)

In [None]:
# Unique (question, answer) pairs extracted from the downloaded dataset.
questions_answers = extract_questions_answers(squad_json)

In [None]:
# Number of unique (question, answer) pairs.
len(questions_answers)

In [None]:
# Peek at the first ten (question, answer) pairs.
questions_answers[:10]

In [None]:
# Show one randomly picked (sentence, context) pair: each part is printed
# under its own heading.
print('Sentence and context\n')
sentence = random.choice(sentences)
for heading, part in zip(('Sentence: ', '\nContext:\n'), sentence):
  print(heading)
  pprint.pprint(part)
print()

In [None]:
# More models: https://tfhub.dev/s?dataset=squad
# Load the TF Hub module; its 'response_encoder' and 'question_encoder'
# signatures are what the rest of the notebook calls.
model_path = 'https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3'
model = hub.load(model_path)

In [None]:
# Sentence text of the first (sentence, context) pair.
sentences[0][0]

In [None]:
# Context paragraph of the first (sentence, context) pair.
sentences[0][1]

In [None]:
# Encode a single sentence together with its context through the
# 'response_encoder' signature to inspect what the model returns.
encodings = model.signatures['response_encoder'](input = tf.constant([sentences[0][0]]),
                                                 context = tf.constant([sentences[0][1]]))

In [None]:
# Length of the first output vector, i.e. the size of one embedding.
len(encodings['outputs'][0])

In [None]:
# Create the nearest-neighbour index, sized to the embedding length and using
# the 'angular' metric.
index = simpleneighbors.SimpleNeighbors(len(encodings['outputs'][0]), metric = 'angular')

In [None]:
batch_size = 100
# Cut the pairs into consecutive chunks of at most batch_size items. Slicing
# keeps the final, shorter chunk (a zip-over-one-iterator grouping would
# silently drop the trailing len(sentences) % batch_size pairs) and yields a
# list that can be iterated again if the indexing cell is re-run.
slices = [sentences[start:start + batch_size]
          for start in range(0, len(sentences), batch_size)]
num_batches = len(slices)
num_batches

In [None]:
# Encode every batch of (sentence, context) pairs with the 'response_encoder'
# signature, store each sentence in the index under its output vector, then
# build the index so it can be queried.
for pairs in tqdm(slices, total=num_batches):
  sentence_batch, context_batch = (list(column) for column in zip(*pairs))
  encodings = model.signatures['response_encoder'](
      input=tf.constant(sentence_batch),
      context=tf.constant(context_batch))
  for position, text in enumerate(sentence_batch):
    index.add_one(text, encodings['outputs'][position])
index.build()

In [None]:
# How many nearest neighbours show_results asks the index for.
number_of_results = 10
# Pick a random (question, answer) pair to use as the query.
question_answer = random.choice(questions_answers)
print(question_answer)

In [None]:
def show_results(question, answer):
  """Display the indexed sentences nearest to a question as an HTML list.

  Encodes ``question`` with the model's ``question_encoder`` signature, asks
  the module-level ``index`` for the ``number_of_results`` nearest entries
  and renders the question, the reference answer and the hits.

  Args:
    question: Question text to search for.
    answer: Reference answer shown alongside the retrieved sentences.
  """
  embedding = model.signatures['question_encoder'](tf.constant([question]))['outputs'][0]
  search_results = index.nearest(embedding, n = number_of_results)

  # Question, answer and hits are raw dataset text: escape them so characters
  # such as '<', '>' and '&' are shown literally instead of being interpreted
  # as markup by the notebook front end.
  formatted_result = '''
    <p>Random question selected from SQUAD</p>
    <p><b>%s</b></p>
    <p>Answer:</p>
    <p><b>%s</b></p>
  ''' % (html.escape(question), html.escape(answer))

  list_items = ''.join('<li>%s</li>' % html.escape(s) for s in search_results)
  formatted_result += '<ol>%s</ol>' % list_items

  display(HTML(formatted_result))

In [None]:
# Run the retrieval demo for the selected (question, answer) pair.
show_results(*question_answer)