In [1]:
# import dependencies
import json
import faiss
import numpy as np
from pprint import pprint
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline



### Load data and preprocess

In [2]:
# Load the FAISS index created in Milestone 2.
# The path is relative, so it is resolved against the notebook's working directory.
index = faiss.read_index('data/pandemics')

In [3]:
# Load the sentence-embedding model used to encode queries.
# NOTE(review): query vectors are only comparable with the vectors stored in the
# FAISS index if this is the same model that embedded the documents in
# Milestone 2 — confirm.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [4]:
# Load a question answering pipeline from Hugging Face Transformers.
# The checkpoint is named explicitly: it is the one the pipeline falls back to
# when no model is given (it then logs "No model was supplied, defaulted to ..."),
# so behaviour is unchanged while results stay reproducible if that default
# ever changes in a later library release.
nlp = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


In [5]:
# Load the document collection and keep only the text of each entry.
# The encoding is passed explicitly: JSON files are UTF-8, whereas open()'s
# default depends on the platform locale and can mis-decode non-ASCII text.
with open('data/data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)
# corpus[i] is the 'text' field of documents[i]
corpus = [doc['text'] for doc in documents]

In [6]:
# Embed a sample query. The query is passed as a one-element list; the result
# displayed below is a 2-D array, which is then handed to index.search.
encoded_query = embedder.encode(['spanish flu casualties'])
# Bare last expression so the notebook displays the embedding.
encoded_query

array([[ 6.04743540e-01, -1.01334035e+00, -3.44446510e-01,
        -8.46514583e-01,  4.49926525e-01, -6.51701510e-01,
         6.07120156e-01,  2.33016878e-01, -5.03782451e-01,
        -1.60832614e-01,  3.77833515e-01, -7.87616909e-01,
        -2.86686718e-02,  1.16363895e+00, -1.27604222e+00,
        -1.86595470e-01, -5.61707616e-02,  5.58383226e-01,
         8.94670337e-02, -1.08795263e-01,  4.62752640e-01,
        -3.10191154e-01, -3.78858328e-01,  3.76741350e-01,
        -4.12901819e-01, -4.42575872e-01,  9.47318196e-01,
        -5.38632274e-01,  7.70681083e-01, -5.96220493e-01,
         9.89409626e-01,  7.62357652e-01,  6.00445628e-01,
        -7.47968137e-01, -5.99298835e-01, -4.16271865e-01,
        -3.03628147e-01, -1.72790095e-01,  2.14923143e-01,
         2.86867708e-01, -3.87522340e-01, -1.16060674e+00,
        -7.79481512e-03, -7.26408977e-03,  8.51115406e-01,
        -5.63370049e-01, -5.94873428e-01,  7.79424250e-01,
         6.00455403e-01,  1.46040678e-01, -2.31472015e-0

In [7]:
# Look up the 2 nearest neighbours of the sample query in the FAISS index.
# As the output below shows, the result is a pair of 2-D arrays: float32
# scores/distances first, integer ids of the matching entries second.
top_k = index.search(encoded_query, 2)
# Bare last expression so the notebook displays the raw search result.
top_k

(array([[88.825806, 74.40703 ]], dtype=float32), array([[19, 21]]))

### Search the documents and retrieve the answers

In [8]:
# Modify the search function from Milestone 2
# to add question-answering functionality
def find_answer(query: str, documents, k=5):
    """Retrieve up to ``k`` documents for ``query`` and extract an answer from each.

    The query is embedded with the notebook-level ``embedder``, looked up in
    the notebook-level FAISS ``index``, and every retrieved document is passed
    as context to the notebook-level question-answering pipeline ``nlp``.

    Args:
        query: Search phrase or question. It is used both for retrieval and as
            the ``question`` handed to the QA pipeline.
        documents: Sequence of context strings, indexed by the ids the FAISS
            index returns (assumes index ids are positions in this sequence —
            TODO confirm against how the index was built in Milestone 2).
        k: Maximum number of documents to retrieve.

    Returns:
        The QA pipeline's result for each retrieved document (dicts carrying
        at least a ``score`` key), sorted by ``score`` in descending order.
        The list has fewer than ``k`` entries when the index returns fewer
        than ``k`` valid neighbours.
    """
    encoded_query = embedder.encode([query])
    _scores, neighbour_ids = index.search(encoded_query, k)
    # FAISS pads the id row with -1 when it finds fewer than k neighbours.
    # A negative id would silently index `documents` from the end and yield an
    # answer from an unrelated document, so such placeholders are dropped.
    hit_ids = [int(doc_id) for doc_id in neighbour_ids[0] if doc_id >= 0]
    # Run the QA pipeline once per retrieved document, using it as the context.
    answers = [nlp(context=documents[doc_id], question=query) for doc_id in hit_ids]
    return sorted(answers, key=lambda answer: answer["score"], reverse=True)

In [9]:
# Re-run the query string used in Milestone 1 and Milestone 2 so the answers
# can be compared with those earlier results.
pprint(
    find_answer(query="spanish flu casualties", documents=corpus, k=2)
)

  tensor = as_tensor(value)
  p_mask = np.asarray(


[{'answer': 'between 12,000 and 18,000',
  'end': 1548,
  'score': 0.19150158762931824,
  'start': 1523},
 {'answer': '17 million and 50 million',
  'end': 384,
  'score': 0.009130196645855904,
  'start': 359}]


In [10]:
# Try one of the queries from the questions.json file.
query = "How to prevent the spread of viral infections?"
results = find_answer(query=query, documents=corpus)  # k is left at its default

# Print one answer dict per line, in the order find_answer returned them.
print('Top search results:')
for result in results:
    print(result)

Top search results:
{'score': 0.5752673745155334, 'start': 1211, 'end': 1256, 'answer': 'improved sanitation and access to clean water'}
{'score': 0.4162377119064331, 'start': 1322, 'end': 1383, 'answer': 'by giving both the mother and child antiretroviral medication'}
{'score': 0.2783832848072052, 'start': 111, 'end': 163, 'answer': 'measures to reduce causes of new infectious diseases'}
{'score': 0.15053513646125793, 'start': 900, 'end': 946, 'answer': 'Tracking viral load is used to monitor therapy'}
{'score': 0.11565114557743073, 'start': 218, 'end': 244, 'answer': 'administration of vaccines'}
