In [1]:
# import dependencies
import json
import faiss
import numpy as np
from pprint import pprint
from sentence_transformers import SentenceTransformer, util
from transformers import pipeline



### Load data and preprocess

In [2]:
# Load the FAISS index created in Milestone 2.
# NOTE(review): presumably built from the same documents, in the same order,
# as data/data.json, since search ids are used as list positions — confirm.
index = faiss.read_index('data/pandemics')

In [5]:
# Sentence-embedding model used to encode queries before they are searched
# in the FAISS index.
# NOTE(review): should be the same model the index was built with — confirm.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')
# Extractive question-answering pipeline from Hugging Face Transformers.
# The checkpoint is named explicitly (it is the one the pipeline reported
# falling back to when no model was given) so that a change of the library
# default cannot silently change the answers this notebook produces.
nlp = pipeline('question-answering', model='distilbert-base-cased-distilled-squad')

No model was supplied, defaulted to distilbert-base-cased-distilled-squad (https://huggingface.co/distilbert-base-cased-distilled-squad)


In [6]:
# Read the document collection. The encoding is given explicitly because
# open() otherwise falls back to the platform's locale encoding, which can
# mis-decode non-ASCII text in the JSON file (e.g. on Windows).
with open('data/data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)
# Keep only the text of each document; this list is what the cells below
# pass to find_answer, which indexes it by position.
corpus = [d['text'] for d in documents]

### Search the documents and retrieve the answers

In [7]:
# Modify the search function from Milestone 2
# to add question-answering functionality
def find_answer(query: str, documents, k=5):
    """Retrieve the nearest documents for ``query`` and extract an answer from each.

    The query is embedded with the module-level ``embedder``, looked up in the
    module-level FAISS ``index``, and every retrieved document is passed as
    context to the module-level question-answering pipeline ``nlp``.

    Args:
        query: Question (or keyword string) used both for retrieval and as the
            question given to the QA pipeline.
        documents: Sequence of document texts, indexed by the ids that the
            FAISS index returns.
        k: Number of nearest neighbours to request from the index.

    Returns:
        A list with the QA pipeline's result for each retrieved document,
        sorted by its ``"score"`` entry in descending order. The list holds
        fewer than ``k`` items when the index returns fewer than ``k`` hits.
    """
    encoded_query = embedder.encode([query])
    # search() returns a (distances, labels) pair with one row per query;
    # only the label row of the single query is needed here.
    _, labels = index.search(encoded_query, k)
    # FAISS pads the label row with -1 when it finds fewer than k neighbours.
    # Without this filter documents[-1] would silently hand the *last*
    # document to the QA model as if it were a search hit.
    hit_ids = [int(_id) for _id in labels[0] if _id >= 0]
    # Run the question-answering pipeline on each retrieved document.
    answers = [nlp(context=documents[doc_id], question=query)
               for doc_id in hit_ids]
    return sorted(answers, key=lambda answer: answer["score"], reverse=True)

In [8]:
# Check the query string that was used in Milestone 1 and Milestone 2, for comparison.
# k=2 asks the index for only the two nearest documents; the answers are
# pretty-printed in the order find_answer returns them (highest score first).
pprint(find_answer("spanish flu casualties", corpus, k=2))

  tensor = as_tensor(value)
  p_mask = np.asarray(


[{'answer': 'epidemics and disasters',
  'end': 457,
  'score': 0.7145211100578308,
  'start': 434},
 {'answer': 'more than 1.1 million',
  'end': 487,
  'score': 0.044743359088897705,
  'start': 466}]


In [34]:
# Try a query taken from the questions.json file.
# No k is passed, so find_answer uses its default of 5 nearest documents.
query = "How to prevent the spread of viral infections?"
results = find_answer(query, corpus)

# Print one answer dict per line, in the order find_answer returned them
# (sorted by score, highest first).
print('Top search results:')
for result in results:
    print(result)

Top search results:
{'score': 0.5752660632133484, 'start': 1211, 'end': 1256, 'answer': 'improved sanitation and access to clean water'}
{'score': 0.41623830795288086, 'start': 1322, 'end': 1383, 'answer': 'by giving both the mother and child antiretroviral medication'}
{'score': 0.2783833146095276, 'start': 111, 'end': 163, 'answer': 'measures to reduce causes of new infectious diseases'}
{'score': 0.15053525567054749, 'start': 900, 'end': 946, 'answer': 'Tracking viral load is used to monitor therapy'}
{'score': 0.11565103381872177, 'start': 218, 'end': 244, 'answer': 'administration of vaccines'}
