In [1]:
!pip install sentence-transformers

[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
[33mDEPRECATION: Configuring installation scheme with distutils config files is deprecated and will no longer work in the near future. If you are using a Homebrew or Linuxbrew Python, please see discussion at https://github.com/Homebrew/homebrew-core/issues/76621[0m
You should consider upgrading via the '/usr/local/opt/python@3.8/bin/python3.8 -m pip install --upgrade pip' command.[0m


In [2]:
# import dependencies
import json
import faiss
import numpy as np
from pprint import pprint
from sentence_transformers import SentenceTransformer, util



### Load data and preprocess

In [3]:
# Load a sentence-transformers model by name. `embedder` is reused by later
# cells to encode both the corpus texts and the search queries.
embedder = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')

In [4]:
# Sanity check: encode a single query and inspect the shape of the result.
# The second dimension is the embedding size the FAISS index is created with.
x = embedder.encode(['spanish flu'])
x.shape

(1, 768)

In [5]:
# Load documents from JSON file.
# The encoding is pinned to UTF-8 so that non-ASCII characters in the texts
# decode identically on every platform instead of depending on the locale's
# default encoding.
with open('data/data.json', 'r', encoding='utf-8') as file:
    documents = json.load(file)

In [6]:
# Collect the 'text' field of every document; these strings are what gets
# embedded and indexed in the following cells.
corpus = [entry['text'] for entry in documents]

In [7]:
# Encode every text of the corpus in a single call. convert_to_tensor=True
# asks for a tensor result; later cells call .cpu() and .numpy() on it.
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [8]:
# Move the tensor to the CPU so it can be converted with .numpy() and handed
# to the FAISS index.
corpus_embeddings = corpus_embeddings.cpu()

In [9]:
# Inspect the embedding matrix: one row per corpus text, one column per
# embedding dimension.
corpus_embeddings.numpy().shape

(26, 768)

### Create a FAISS index

In [10]:
# Create a flat inner-product FAISS index, wrapped in an IndexIDMap so that
# every vector is stored under an explicit integer id.
# NOTE(review): the embeddings are added as-is (not normalised), so the scores
# returned by the index are raw dot products, not cosine similarities.
corpus_vectors = corpus_embeddings.numpy()
inner_index = faiss.IndexFlatIP(corpus_vectors.shape[1])  # dimension taken from the data, not hard-coded
index = faiss.IndexIDMap(inner_index)

# The id of each vector is its position in `corpus`, which is what search()
# relies on to map hits back to documents. The dtype is pinned to int64
# because the default integer dtype of np.array(range(...)) is platform
# dependent, while FAISS ids are 64-bit.
index.add_with_ids(corpus_vectors, np.arange(len(corpus), dtype=np.int64))

# save the index for future use
faiss.write_index(index, 'data/pandemics')

### Search the documents

In [11]:
# Build a search function that finds the most relevant search results
def search(query, documents, k=5, encoder=None, faiss_index=None):
    """Return up to ``k`` ``(document, score)`` pairs most similar to ``query``.

    The difference from milestone1 is that the query is encoded with an SBERT
    sentence embedder instead of BERT.

    Args:
        query: Text to search for.
        documents: Sequence indexed by the ids stored in the index, i.e.
            ``documents[i]`` must be the item that was added under id ``i``.
        k: Maximum number of hits to return.
        encoder: Object exposing ``encode(list_of_str)``. Defaults to the
            notebook-level ``embedder``.
        faiss_index: Object exposing ``search(vectors, k)`` that returns a
            ``(scores, ids)`` pair. Defaults to the notebook-level ``index``.

    Returns:
        A list of ``(document, score)`` tuples in the order returned by the
        index. The list is shorter than ``k`` when the index cannot supply
        ``k`` hits.
    """
    # Fall back to the notebook globals only when no explicit objects are
    # given, so existing calls such as search(query, corpus, k=2) still work.
    if encoder is None:
        encoder = embedder
    if faiss_index is None:
        faiss_index = index

    encoded_query = encoder.encode([query])
    scores, ids = faiss_index.search(encoded_query, k)

    # FAISS fills the id slots it cannot satisfy with -1. Indexing
    # documents[-1] would silently return the last document as a bogus hit,
    # so those padded entries are dropped.
    return [
        (documents[_id], score)
        for _id, score in zip(ids[0], scores[0])
        if _id >= 0
    ]

In [12]:
# Encode a sample query by hand to inspect the array that is passed to the
# index when searching.
encoded_query = embedder.encode(["spanish flu casualties"])
encoded_query.shape

(1, 768)

In [13]:
# Query the index directly for the 2 best matches. The result is a
# (scores, ids) pair of arrays with one row per query vector.
top_k = index.search(encoded_query, 2)
top_k

(array([[88.825806, 74.40703 ]], dtype=float32), array([[19, 21]]))

In [14]:
# Top-2 hits for a sample query, pretty-printed as (text, score) pairs.
pprint(search(query="spanish flu casualties", documents=corpus, k=2))

[('The Spanish flu, also known as the 1918 flu pandemic, was an unusually '
  'deadly influenza pandemic caused by the H1N1 influenza A virus. Lasting '
  'from February 1918 to April 1920, it infected 500 million people – about a '
  "third of the world's population at the time – in four successive waves. The "
  'death toll is typically estimated to have been somewhere between 17 million '
  'and 50 million, and possibly as high as 100 million, making it one of the '
  'deadliest pandemics in human history.The first observations of illness and '
  'mortality were documented in the United States (in Fort Riley, Haskell '
  'County, Kansas as well as in New York City), France (Brest), Germany and '
  'the United Kingdom. To maintain morale, World War I censors minimized these '
  "early reports. Newspapers were free to report the epidemic's effects in "
  'neutral Spain, such as the grave illness of King Alfonso XIII, and these '
  'stories created a false impression of Spain as especi

In [15]:
# Top-2 hits for a second sample query, pretty-printed as (text, score) pairs.
pprint(search(query="influenza subtypes", documents=corpus, k=2))

[('Swine influenza is an infection caused by any one of several types of swine '
  'influenza viruses. Swine influenza virus (SIV) or swine-origin influenza '
  'virus (S-OIV) is any strain of the influenza family of viruses that is '
  'endemic in pigs. As of 2009, the known SIV strains include influenza C and '
  'the subtypes of  influenza A known as H1N1, H1N2, H2N1, H3N1, H3N2, and '
  'H2N3.\n'
  'Swine influenza virus is common throughout pig populations worldwide. '
  'Transmission of the virus from pigs to humans is not common and does not '
  'always lead to human flu, often resulting only in the production of '
  'antibodies in the blood. If transmission does cause human flu, it is called '
  'zoonotic swine flu. People with regular exposure to pigs are at increased '
  'risk of swine flu infection.\n'
  'Around the mid-20th century, identification of influenza subtypes became '
  'possible, allowing accurate diagnosis of transmission to humans. Since '
  'then, only 50 such