Use the `AutoTokenizer` and `AutoModel` classes from the Transformers library to load a pre-trained model together with its matching tokenizer.
https://huggingface.co/docs/transformers/model_doc/auto

In [24]:
import faiss
import numpy as np
import pandas as pd
import torch
import transformers
from transformers import AutoModel, AutoTokenizer
from transformers import DistilBertTokenizer, DistilBertModel


In [7]:
# Download the corpus sentences and the evaluation questions. Each frame's
# default column label 0 is renamed to a descriptive name.
sentences = pd.read_json(
    "https://lp-prod-resources.s3.amazonaws.com/493/57248/2021-05-04-13-31-46/sentences.json"
).rename(columns={0: "sentence_text"})
questions = pd.read_json(
    "https://lp-prod-resources.s3.amazonaws.com/493/57248/2021-08-16-19-04-45/questions.json"
).rename(columns={0: "question_text"})

In [8]:
# Echo both frames as a tuple to eyeball the loaded text columns.
(sentences, questions)

(                                        sentence_text
 0   A pandemic is an epidemic of an infectious dis...
 1   The most fatal pandemic in recorded history wa...
 2   Current pandemics include COVID-19 (SARS-CoV-2...
 3   As of 2018, approximately 37.9 million people ...
 4   Cholera is an infection of the small intestine...
 5   Classic cholera symptom is large amounts of wa...
 6   The COVID-19 pandemic, also known as the coron...
 7   Common symptoms of COVID-19 include fever, cou...
 8   The Plague of Cyprian was a pandemic that affl...
 9   The Spanish flu, also known as the 1918 flu pa...
 10  The death toll of Spanish Flu is estimated to ...,
                                        question_text
 0      How many people have died during Black Death?
 1      Which diseases can be transmitted by animals?
 2  Connection between climate change and a likeli...
 3               What is an example of a latent virus
 4                          Viruses in nanotechnology
 5             

In [10]:
# Load the checkpoint through the Auto* classes, as the notebook intro asks:
# they pick the tokenizer and model architecture that match the checkpoint
# name, so switching checkpoints only requires changing MODEL_NAME.
MODEL_NAME = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Downloading: 100%|██████████| 232k/232k [00:00<00:00, 1.45MB/s]
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Downloading: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
Downloading: 100%|██████████| 483/483 [00:00<00:00, 242kB/s]
Downloading: 100%|██████████| 268M/268M [00:39<00:00, 6.85MB/s] 
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForP

In [16]:
# Plain Python list of the sentence strings; list positions double as document IDs later on.
documents = sentences.sentence_text.tolist()

In [17]:
# Turn the documents into tensors: one matrix of per-token embeddings
# (num_tokens x hidden_size) per document.
# - torch.no_grad() skips building the autograd graph, which inference does not
#   need; the outputs therefore do not require grad and need no .detach().
# - truncation=True keeps over-long documents within the model's maximum input
#   length instead of failing in the forward pass.
# - squeeze(0) drops only the batch dimension of size 1.
with torch.no_grad():
    vectors = [
        model(**tokenizer(document, return_tensors="pt", truncation=True))[0].squeeze(0)
        for document in documents
    ]


In [20]:
# Inspect the per-document shapes: the token count varies, the hidden width does not.
[token_matrix.shape for token_matrix in vectors]

[torch.Size([35, 768]),
 torch.Size([37, 768]),
 torch.Size([25, 768]),
 torch.Size([18, 768]),
 torch.Size([24, 768]),
 torch.Size([55, 768]),
 torch.Size([57, 768]),
 torch.Size([24, 768]),
 torch.Size([27, 768]),
 torch.Size([35, 768]),
 torch.Size([43, 768])]

In [21]:
# Mean-pool over the token axis (dim 0) so every document becomes one fixed-size
# vector. The resulting length, 768, is DistilBERT's hidden output size and is
# also the dimensionality the faiss index is created with.
averaged_vectors = [token_matrix.mean(dim=0) for token_matrix in vectors]
[pooled.shape for pooled in averaged_vectors]

[torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768])]

In [22]:
# Define an encoding function to turn a string into a tensor
def encode(document: str) -> torch.Tensor:
    """Embed a string as a single vector by mean-pooling its token embeddings.

    Args:
        document: Text to embed. Input longer than the model's maximum
            sequence length is truncated rather than raising an error.

    Returns:
        A 1-D tensor holding the average of the model's per-token output
        embeddings (length 768 for the checkpoint used in this notebook).
    """
    tokens = tokenizer(document, return_tensors="pt", truncation=True)
    # Inference only: skip autograd bookkeeping, so no .detach() is needed.
    with torch.no_grad():
        # [0] is the per-token output; squeeze(0) removes the batch dimension of 1.
        token_embeddings = model(**tokens)[0].squeeze(0)
    return token_embeddings.mean(dim=0)

In [26]:
# Flat inner-product index wrapped in an ID map so vectors can be stored under
# explicit integer IDs. The dimensionality is read from the model config (768
# for distilbert-base-uncased) so it stays in sync if the checkpoint changes.
# NOTE(review): the embeddings are not L2-normalised anywhere in this notebook,
# so the scores are raw dot products rather than cosine similarities.
index = faiss.IndexIDMap(faiss.IndexFlatIP(model.config.hidden_size))

In [33]:
# Index all the documents as one float32 matrix with one row per document.
# Clear the index first so that re-running this cell does not add every
# document a second time under the same IDs.
index.reset()
index.add_with_ids(
    torch.stack(averaged_vectors).numpy(),
    # the IDs are 0 .. len(documents) - 1, i.e. positions in `documents`
    np.arange(len(documents), dtype=np.int64),
)
assert index.ntotal == len(documents), "index size does not match the number of documents"

In [34]:
# Our search function receives a string, encodes it, and uses the encoded tensor
# to search the index, returning the k most similar documents
def search(query: str, k=1):
    """Return the top-k indexed documents for a query, with their scores.

    Args:
        query: Free-text query; it is embedded with `encode`.
        k: Maximum number of documents to return.

    Returns:
        A list of (document_text, score) tuples in the order faiss returns
        them. The list is shorter than k when the index holds fewer than k
        vectors.
    """
    # faiss expects a 2-D array of queries, so add a leading batch dimension.
    encoded_query = encode(query).unsqueeze(dim=0).numpy()
    scores, ids = index.search(encoded_query, k)
    # faiss pads missing results with label -1; without this filter
    # documents[-1] would silently return the last document.
    return [
        (documents[doc_id], score)
        for doc_id, score in zip(ids[0], scores[0])
        if doc_id >= 0
    ]

In [44]:
# Choose a question by position and print its best matches from the index.
q_num = 1
num_similar_docs = 5

question = questions.question_text[q_num]
# Two separate statements: joining the prints with a comma builds a tuple,
# which makes the cell echo a stray `(None, None)`.
print("Question: {}".format(question))
print("Answer: {}".format(search(question, num_similar_docs)))

Question: Which diseases can be transmitted by animals?
Answer: [('A pandemic is an epidemic of an infectious disease that has spread across a large region, for instance multiple continents or worldwide, affecting a substantial number of people.', 54.0495), ('Cholera is an infection of the small intestine by some strains of the bacterium Vibrio cholerae.', 50.804134), ('Current pandemics include COVID-19 (SARS-CoV-2) and HIV/AIDS.', 50.652287), ('As of 2018, approximately 37.9 million people are infected with HIV globally.', 50.516018), ('The Spanish flu, also known as the 1918 flu pandemic, was an unusually deadly influenza pandemic caused by the H1N1 influenza A virus.', 48.97045)]


(None, None)