In [2]:
import torch
from transformers import AutoModel, AutoTokenizer

# Single source of truth for the checkpoint: the tokenizer and the model must
# always be loaded from the same checkpoint so that token ids match the
# model's embedding table.
MODEL_NAME = "distilbert-base-uncased"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModel.from_pretrained(MODEL_NAME)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.bias', 'vocab_projector.bias', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Toy corpus: five short English sentences that are embedded below and later
# looked up by their position in this list.
documents = [
    "That restaurant was not as good as the last movie I watched.",
    "I'm selling a used car in good condition",
    "Food was okay, the rest so so",
    "I love cats, but don't really like hyenas",
    "On the road, you must be careful",
]

# Embed each document on its own (batch size 1, so no padding is involved).
# The forward pass runs under `torch.no_grad()` because this is inference only:
# no autograd graph is built, so the outputs need no `.detach()` afterwards.
with torch.no_grad():
    vectors = [
        # tokenize the document, return it as PyTorch tensors, pass it to the
        # model, take the first output and drop only the batch axis
        model(**tokenizer(document, return_tensors='pt'))[0].squeeze(0)
        for document in documents
    ]

In [4]:
# Check the shape of each document's output tensor before pooling.
[v.size() for v in vectors]

[torch.Size([15, 768]),
 torch.Size([12, 768]),
 torch.Size([10, 768]),
 torch.Size([15, 768]),
 torch.Size([10, 768])]

In [5]:
import torch

# Mean-pool along the first axis so every document collapses to one vector.
averaged_vectors = [token_matrix.mean(dim=0) for token_matrix in vectors]

# Display the pooled shapes to confirm they are all the same size.
[pooled.shape for pooled in averaged_vectors]

[torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768]),
 torch.Size([768])]

In [6]:
def encode(document: str) -> torch.Tensor:
    """Embed a single document as one fixed-size vector.

    The text is tokenized, passed through the model, and the model's
    per-token outputs are averaged along the token axis.

    Args:
        document: Raw text to embed. Input longer than the tokenizer's maximum
            length is truncated instead of failing inside the model.

    Returns:
        A 1-D tensor holding the mean of the model's token vectors.
    """
    # Truncate so over-long text cannot exceed the model's position limit.
    tokens = tokenizer(document, return_tensors='pt', truncation=True)
    # Inference only: skip autograd bookkeeping rather than detaching afterwards.
    with torch.no_grad():
        # squeeze(0) removes just the batch axis (batch size is 1 here).
        token_vectors = model(**tokens)[0].squeeze(0)
    return torch.mean(token_vectors, dim=0)  # average over the token axis

In [7]:
import faiss
import numpy as np

# Convert the pooled PyTorch tensors into a single (n_documents, dim) matrix.
# FAISS expects contiguous float32 input, so enforce that explicitly.
document_matrix = np.ascontiguousarray(
    np.stack([t.numpy() for t in averaged_vectors]), dtype=np.float32
)

# FAISS ids must be int64; np.array(range(...)) would pick the platform's
# default integer type, which is not 64-bit everywhere.
document_ids = np.arange(len(documents), dtype=np.int64)

# Exact inner-product index, wrapped in IndexIDMap so custom ids can be stored.
# The dimension is taken from the data instead of being hard-coded.
# NOTE(review): the vectors are not L2-normalised, so scores are raw inner
# products rather than cosine similarities - confirm this is intended.
index = faiss.IndexIDMap(faiss.IndexFlatIP(document_matrix.shape[1]))

index.add_with_ids(document_matrix, document_ids)

In [8]:
def search(query: str, k: int = 1):
    """Return the indexed documents that best match ``query``.

    Args:
        query: Free text; it is embedded with ``encode`` before searching.
        k: Maximum number of hits to return.

    Returns:
        A list of ``(document, score)`` tuples in the order FAISS returns
        them. The list is shorter than ``k`` when the index holds fewer than
        ``k`` vectors.
    """
    # FAISS searches a batch of queries, so add a leading batch axis of size 1.
    encoded_query = encode(query).unsqueeze(dim=0).numpy()
    scores, ids = index.search(encoded_query, k)
    return [
        (documents[doc_id], score)
        for doc_id, score in zip(ids[0], scores[0])
        # FAISS pads missing results with id -1; without this guard
        # documents[-1] would silently return the last document as a hit.
        if doc_id >= 0
    ]


In [9]:
# Show the document that is used as the query in the next cell.
documents[1]

"I'm selling a used car in good condition"

In [10]:
# Query the index with a document that is itself indexed, asking for the two best hits.
search(documents[1], k=2)

: 

: 