In [1]:
# based on:
# https://www.vikas.sh/post/semantic-search-guide

In [None]:
import requests
import re

import faiss
from ftfy import fix_text
from sentence_transformers import SentenceTransformer, util
import torch

  from .autonotebook import tqdm as notebook_tqdm


## Basics

In [6]:
# Download the book ...
# book = requests.get("https://www.gutenberg.org/files/2701/2701-0.txt").text
# ... or get it from a local file
with open("moby-dick.txt", encoding="utf-8") as f:
    book = f.read()

# Remove all text before the first chapter; [-1] keeps the text that follows
# the last occurrence of the chapter title.
book = book.split("CHAPTER 1. Loomings.")[-1]

# Split into passages wherever there is at least one blank line, i.e. two or
# more consecutive line breaks. Line breaks are counted as units (LF or CRLF)
# instead of as raw characters: a blank line is 4 characters with CRLF endings
# but only 2 with LF endings, and open() in text mode translates CRLF to LF by
# default. A character-count pattern such as "[\n\r]{3,}" therefore splits on
# paragraphs only when the text still contains CRLF.
passages = re.split(r"(?:\r?\n){2,}", book)

In [8]:
# Clean up the passages: trim surrounding whitespace, drop anything of 100
# characters or fewer, and run the survivors through ftfy's fix_text.
passages = [
    fix_text(text)
    for text in (chunk.strip() for chunk in passages)
    if len(text) > 100
]
passages[:2]

['Call me Ishmael. Some years ago—never mind how long precisely—having\nlittle or no money in my purse, and nothing particular to interest me\non shore, I thought I would sail about a little and see the watery part\nof the world. It is a way I have of driving off the spleen and\nregulating the circulation. Whenever I find myself growing grim about\nthe mouth; whenever it is a damp, drizzly November in my soul; whenever\nI find myself involuntarily pausing before coffin warehouses, and\nbringing up the rear of every funeral I meet; and especially whenever\nmy hypos get such an upper hand of me, that it requires a strong moral\nprinciple to prevent me from deliberately stepping into the street, and\nmethodically knocking people\'s hats off—then, I account it high time to\nget to sea as soon as I can. This is my substitute for pistol and ball.\nWith a philosophical flourish Cato throws himself upon his sword; I\nquietly take to the ship. There is nothing surprising in this. If they\nbut k

In [10]:
# Load the sentence-embedding model by its hub name. Loading fails with an
# OSError when the files are neither reachable online nor in the local cache.
# Use device='cuda' to use GPU
model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

No sentence-transformers model found with name sentence-transformers/all-MiniLM-L6-v2. Creating a new one with mean pooling.


OSError: We couldn't connect to 'https://huggingface.co' to load the files, and couldn't find them in the cached files.
Checkout your internet connection or see how to run the library in offline mode at 'https://huggingface.co/docs/transformers/installation#offline-mode'.

In [None]:
# Embed every passage in a single call; convert_to_tensor=True requests the
# result as a tensor.
embeddings = model.encode(passages, convert_to_tensor=True)

In [None]:
# Free-text example queries; each one is searched against the passages.
query_text = ["A really big whale", "Adrift alone in the ocean"]

In [None]:
# Encode the query text with the same model that encoded the passages
query = model.encode(query_text, convert_to_tensor=True)

# Find the similar passages: cosine similarity of each query against the
# passage embeddings
cos_scores = util.cos_sim(query, embeddings)
# Displayed as the cell output; presumably (num queries, num passages) — confirm
cos_scores.shape

In [None]:
# Pick the best-scoring passage per query along the last dimension.
# Raise k to retrieve more passages per query.
top_results = cos_scores.topk(k=1, dim=-1)
# topk yields (values, indices); keep the indices as plain nested lists.
indices = top_results[1].tolist()

# Display the results


def display_results(indices, queries=None, corpus=None):
    """Print each query followed by the passages retrieved for it.

    Args:
        indices: One row per query; each row is an iterable of integer
            positions into ``corpus``.
        queries: Query strings aligned with the rows of ``indices``.
            Defaults to the notebook-level ``query_text``.
        corpus: Sequence that the positions refer to. Defaults to the
            notebook-level ``passages``.

    Negative positions are skipped instead of being used as Python
    "count from the end" indices, so a "no result" placeholder such as the
    -1 that FAISS returns never shows up as the last passage.
    """
    if queries is None:
        queries = query_text
    if corpus is None:
        corpus = passages

    for i, result in enumerate(indices):
        print(f"Query: {queries[i]}")
        for idx in result:
            if idx < 0:
                # Placeholder for a missing neighbour, not a real position.
                continue
            print(f"Passage: {corpus[idx]}")
        print()


# Show the passages picked by torch.topk for each query
display_results(indices)

## Scaling up

### Don't re-compute embeddings

In [None]:
# Save embeddings to "embeddings.pt" (relative to the working directory) so
# they can be reloaded instead of re-encoding the passages
torch.save(embeddings, "embeddings.pt")

In [None]:
# Load embeddings back from the file written by torch.save.
# NOTE(review): torch.load can unpickle arbitrary objects depending on the
# PyTorch version's defaults — only load files you created or trust.
new_embeddings = torch.load("embeddings.pt")

In [None]:
# Sanity check: scoring the queries against the reloaded embeddings must give
# (numerically) the same similarities as scoring against the originals.
torch.allclose(
    util.cos_sim(query, new_embeddings),
    util.cos_sim(query, embeddings),
)

### Index and store embeddings in db

In [None]:
# Initialize an HNSW index sized to the embedding width (embeddings.shape[1]).
# NOTE(review): the second argument (64) is presumably HNSW's M, the number of
# graph links per node — confirm against the FAISS documentation.
index = faiss.IndexHNSWFlat(embeddings.shape[1], 64)

In [None]:
# Add our embeddings. FAISS consumes NumPy arrays, and Tensor.numpy() only
# works for tensors in host memory, so move the tensor to the CPU first
# (a no-op when it is already there, e.g. when the model runs on the CPU).
# Note: add() appends, so re-running this cell inserts the vectors again.
index.add(embeddings.cpu().numpy())

In [None]:
# Query the index. The query embeddings are a torch tensor, so hand FAISS a
# CPU NumPy array, the same way the passage embeddings were added.
# Note: `indices` is rebound here to the array returned by the search.
distances, indices = index.search(query.cpu().numpy(), k=1)

In [None]:
# Display the results; `indices` now holds the output of index.search
display_results(indices)

In [None]:
# Persist the index to "index.faiss" in the working directory;
# it can be loaded again with faiss.read_index("index.faiss")
faiss.write_index(index, "index.faiss")