# Description
Notebook demonstrating semantic search. Documents in a corpus are loaded into memory and split into chunks,
and the chunks are converted to vector embeddings. User queries are embedded with the same model, and the most
similar document chunks are fetched as search results.

- Image: *Data Science 3.0*
- Assumes corpus can be held in-memory

# Installations
Don't mind the warnings.

In [2]:
!pip install --upgrade pip --quiet
!pip install langchain --quiet
!pip install transformers[torch] --no-cache-dir --quiet
!pip install sentence-transformers --quiet

[0m

# Imports

In [3]:
import os
import torch
import logging
import langchain
import nltk, time
import numpy as np
import ipywidgets as widgets
from tqdm.notebook import tqdm
from transformers import AutoTokenizer
from sklearn.neighbors import NearestNeighbors
from sentence_transformers import SentenceTransformer
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Download the 'punkt' resource that nltk's sentence tokenizer (used later for
# sentence splitting) relies on.
# NOTE(review): some newer NLTK releases look for 'punkt_tab' instead — confirm
# against the NLTK version in the target image.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

# Document Embedding
We load, chunk, and then embed in this section with an embedding model of choice.

## Loading Documents

In [4]:
# Directory holding the corpus, one document per file
source_dir = './data/wikipedia'

# Map each filename to a dict holding its full text under 'raw'.
# Filenames are sorted because os.listdir returns entries in arbitrary order,
# which would otherwise let chunk/embedding order vary between runs.
docs = {}
for filename in sorted(os.listdir(source_dir)):
    # Skip sub-directories and anything else that is not a regular file
    if os.path.isfile(filepath := os.path.join(source_dir, filename)):
        # Explicit encoding so decoding does not depend on the platform locale
        # (assumes the corpus files are UTF-8 — TODO confirm)
        with open(filepath, 'r', encoding='utf-8') as inf:
            docs[filename] = {'raw': inf.read()}

## Select Embedding Model
Set up a sentence embedding model to process corpus documents.

In [5]:
# Embedding models selectable in this notebook, keyed by model name.
#   base_repo  - Hugging Face Hub namespace; the model is loaded as '<base_repo>/<model name>'
#   similarity - scoring applied at search time: 'cosine' or 'dot'
model_options = {
    'msmarco-distilbert-base-v4': {
        'base_repo': 'sentence-transformers',
        'similarity': 'cosine'
    },
    'msmarco-distilbert-base-v3': {
        'base_repo': 'sentence-transformers',
        'similarity': 'cosine'
    },
    'msmarco-roberta-base-v3': {
        'base_repo': 'sentence-transformers',
        'similarity': 'cosine'
    },
    'msmarco-distilbert-base-tas-b': {
        # The model with this name is published under the sentence-transformers
        # namespace; the original author's repo uses a different model name, so
        # 'sebastian-hofstaetter/msmarco-distilbert-base-tas-b' does not resolve.
        # NOTE(review): confirm against the Hub before merging.
        'base_repo': 'sentence-transformers',
        'similarity': 'dot'
    },
    'msmarco-roberta-base-ance-firstp': {
        'base_repo': 'sentence-transformers',
        'similarity': 'dot'
    }
}

In [6]:
# Model picker listing every key of model_options
select_model = widgets.Dropdown(
    description='Select an Embedding Model:',
    options=tuple(model_options),
    style={'description_width': 'max-content'},
    layout={'width': 'max-content'},
)

# Render the picker below this cell
display(select_model)

Dropdown(description='Select an Embedding Model:', layout=Layout(width='max-content'), options=('msmarco-disti…

In [7]:
# Snapshot the dropdown's current value; later cells read this variable, not the widget
selected_model = select_model.value

# Run on the GPU when torch can see one, otherwise fall back to the CPU
device = 'cpu' if not torch.cuda.is_available() else 'cuda'

# Tokenizer and sentence-embedding model are both loaded from '<base_repo>/<model name>'
model_repo_id = '/'.join((model_options[selected_model]['base_repo'], selected_model))
tokenizer = AutoTokenizer.from_pretrained(model_repo_id)
model = SentenceTransformer(model_repo_id, device=device)
print(f'Loaded {selected_model}. Inference performed on {device}.')

Loaded msmarco-distilbert-base-v4. Inference performed on cpu.


# Process Documents
Chunk the documents, convert chunks into embeddings.

In [8]:
# Settings shared by both chunking sliders
slider_common = dict(
    step=5,
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

# Slider for the chunk size handed to the text splitter
chunk_size_slider = widgets.IntSlider(
    value=100,
    min=50,
    max=500,
    description='chunk_size:',
    **slider_common,
)

# Slider for the overlap between consecutive chunks
chunk_overlap_slider = widgets.IntSlider(
    value=10,
    min=5,
    max=50,
    description='chunk_overlap:',
    style={'description_width': 'max-content'},
    **slider_common,
)

# Render both sliders, size first
for slider in (chunk_size_slider, chunk_overlap_slider):
    display(slider)

IntSlider(value=100, continuous_update=False, description='chunk_size:', max=500, min=50, step=5)

IntSlider(value=10, continuous_update=False, description='chunk_overlap:', max=50, min=5, step=5, style=Slider…

In [9]:
# Parallel lists: docs_chunks[i] is a chunk of text and docs_sources[i] is the
# filename of the document it was cut from
docs_sources = []
docs_chunks = []

# Put every sentence on its own line so the recursive splitter can break on
# newlines, i.e. at sentence boundaries
for doc in docs:
    docs[doc]['sent_split'] = '\n'.join(nltk.tokenize.sent_tokenize(docs[doc]['raw']))

# Chunk size and overlap are measured with the embedding model's tokenizer
# (length of 'input_ids') rather than in characters
splitter = RecursiveCharacterTextSplitter(
    chunk_size = chunk_size_slider.value,
    chunk_overlap = chunk_overlap_slider.value,
    length_function = lambda s: len(tokenizer(s)['input_ids'])
)

# Silence tokenizer messages below ERROR while measuring text lengths
# (presumably the over-long-sequence warning), then put the logger back to
# whatever level it had, even if chunking fails part-way
tokenizer_logger = logging.getLogger('transformers.tokenization_utils_base')
previous_level = tokenizer_logger.level
tokenizer_logger.setLevel(logging.ERROR)
try:
    for doc in docs:
        chunk_texts = [d.page_content for d in splitter.create_documents([docs[doc]['sent_split']])]
        docs_chunks += chunk_texts
        docs_sources += [doc] * len(chunk_texts)
finally:
    tokenizer_logger.setLevel(previous_level)

In [10]:
%%time

def batch(iterable, n=1):
    """Yield consecutive slices of ``iterable`` holding at most ``n`` items each.

    ``iterable`` must support ``len()`` and slicing. The final slice is shorter
    when the length is not a multiple of ``n``.
    """
    total = len(iterable)
    for start in range(0, total, n):
        stop = start + n
        # Clamp the end of the last slice to the sequence length
        yield iterable[start:(stop if stop < total else total)]

# Encode the chunks in batches of bsize, keeping one model.encode result per batch
bsize = 32
# Ceiling division: the progress-bar total must equal the real number of batches,
# including when len(docs_chunks) is an exact multiple of bsize
n_batches = (len(docs_chunks) + bsize - 1) // bsize
docs_embeddings_list = []
for chunk_batch in tqdm(batch(docs_chunks, bsize), desc='Embedding chunks', total=n_batches):
    docs_embeddings_list.append(model.encode(chunk_batch))

Embedding chunks:   0%|          | 0/14 [00:00<?, ?it/s]

CPU times: user 32.7 s, sys: 3.47 s, total: 36.1 s
Wall time: 37.9 s


In [11]:
# Stack the per-batch arrays into one matrix (one row per chunk) and move it to
# the selected device as a torch tensor
stacked_embeddings = np.vstack(docs_embeddings_list)
docs_embeddings = torch.from_numpy(stacked_embeddings).to(device)

# Copy with every row scaled to unit L2 norm, used for cosine similarity
row_norms = docs_embeddings.norm(dim=1, keepdim=True)
docs_embeddings_norm = docs_embeddings / row_norms

## Indexing

Load the chunks into searchable data structures. In a production environment, these data structures would be replaced by Elasticsearch, OpenSearch, or an alternative vector search index.

In [12]:
# Sanity check: exactly one source label and one embedding row per chunk
assert len(docs_chunks) == len(docs_sources)
assert len(docs_sources) == docs_embeddings.shape[0]

# Search 
def search(query, res_count=5):
    """Return the corpus chunks most similar to ``query``.

    The query is embedded with the notebook-level ``model`` and scored against
    every chunk embedding, using the similarity configured for
    ``selected_model`` in ``model_options``.

    Args:
        query: Query text to embed.
        res_count: Maximum number of results to return; capped at the number
            of indexed chunks.

    Returns:
        dict with parallel entries ordered from most to least similar:
        'sources' (list of source filenames), 'chunks' (list of chunk texts)
        and 'similarities' (1-D tensor of scores).

    Raises:
        ValueError: If the configured similarity is neither 'dot' nor 'cosine'.
    """
    # Embed the query and move it to the same device as the chunk embeddings
    q_embed = torch.from_numpy(model.encode([query])[0]).to(device)

    # Score every chunk against the query
    similarity = model_options[selected_model]['similarity']
    if similarity == 'dot':
        similarities = docs_embeddings @ q_embed
    elif similarity == 'cosine':
        # Rows of docs_embeddings_norm are unit length, so normalising the
        # query turns the dot product into cosine similarity
        similarities = docs_embeddings_norm @ (q_embed / q_embed.norm(dim=0))
    else:
        raise ValueError(
            f'Unsupported similarity {similarity!r} configured for model {selected_model!r}'
        )

    # topk raises if k exceeds the number of scores, so cap it for small corpora.
    # With its default sorted=True the values come back in descending order,
    # so no extra sort is needed.
    top_k = min(res_count, similarities.shape[0])
    values, indices = torch.topk(similarities, top_k)

    # Plain ints for indexing the Python lists
    top_indices = indices.tolist()
    return {
        'sources': [docs_sources[i] for i in top_indices],
        'chunks': [docs_chunks[i] for i in top_indices],
        'similarities': values
    }

# Search

In [13]:
# Text box for the query and an output area that receives the printed results
search_bar = widgets.Text(placeholder='Search')
results_field = widgets.Output()

# Render the text box first, the results area underneath
display(search_bar, results_field)

def handle_search(query):
    """Run ``query`` through ``search`` and print the timed results into ``results_field``.

    Previous output is always cleared first; an empty query leaves the field cleared.
    """
    results_field.clear_output()
    if not query:
        return

    # Time only the in-memory search call
    start = time.time()
    res = search(query)
    end = time.time()

    # Everything printed inside this context is captured by the output widget
    with results_field:
        print(f'Search time: {end - start:.2f}s\n')
        hits = zip(res['sources'], res['chunks'], res['similarities'])
        for source, chunk, similarity in hits:
            print(f'Source: {source} ({similarity:.4f})')
            print(f'------------------\n{chunk}\n')
        
def handle_submit(sender):
    """Observer callback: search for whatever text is currently in ``search_bar``."""
    handle_search(search_bar.value)

# Turn off continuous updates and observe 'value', so the handler runs when the
# query is submitted
search_bar.continuous_update = False
search_bar.observe(handle_submit, names='value')

Text(value='', placeholder='Search')

Output()