## Semantic Search Tool Exploration - 09/01/2023
- Name the tool with the fastest speed

In [1]:
from annoy import AnnoyIndex
import numpy as np
import pandas as pd
import re
import pickle

from sentence_transformers import SentenceTransformer, util
import torch

pd.set_option('display.max_columns', None)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load test data
df_service = pd.read_parquet('../../../2way_sms_data/all_service_data.parquet')
print(df_service.shape)

(14524269, 40)


In [3]:
np.random.seed(42)
sample_conv_ids = df_service['_id'].unique()[np.random.choice(df_service['_id'].unique().shape[0], 50000)]
df_service_sample = df_service[df_service['_id'].isin(sample_conv_ids)].copy()
print(df_service_sample.shape)
df_service_sample_outbound = df_service_sample[(df_service_sample['messageList.direction']=='outbound') & (df_service_sample['messageList.repName'].isin(['system', 'Billing Bot']))].reset_index(drop=True)
df_service_sample_inbound = df_service_sample[(df_service_sample['messageList.direction']=='inbound') & (~df_service_sample['messageList.body'].isnull())].reset_index(drop=True)
print(df_service_sample_outbound.shape, df_service_sample_inbound.shape)

(349657, 40)
(120565, 40) (137205, 40)


In [5]:
# Corpus with example sentences
corpus = df_service_sample_inbound['messageList.body'].tolist()

### SBERT
Reference: https://www.sbert.net/examples/applications/semantic-search/README.html

In [16]:
%%time
"""
This is a simple application for sentence embeddings: semantic search

We have a corpus with various sentences. Then, for a given query sentence,
we want to find the most similar sentence in this corpus.

This script outputs for various queries the top 5 most similar sentences in the corpus.
"""

embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = embedder.encode(corpus, batch_size=64, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)

with open('corpus_embeddings', 'wb') as pkl:
    pickle.dump(corpus_embeddings, pkl)

#with open('corpus_embeddings', 'rb') as pkl:
#    doc_embedding = pickle.load(pkl)

Batches: 100%|██████████| 2144/2144 [00:27<00:00, 78.35it/s] 


CPU times: user 57.8 s, sys: 12 s, total: 1min 9s
Wall time: 29.9 s


In [17]:
def search(query, top_n):
    
    # Get the query's embedding
    query_embeddings = embedder.encode([query], convert_to_tensor=True, normalize_embeddings=True)
    # Retrieve the nearest neighbors
    query_embeddings = query_embeddings.to('cuda')
    # Format the results
    results = util.semantic_search(query_embeddings, corpus_embeddings, top_k=top_n)[0]
    for result in results:
        result.update(
            {'message': df_service_sample_inbound.loc[result['corpus_id'], 'messageList.body'],
             'conversation_id': df_service_sample_inbound.loc[result['corpus_id'], '_id']})
    return results

In [18]:
%%time
results = search('I want to cancel my policy', top_n=10)

CPU times: user 13.2 ms, sys: 353 µs, total: 13.5 ms
Wall time: 12.8 ms


## Annoy - Build index for Service messages
Reference: https://pypi.org/project/annoy/

In [19]:
%%time
embedder = SentenceTransformer('all-MiniLM-L6-v2')

# Corpus with example sentences
corpus = df_service_sample_inbound['messageList.body'].tolist()
corpus_embeddings = embedder.encode(corpus, batch_size=64, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)

search_index = AnnoyIndex(corpus_embeddings.shape[1], 'angular')
index_inbound = df_service_sample_inbound.index.values

for i in range(len(corpus_embeddings)):
    #search_index.add_item(i, corpus_embeddings[i])
    search_index.add_item(index_inbound[i], corpus_embeddings[i])
search_index.build(100, n_jobs=-1) # 10 trees
search_index.save('service_inbound.ann')

Batches: 100%|██████████| 2144/2144 [00:26<00:00, 81.98it/s] 


CPU times: user 13min 31s, sys: 1min 30s, total: 15min 1s
Wall time: 12min 24s


True

In [20]:
search_index.load('service_inbound.ann')

True

In [21]:
pd.set_option('display.max_colwidth', None)
# texts = df_service_inbound['messageList.body'].values

def search(query, top_n):

    # Get the query's embedding
    query_embed = embedder.encode([query], convert_to_tensor=True, normalize_embeddings=True)

    # Retrieve the nearest neighbors
    similar_item_ids = search_index.get_nns_by_vector(query_embed[0], top_n, include_distances=True)
    # Format the results
    results = {
        'texts':  df_service_sample_inbound.loc[similar_item_ids[0], 'messageList.body'],
        'distance': similar_item_ids[1],
        'corpus_id': similar_item_ids[0],
        'conversation_id': df_service_sample_inbound.loc[similar_item_ids[0], '_id'],
    }
    # print(similar_item_ids)
    return results

In [22]:
%%time
query = 'I want to cancel my policy'
results = search(query, top_n=100)

CPU times: user 19.7 ms, sys: 7.99 ms, total: 27.7 ms
Wall time: 27.1 ms


## Faiss - Build index for Service messages
Reference: https://www.pinecone.io/learn/series/faiss/faiss-tutorial/

In [6]:
import faiss

In [7]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')
corpus_embeddings = embedder.encode(corpus, batch_size=64, show_progress_bar=True, convert_to_tensor=True, normalize_embeddings=True)
d = corpus_embeddings.shape[1]
print(d)

2023-09-05 13:23:44.856638: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Batches: 100%|██████████| 2144/2144 [00:29<00:00, 72.69it/s] 


384


### IndexFlatL2 - exhaustive search

In [8]:
index = faiss.IndexFlatL2(d)
index.is_trained

True

In [17]:
index.add(corpus_embeddings.cpu().numpy())
index.ntotal

137205

In [23]:
%%time
k = 4
xq = embedder.encode(["I want to cancel my policy"], normalize_embeddings=True)
D, I = index.search(xq, k)  # search
print(I)
print(df_service_sample_inbound['messageList.body'].iloc[I[0]])

[[  842   895 11037  7533]]
842      I want to cancel my policy
895      I want to cancel my policy
11037    I want to cancel my policy
7533     I want to cancel my policy
Name: messageList.body, dtype: object
CPU times: user 78.5 ms, sys: 0 ns, total: 78.5 ms
Wall time: 77.1 ms


### IndexFlatL2 + IndexIVFFlat

In [24]:
nlist = 50  # how many cells
quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist)

In [25]:
index.is_trained

False

In [27]:
index.train(corpus_embeddings.cpu().numpy())
index.is_trained

True

In [28]:
index.add(corpus_embeddings.cpu().numpy())
index.ntotal

137205

In [29]:
%%time
k = 4
xq = embedder.encode(["I want to cancel my policy"], normalize_embeddings=True)
D, I = index.search(xq, k)  # search
print(I)
print(df_service_sample_inbound['messageList.body'].iloc[I[0]])

[[  842  7533 11037   895]]
842      I want to cancel my policy
7533     I want to cancel my policy
11037    I want to cancel my policy
895      I want to cancel my policy
Name: messageList.body, dtype: object
CPU times: user 15 ms, sys: 0 ns, total: 15 ms
Wall time: 13.3 ms


In [38]:
index.nprobe = 20

In [39]:
%%time
k = 4
xq = embedder.encode(["I want to cancel my policy"], normalize_embeddings=True)
D, I = index.search(xq, k)  # search
print(I)
print(df_service_sample_inbound['messageList.body'].iloc[I[0]])

[[  842  7533 11037   895]]
842      I want to cancel my policy
7533     I want to cancel my policy
11037    I want to cancel my policy
895      I want to cancel my policy
Name: messageList.body, dtype: object
CPU times: user 32.3 ms, sys: 6.61 ms, total: 39 ms
Wall time: 37.6 ms


### IndexFlatL2 + IndexIVFFlat (IVF ) + Product Quantization (PQ)

In [40]:
m = 8  # number of centroid IDs in final compressed vectors
bits = 8 # number of bits in each centroid

quantizer = faiss.IndexFlatL2(d)  # we keep the same L2 distance flat index
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, bits)

In [41]:
index.is_trained

False

In [42]:
index.train(corpus_embeddings.cpu().numpy())
index.is_trained

True

In [43]:
index.add(corpus_embeddings.cpu().numpy())
index.ntotal

137205

In [44]:
index.nprobe = 20

In [45]:
%%time
k = 4
xq = embedder.encode(["I want to cancel my policy"], normalize_embeddings=True)
D, I = index.search(xq, k)  # search
print(I)
print(df_service_sample_inbound['messageList.body'].iloc[I[0]])

[[333 377 895 842]]
333    I want to cancel  this policy
377    Hi I need to cancel my policy
895       I want to cancel my policy
842       I want to cancel my policy
Name: messageList.body, dtype: object
CPU times: user 12.5 ms, sys: 38 µs, total: 12.5 ms
Wall time: 10.8 ms


In [50]:
# Save index locally https://github.com/facebookresearch/faiss/issues/2078
from faiss import write_index, read_index

In [47]:
write_index(index, "faiss_large.index")

In [48]:
index_load = read_index("faiss_large.index")

In [49]:
%%time
k = 4
xq = embedder.encode(["I want to cancel my policy"], normalize_embeddings=True)
D, I = index_load.search(xq, k)  # search
print(I)
print(df_service_sample_inbound['messageList.body'].iloc[I[0]])

[[333 377 895 842]]
333    I want to cancel  this policy
377    Hi I need to cancel my policy
895       I want to cancel my policy
842       I want to cancel my policy
Name: messageList.body, dtype: object
CPU times: user 12.7 ms, sys: 77 µs, total: 12.8 ms
Wall time: 11 ms
