# Handy Utils to do Vector Search on Collections

## Configuration

In [1]:
class MyConfig:
    """Empty namespace class; the notebook settings are attached as attributes below."""


MY_CONFIG = MyConfig()

# --- Embedding settings: model name and the embedding length configured for it
MY_CONFIG.EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
MY_CONFIG.EMBEDDING_LENGTH = 384

# --- Vector database settings: where the database lives and which collection to query
MY_CONFIG.DB_URI = './rag_1_dpk.db'  # For embedded instance
#MY_CONFIG.DB_URI = 'http://localhost:19530'  # For Docker instance
MY_CONFIG.COLLECTION_NAME = 'dpk_walmart_docs'

## Connect to Vector Database

Milvus can run as an embedded instance and is easy to use.

<span style="color:blue;">Note: If you encounter an error about being unable to load the database, try this: </span>

- <span style="color:blue;">In **vscode** : **restart the kernel** of previous notebook. This will release the db.lock </span>
- <span style="color:blue;">In **Jupyter**: Do `File --> Close and Shutdown Notebook` of previous notebook. This will release the db.lock</span>
- <span style="color:blue;">Then re-run this cell</span>


In [2]:
from pymilvus import MilvusClient

# Open a client on the configured URI (the config marks the local .db path as
# the embedded instance and the http:// URL as the Docker instance).
milvus_client = MilvusClient(MY_CONFIG.DB_URI)

# Confirmation message; this line is only reached if the client was created without raising.
print ("✅ Connected to Milvus instance:", MY_CONFIG.DB_URI)

✅ Connected to Milvus instance: ./rag_1_dpk.db


## Setup Embeddings

Two choices here. 

1. use sentence transformers directly
2. use Milvus model wrapper

In [3]:
## Option 1 - use sentence transformers directly

# Model downloads are routed through the hf-mirror.com mirror.
# If https://huggingface.co/ is reachable directly, comment out the HF_ENDPOINT assignment below.
# Note the assignment happens before sentence_transformers is imported.
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from sentence_transformers import SentenceTransformer

# Load the embedding model named in the configuration.
embedding_model = SentenceTransformer(MY_CONFIG.EMBEDDING_MODEL)

def get_embeddings(str):
    """Encode text with the notebook-level `embedding_model`.

    Args:
        str: The text to embed. The name shadows the builtin `str`; it is kept
            so that existing callers are unaffected.

    Returns:
        The result of `embedding_model.encode(...)` for the input, requested
        with `normalize_embeddings=True`.
    """
    return embedding_model.encode(str, normalize_embeddings=True)

  from tqdm.autonotebook import tqdm, trange


In [4]:
## Option 2 - Milvus model

# Model downloads are routed through the hf-mirror.com mirror; comment out the
# HF_ENDPOINT assignment if https://huggingface.co/ is reachable directly.
# The endpoint is set BEFORE importing pymilvus.model so it is already in the
# environment when the Hugging Face libraries get loaded, even when this cell
# is run without the Option 1 cell.
import os
os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'

from pymilvus import model

# Alternative: use the default embedding function bundled with the wrapper.
# embedding_fn = model.DefaultEmbeddingFunction()

## initialize the SentenceTransformerEmbeddingFunction
embedding_fn = model.dense.SentenceTransformerEmbeddingFunction(
    model_name = MY_CONFIG.EMBEDDING_MODEL,
    device='cpu' # this will work on all devices (KIS)
)

In [5]:
# Test Embeddings: embed the same text with both options and verify the vector
# size against the configured EMBEDDING_LENGTH (not just print it).
text = 'Paris 2024 Olympics'
embeddings = get_embeddings(text)
print ('sentence transformer : embeddings len =', len(embeddings))
print ('sentence transformer : embeddings[:5] = ', embeddings[:5])
assert len(embeddings) == MY_CONFIG.EMBEDDING_LENGTH, \
    "sentence transformer embedding length does not match MY_CONFIG.EMBEDDING_LENGTH"

# The wrapper is called with a list of texts; index 0 is the entry for `text`.
embeddings = embedding_fn([text])
print ('milvus model wrapper : embeddings len =', len(embeddings[0]))
print ('milvus model wrapper  : embeddings[:5] = ', embeddings[0][:5])
assert len(embeddings[0]) == MY_CONFIG.EMBEDDING_LENGTH, \
    "milvus model wrapper embedding length does not match MY_CONFIG.EMBEDDING_LENGTH"

sentence transformer : embeddings len = 384
sentence transformer : embeddings[:5] =  [ 0.02468893  0.10352128  0.02752643 -0.08551716 -0.01412826]
milvus model wrapper : embeddings len = 384
milvus model wrapper  : embeddings[:5] =  [ 0.02468893  0.10352128  0.02752643 -0.08551716 -0.01412826]


## Do A  Vector Search

We will do this to verify data

In [6]:
import random


## helper function to perform vector search
def do_vector_search(query, limit=5, output_fields=("filename", "page_number", "text")):
    """Embed a query string and search the configured Milvus collection.

    Args:
        query: The text to search for.
        limit: Maximum number of hits to request (default 5, the value that
            was previously hard-coded).
        output_fields: Names of the stored fields to return with each hit.
            Defaults to the fields that `print_search_results` reads.

    Returns:
        The value returned by `milvus_client.search` for a single query vector.
    """
    query_vectors = [get_embeddings(query)]  # Option 1 - using sentence transformers
    # query_vectors = embedding_fn([query])  # using Milvus model

    return milvus_client.search(
        collection_name=MY_CONFIG.COLLECTION_NAME,  # target collection
        data=query_vectors,  # query vectors
        limit=limit,  # number of returned entities
        output_fields=list(output_fields),  # specifies fields to be returned
    )
## ----

def print_search_results(results):
    """Print the hits stored in the first entry of a search result.

    Args:
        results: A sequence whose first element is a list of hits. Each hit is
            a mapping with a 'distance' value and an 'entity' mapping holding
            'filename', 'page_number' and 'text'.
    """
    hits = results[0]
    print('num results : ', len(hits))

    for i, hit in enumerate(hits):
        print(f'------ result {i+1} --------')
        print('search score:', hit['distance'])
        entity = hit['entity']
        print('filename:', entity['filename'])
        print('page number:', entity['page_number'])
        print('text:\n', entity['text'])
        print()

In [7]:
# Sample question 1: run it through the vector search helper and print the hits.
query = "What was Walmart's revenue in 2023?"

results = do_vector_search (query)
print_search_results(results)

num results :  5
------ result 1 --------
search score: 0.5978392958641052
filename: Walmart_2024_copy.pdf
page number: 99
text:
 Stock Performance Chart
Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 2023 = 161.45. S&P 500 Index, 2024 = 195.06. S&P 500 Consumer   Discretionary, 2019 = . S&P 500 Consumer   Discretionary, 2020 = . S&P 500 Consumer   Discretionary, 2021 = . S&P 500 Consumer   Discretionary, 2022 = . S&P 500 Consumer   Discretionary, 2023 = . S&P 500 Consumer   Discretionary, 2024 = . Discretionary   Distribution &   RiliId, 2019 = 100.00. Discretionary   Distribution &   RiliId, 2020 = 117.54. Discretionary   Distribution &   RiliId, 2021 = 166.19. Discretionary   Distribution &   RiliId, 2022 = 180.56. Disc

In [8]:
# Sample question 2: same flow as the previous cell, with a different query.
query = "How many distribution facilities does Walmart have?"

results = do_vector_search (query)
print_search_results(results)

num results :  5
------ result 1 --------
search score: 0.5755810141563416
filename: Walmart_2024_copy.pdf
page number: 2
text:
 "At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated
through, up to and including 2030. Additional qualifying information can be found by visiting http://corporate.walmart.com/purpose/esgreport.

------ result 2 --------
search score: 0.502342700958252
filename: Walmart_2024_copy.pdf
page number: 2
text:
 "At Walmart, we're a people-led, tech-powered omnichannel retailer dedicated
1B Tonnes

------ result 3 --------
search score: 0.5014065504074097
filename: Walmart_2024_copy.pdf
page number: 99
text:
 Stock Performance Chart
Walmart Inc., 2019 = $100.00. Walmart Inc., 2020 = $120.27. Walmart Inc., 2021 = $148.41. Walmart Inc., 2022 = $148.47. Walmart Inc., 2023 = $153.58. Walmart Inc., 2024 = $177.30. S&P 500 Index, 2019 = 100.00. S&P 500 Index, 2020 = 121.68. S&P 500 Index, 2021 = 142.67. S&P 500 Index, 2022 = 175.90. S&P 500 Index, 

In [9]:
# Uncomment to close the Milvus client when you are done with this notebook.
# milvus_client.close()