# Semantic vs Keyword search

In [1]:
%%capture
from pymilvus import MilvusClient, DataType, Function, FunctionType

# Open the client against the local database file used throughout this notebook.
client = MilvusClient(uri="./milvus_demo.db")

In [2]:
from sentence_transformers import SentenceTransformer

# Embedding model; it encodes both the stored sentences and the search query
# for the `dense` vector field.
model = SentenceTransformer(model_name_or_path='multi-qa-mpnet-base-cos-v1')

In [3]:
schema = client.create_schema()

# One entry per field; each dict holds the keyword arguments for schema.add_field.
field_specs = [
    dict(field_name="id", datatype=DataType.INT64, is_primary=True, auto_id=True),
    dict(field_name="text", datatype=DataType.VARCHAR, max_length=2048, enable_analyzer=True),
    dict(field_name="sparse", datatype=DataType.SPARSE_FLOAT_VECTOR),
    dict(field_name="dense", datatype=DataType.FLOAT_VECTOR, dim=768),
]
for spec in field_specs:
    schema.add_field(**spec)

# Bare expression so the notebook displays the schema as the cell's output.
schema

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>}, {'name': 'dense', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False}

In [4]:
# BM25 function: reads the raw text field and writes into the sparse vector field.
bm25_function = Function(
    function_type=FunctionType.BM25,  # BM25 function type
    name="text_bm25_emb",  # Name under which the function is registered
    input_field_names=["text"],  # VARCHAR field that holds the raw text
    output_field_names=["sparse"],  # SPARSE_FLOAT_VECTOR field reserved for the generated embeddings
)

# Last expression of the cell, so the notebook shows the schema with the function attached.
schema.add_function(bm25_function)

{'auto_id': False, 'description': '', 'fields': [{'name': 'id', 'description': '', 'type': <DataType.INT64: 5>, 'is_primary': True, 'auto_id': True}, {'name': 'text', 'description': '', 'type': <DataType.VARCHAR: 21>, 'params': {'max_length': 2048, 'enable_analyzer': True}}, {'name': 'sparse', 'description': '', 'type': <DataType.SPARSE_FLOAT_VECTOR: 104>, 'is_function_output': True}, {'name': 'dense', 'description': '', 'type': <DataType.FLOAT_VECTOR: 101>, 'params': {'dim': 768}}], 'enable_dynamic_field': False, 'functions': [{'name': 'text_bm25_emb', 'description': '', 'type': <FunctionType.BM25: 1>, 'input_field_names': ['text'], 'output_field_names': ['sparse'], 'params': {}}]}

In [5]:
index_params = client.prepare_index_params()

# Index on the `sparse` field (the output field of the BM25 function).
sparse_index_options = {
    "inverted_index_algo": "DAAT_MAXSCORE",
    "bm25_k1": 1.2,
    "bm25_b": 0.75,
}
index_params.add_index(
    field_name="sparse",
    metric_type="BM25",
    index_type="SPARSE_INVERTED_INDEX",
    params=sparse_index_options,
)

# Index on the `dense` field, compared with the inner-product (IP) metric.
dense_index_options = {
    "index_name": "text_dense_index",
    "index_type": "AUTOINDEX",
    "metric_type": "IP",
}
index_params.add_index(field_name="dense", **dense_index_options)

In [6]:
# Start from a clean slate: remove a collection left over from a previous run.
already_exists = client.has_collection("semantic_vs_keyword_search")
if already_exists:
    client.drop_collection("semantic_vs_keyword_search")

In [7]:
# Create the collection from the schema and index definitions prepared in the
# earlier cells.
creation_args = {
    "collection_name": 'semantic_vs_keyword_search',
    "schema": schema,
    "index_params": index_params,
}
client.create_collection(**creation_args)

In [8]:
# Adapted from Hands-On Large Language Models by Jay Alammar, Maarten Grootendorst


# Sample passage; it is split into sentences below and each sentence is stored
# as one row of the collection.
text = '''
Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan. 
It stars Matthew McConaughey, Anne Hathaway, Jessica Chastain, Bill Irwin, Ellen Burstyn, Matt Damon, and Michael Caine. 
Set in a dystopian future where humanity is struggling to survive, the film follows a group of astronauts who travel through a wormhole near Saturn in search of a new home for mankind.

Brothers Christopher and Jonathan Nolan wrote the screenplay, which had its origins in a script Jonathan developed in 2007. 
Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar. 
Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm. 
Principal photography began in late 2013 and took place in Alberta, Iceland, and Los Angeles. 
Interstellar uses extensive practical and miniature effects and the company Double Negative created additional digital effects.

Interstellar premiered on October 26, 2014, in Los Angeles. 
In the United States, it was first released on film stock, expanding to venues using digital projectors. 
The film had a worldwide gross over $677 million (and $773 million with subsequent re-releases), making it the tenth-highest grossing film of 2014. 
It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight. 
It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics. Since its premiere, Interstellar gained a cult following,[5] and now is regarded by many sci-fi experts as one of the best science-fiction films of all time.
Interstellar was nominated for five awards at the 87th Academy Awards, winning Best Visual Effects, and received numerous other accolades.
'''

# Split into sentences on periods and trim surrounding whitespace/newlines.
stripped_fragments = (fragment.strip() for fragment in text.split('.'))

# Drop empty fragments (such as the one after the final period) so that no
# blank sentence is embedded and inserted later on.
texts = [fragment for fragment in stripped_fragments if fragment]

In [9]:
from tqdm import tqdm

# Encode all sentences in one batched call and insert them with a single
# request, instead of one encode + one insert round trip per sentence.
# Only `text` and `dense` are supplied here, as in a per-row insert.
if texts:  # skip the model and the insert entirely when there is nothing to store
    dense_vectors = model.encode(texts)
    rows = [
        {'text': sentence, 'dense': vector}
        for sentence, vector in zip(texts, dense_vectors)
    ]
    client.insert('semantic_vs_keyword_search', rows)

In [10]:
# Natural-language question used as the query for both searches in this
# notebook: it is passed as raw text to the search on the `sparse` field and
# encoded with `model` for the search on the `dense` field.
query = "How precise was the science?"

In [11]:
# Extra parameters forwarded to the search on the sparse field.
search_params = {'params': {'drop_ratio_search': 0.2}}

# How many hits to request for the query.
num_retreive = 3

# Keyword search: the raw query text is searched against the `sparse` field.
retreived_chunks = client.search(
    collection_name='semantic_vs_keyword_search',
    anns_field='sparse',
    data=[query],
    limit=num_retreive,
    search_params=search_params,
    output_fields=['text'],  # Fields to return in search results; sparse field cannot be output
)

# One group of hits per query; print the score and text of every hit.
separator = "-----------------------------------------"
for hit_group in retreived_chunks:
    for match in hit_group:
        record = match.entity
        print(separator)
        print(f"BM-25 score is {record.get('distance')}")
        print(f"Retreived text: {record.get('text')}")

-----------------------------------------
BM-25 score is 2.4856224060058594
Retreived text: Caltech theoretical physicist and 2017 Nobel laureate in Physics[4] Kip Thorne was an executive producer, acted as a scientific consultant, and wrote a tie-in book, The Science of Interstellar
-----------------------------------------
BM-25 score is 1.6572182178497314
Retreived text: Interstellar is a 2014 epic science fiction film co-written, directed, and produced by Christopher Nolan
-----------------------------------------
BM-25 score is 1.6211800575256348
Retreived text: In the United States, it was first released on film stock, expanding to venues using digital projectors


In [12]:
# Embed the query with the same model that encoded the stored sentences.
query_dense_vector = model.encode(query)

# Semantic search: the query embedding is searched against the `dense` field.
res = client.search(
    collection_name="semantic_vs_keyword_search",
    data=[query_dense_vector],
    anns_field="dense",
    output_fields=["text"],
    limit=3,
)

# One group of hits per query; print the score and text of every hit.
separator = "-----------------------------------------"
for hit_group in res:
    for match in hit_group:
        record = match.entity
        print(separator)
        print(f"IP distance is {record.get('distance')}")
        print(f"Retreived text: {record.get('text')}")

-----------------------------------------
IP distance is 0.3729895353317261
Retreived text: It has also received praise from many astronomers for its scientific accuracy and portrayal of theoretical astrophysics
-----------------------------------------
IP distance is 0.2487272471189499
Retreived text: It received acclaim for its performances, direction, screenplay, musical score, visual effects, ambition, themes, and emotional weight
-----------------------------------------
IP distance is 0.20131808519363403
Retreived text: Cinematographer Hoyte van Hoytema shot it on 35 mm movie film in the Panavision anamorphic format and IMAX 70 mm
