## Hybrid search using Qdrant

In [1]:
import os

import torch
import tqdm
from dotenv import load_dotenv
from fastembed import TextEmbedding

# Load variables from a local `.env` file into the process environment; the
# Qdrant client cells below look up QDRANT_URL and QDRANT_API_KEY there.
load_dotenv()

True

In [2]:
# Build a FastEmbed text-embedding model and request the CUDA execution
# provider for it.
# NOTE(review): `embedding_model` is only inspected in the next two cells. The
# indexing and search code below passes model *names* to the Qdrant client
# (and uses a different dense model) — confirm whether this cell is still needed.
embedding_model = TextEmbedding(
    model_name="BAAI/bge-small-en-v1.5", providers=["CUDAExecutionProvider"]
)

In [3]:
# Show which execution providers the loaded model reports.
embedding_model.model.providers

['CUDAExecutionProvider']

In [4]:
# Inspect the underlying model object wrapped by `TextEmbedding`.
embedding_model.model

<fastembed.text.onnx_embedding.OnnxTextEmbedding at 0x1b7884b9a60>

In [5]:
# Import client library
from qdrant_client import QdrantClient, models

# Connection settings are read from the environment; an unset variable yields
# `None` here rather than raising.
client = QdrantClient(
    url=os.getenv("QDRANT_URL"),
    api_key=os.getenv("QDRANT_API_KEY"),
)

In [6]:
# Names of the two named vectors (dense and sparse) used for the collection
# config, the uploaded documents and the `using=` argument of the prefetches.
dense_vector_name = "dense"
sparse_vector_name = "sparse"
# Embedding model identifiers passed to the Qdrant client for each vector kind.
dense_model_name = "sentence-transformers/all-MiniLM-L6-v2"
sparse_model_name = "prithivida/Splade_PP_en_v1"

In [10]:
# Create the "startups" collection only when it does not exist yet, so that
# re-running this cell skips the creation step.
if not client.collection_exists("startups"):
    client.create_collection(
        collection_name="startups",
        vectors_config={
            dense_vector_name: models.VectorParams(
                # The dense vector size is looked up from the model name.
                size=client.get_embedding_size(dense_model_name),
                distance=models.Distance.COSINE,
            )
        },  # size and distance are model dependent
        # The sparse vector is declared with default parameters.
        sparse_vectors_config={sparse_vector_name: models.SparseVectorParams()},
    )

In [11]:
# Index the `city` payload field as a keyword; the search code further down
# filters on this field.
client.create_payload_index(
    collection_name="startups",
    field_name="city",
    field_schema="keyword",
)

UpdateResult(operation_id=3, status=<UpdateStatus.COMPLETED: 'completed'>)

In [None]:
import json

payload_path = "startups_demo.json"
# Download the dataset from
# https://storage.googleapis.com/generall-shared-data/startups_demo.json

# `documents[i]` holds the texts to embed for record i (one entry per named
# vector); `metadata[i]` is the full JSON record, later uploaded as its payload.
documents = []
metadata = []

# The file is read as JSON Lines (one object per line). Decode it as UTF-8
# explicitly: the descriptions contain non-ASCII characters, and the platform
# default encoding is not UTF-8 everywhere.
with open(payload_path, encoding="utf-8") as fd:
    for line in fd:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing empty line) instead of
            # failing inside json.loads.
            continue
        obj = json.loads(line)
        description = obj["description"]
        # The same text is embedded twice: once per model / named vector.
        dense_document = models.Document(text=description, model=dense_model_name)
        sparse_document = models.Document(text=description, model=sparse_model_name)
        documents.append(
            {
                dense_vector_name: dense_document,
                sparse_vector_name: sparse_document,
            }
        )
        metadata.append(obj)

In [14]:
# Peek at the first few prepared documents.
documents[:5]

[{'dense': Document(text='QR codes systems for COVID-19.\nSimple tools for bars, restaurants, offices, and other small proximity businesses.', model='sentence-transformers/all-MiniLM-L6-v2', options=None),
  'sparse': Document(text='QR codes systems for COVID-19.\nSimple tools for bars, restaurants, offices, and other small proximity businesses.', model='prithivida/Splade_PP_en_v1', options=None)},
 {'dense': Document(text='Point-of-care word of mouth\nPreferral is a mobile platform that channels physicians’ interest in networking with their peers to build referrals within a hospital system.\nHospitals are in a race to employ physicians, even though they lose billions each year ($40B in 2014) on employment. Why ...', model='sentence-transformers/all-MiniLM-L6-v2', options=None),
  'sparse': Document(text='Point-of-care word of mouth\nPreferral is a mobile platform that channels physicians’ interest in networking with their peers to build referrals within a hospital system.\nHospitals a

In [31]:
# Upload every prepared document together with its payload. `tqdm` wraps the
# vectors iterable only to display progress.
# NOTE(review): no `ids` are passed, so re-running this cell presumably inserts
# the same records again under new ids — confirm before re-executing.
client.upload_collection(
    collection_name="startups",
    vectors=tqdm.tqdm(documents),
    payload=metadata,
    parallel=1,  # One worker process; raise (e.g. to 4) to encode on more CPU cores.
    # A higher value will spawn a model per process, which might be memory expensive.
    # Make sure that your system does not use swap, and reduce the number
    # of processes if it does.
    # Otherwise, it might significantly slow down the process.
    # Using several processes requires wrapping the code into an
    # `if __name__ == '__main__'` block.
)

100%|██████████| 19/19 [00:01<00:00, 14.36it/s]


In [None]:
class HybridSearcher:
    """Hybrid (dense + sparse) search over a Qdrant collection.

    Each query is embedded with both ``DENSE_MODEL`` and ``SPARSE_MODEL``; the
    two candidate lists are merged with reciprocal rank fusion.

    The vector names are taken from the notebook-level ``dense_vector_name``
    and ``sparse_vector_name`` variables, so the cell defining them must have
    been run before ``search`` is called.
    """

    DENSE_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
    SPARSE_MODEL = "prithivida/Splade_PP_en_v1"

    def __init__(self, collection_name):
        """Remember the collection name and open a Qdrant client.

        Args:
            collection_name: Name of the collection to query.

        The connection settings are read from the ``QDRANT_URL`` and
        ``QDRANT_API_KEY`` environment variables.
        """
        self.collection_name = collection_name
        self.qdrant_client = QdrantClient(
            url=os.environ.get("QDRANT_URL"), api_key=os.environ.get("QDRANT_API_KEY")
        )

    def search(self, text: str, city: "str | None" = "Chicago", limit: int = 2):
        """Run a hybrid query and return the payloads of the best matches.

        Args:
            text: Free-text query; embedded with both the dense and sparse model.
            city: Only return points whose ``city`` payload field equals this
                value. Pass ``None`` to search without a city filter. Defaults
                to ``"Chicago"``, the value that used to be hard-coded.
            limit: Maximum number of results to return (default 2).

        Returns:
            A list with the payload of each returned point, in the order the
            points were returned by the query.
        """
        # Build the city filter only when a city was requested.
        city_filter = None
        if city is not None:
            city_filter = models.Filter(
                must=[
                    models.FieldCondition(
                        key="city", match=models.MatchValue(value=city)
                    )
                ]
            )

        # `query_points` returns a response object; its `points` attribute is
        # the list of scored points (with payloads) we are interested in.
        points = self.qdrant_client.query_points(
            collection_name=self.collection_name,
            query=models.FusionQuery(
                fusion=models.Fusion.RRF  # we are using reciprocal rank fusion here
            ),
            prefetch=[
                models.Prefetch(
                    query=models.Document(text=text, model=self.DENSE_MODEL),
                    using=dense_vector_name,
                ),
                models.Prefetch(
                    query=models.Document(text=text, model=self.SPARSE_MODEL),
                    using=sparse_vector_name,
                ),
            ],
            query_filter=city_filter,  # None means "no filter"
            limit=limit,
        ).points

        # Select and return metadata
        return [point.payload for point in points]

In [16]:
# Create a searcher bound to the "startups" collection populated above.
hybrid_searcher = HybridSearcher(collection_name="startups")

In [17]:
# Sample query; this call returns at most two results, filtered to the city of
# Chicago.
hybrid_searcher.search("startup in airplanes")

[{'name': 'NowBoarding ✈️',
  'images': 'https://static.above.flights/img/lowcost/envelope_blue.png',
  'alt': 'Lowcost Email cheap flights alerts',
  'description': 'Invite-only mailing list.\n\nWe search the best weekend and long-haul flight deals\nso you can book before everyone else.',
  'link': 'https://nowboarding.club/',
  'city': 'Chicago'},
 {'name': 'Rocketmiles',
  'images': 'https://d1qb2nb5cznatu.cloudfront.net/startups/i/158571-e53ddffe9fb3ed5e57080db7134117d0-thumb_jpg.jpg?buster=1361371304',
  'alt': 'Rocketmiles -  e-commerce online travel loyalty programs hotels',
  'description': "Fueling more vacations\nWe enable our customers to travel more, travel better and travel further. 20M+ consumers stock away miles & points to satisfy their wanderlust.\nFlying around or using credit cards are the only good ways to fill the stockpile today. We've built the third way. Customers ...",
  'link': 'http://www.Rocketmiles.com',
  'city': 'Chicago'}]

### Reference
https://qdrant.tech/articles/sparse-vectors/  
https://qdrant.tech/documentation/beginner-tutorials/hybrid-search-fastembed/