In [1]:
from decouple import config


# Connection settings are looked up by name through python-decouple's `config`
# rather than hardcoded, so the URL and API key never appear in the notebook.
# NOTE(review): presumably resolved from the environment or a .env/settings.ini file — confirm.
QDRANT_URL = config('QDRANT_URL')
QDRANT_API_KEY = config('QDRANT_API_KEY')


In [None]:
# https://qdrant.tech/documentation/quickstart/

# NOTE best: https://qdrant.tech/documentation/search-precision/reranking-hybrid-search/

In [2]:
from qdrant_client import QdrantClient


# Module-level client built from QDRANT_URL / QDRANT_API_KEY; later cells reuse this name.
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY,
)

# Listing the collections is a quick check that the URL and API key are accepted.
print(qdrant_client.get_collections())

collections=[]


In [3]:
from fastembed import TextEmbedding


# Two sample sentences fed to the dense embedding model in the following cells.
# NOTE: `documents` is reassigned by later cells (the BM25 and SPLADE examples),
# so re-running cells out of order changes what gets embedded.
documents = [
    'FastEmbed is lighter than Transformers & Sentence-Transformers.',
    'FastEmbed is supported by and maintained by Qdrant.',
]

In [5]:
# Dense FastEmbed model; the recorded output shows model files being fetched on this run.
embedding_model = TextEmbedding(model_name='BAAI/bge-small-en-v1.5')

Fetching 5 files: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]


In [None]:
# https://qdrant.tech/documentation/fastembed/fastembed-quickstart/

# Wrap `embed` in list() so all dense vectors for `documents` are materialised at once.
# NOTE: `embeddings` is overwritten by the BM25 cell further down.
embeddings = list(embedding_model.embed(documents))
# Length of one dense vector (recorded output: 384) — the same value the
# collection-creation cell passes as `size=` for the dense vector.
len(embeddings[0])  

384

In [None]:
from qdrant_client.models import (
    Distance, 
    VectorParams, 
    SparseVectorParams, 
    SparseIndexParams, 
    MultiVectorConfig,
    MultiVectorComparator,
    Modifier
)

# https://qdrant.tech/articles/sparse-vectors/

# NOTE(review): this cell is a sketch, not runnable as-is — COLLECTION_NAME and the
# three model names below are still `...` (Ellipsis) placeholders.
COLLECTION_NAME = ...

# Variant 1: a single unnamed dense vector (384 dims, cosine) plus one sparse
# vector named 'text' whose index is kept on disk.
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=384, distance=Distance.COSINE),    # TODO HNSW config
    sparse_vectors_config={
        'text': SparseVectorParams(
            index=SparseIndexParams(
                on_disk=True,
            )
        )
    },
)

# Placeholders for the named vectors used by variant 2.
# NOTE(review): while both are `...`, `dense_model` and `reranking_model` are the same
# dict key, so the second entry of `vectors_config` below replaces the first.
# `sparse_model` is assigned but not used in this cell (the sparse key is the literal "bm25").
dense_model = ...
sparse_model = ...
reranking_model = ...

# Variant 2: named dense vector + named multivector (MAX_SIM comparator) for reranking,
# plus a sparse vector "bm25" with the IDF modifier.
# NOTE(review): this reuses the same COLLECTION_NAME as variant 1; presumably the two
# calls are alternatives and only one should be run — confirm.
qdrant_client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config={
        dense_model: VectorParams(
            size=384,
            distance=Distance.COSINE,
        ),
        reranking_model: VectorParams(
            size=100,
            distance=Distance.COSINE,
            multivector_config=MultiVectorConfig(
                comparator=MultiVectorComparator.MAX_SIM,
            )
        ),
    },
    sparse_vectors_config={
        "bm25": SparseVectorParams(modifier=Modifier.IDF)
    }
)

In [5]:
from pydantic import BaseModel


class DenseModelConfig(BaseModel):
    """Name and vector parameters of a dense named vector.

    HybridSearch uses `name` as the key in `vectors_config` and in each point's
    vector dict, and passes `vector_params` through to `create_collection`.
    """
    name: str
    vector_params: VectorParams
    
class RerankingModelConfig(BaseModel):
    """Name and vector parameters of a reranking vector.

    Has the same fields as DenseModelConfig.
    NOTE(review): not referenced by HybridSearch in the visible code — confirm it is still needed.
    """
    name: str
    vector_params: VectorParams
    
class SparseModelConfig(BaseModel):
    """Name and sparse-vector parameters of a sparse named vector.

    HybridSearch uses `name` as the key in `sparse_vectors_config` and in each
    point's vector dict, and passes `sparse_vector_params` to `create_collection`.
    """
    name: str
    sparse_vector_params: SparseVectorParams

In [3]:
from qdrant_client.models import PointStruct, SearchParams, ScoredPoint
from qdrant_client.models import Vector, SparseVector


class HybridSearchMetadata(BaseModel):
    """Per-document metadata supplied to HybridSearch.upsert.

    `id` becomes the Qdrant point id and `text` is stored in the point payload
    under the 'text' key.
    """
    id: str
    text: str


class HybridSearch(BaseModel):
    """Wrapper around a Qdrant collection holding one dense and one sparse named vector per point.

    Attributes:
        qdrant_client: Client used for every request issued by this wrapper.
        dense_model_config: Name and ``VectorParams`` of the dense named vector.
        sparse_model_config: Name and ``SparseVectorParams`` of the sparse named vector.
    """

    # QdrantClient is not a pydantic-compatible type; pydantic v2 needs this flag to
    # accept it as a field (on pydantic v1 use an inner `class Config` instead).
    model_config = {'arbitrary_types_allowed': True}

    qdrant_client: QdrantClient
    dense_model_config: DenseModelConfig
    sparse_model_config: SparseModelConfig

    def create_collection(self, collection_name: str):
        """Create `collection_name` with the configured dense and sparse named vectors."""
        self.qdrant_client.create_collection(
            collection_name=collection_name,
            vectors_config={
                self.dense_model_config.name: self.dense_model_config.vector_params
            },
            sparse_vectors_config={
                self.sparse_model_config.name: self.sparse_model_config.sparse_vector_params
            }
        )

    def delete_collection(self, collection_name: str):
        """Delete `collection_name` from the Qdrant instance."""
        self.qdrant_client.delete_collection(collection_name)

    def upsert(
        self,
        collection_name: str,
        dense_embeddings: list[Vector],
        sparse_embeddings: list[SparseVector],
        metadatas: list[HybridSearchMetadata]
    ) -> str:
        """Insert or update one point per (dense, sparse, metadata) triple.

        Args:
            collection_name: Target collection.
            dense_embeddings: Dense vectors, one per document.
            sparse_embeddings: Sparse vectors, aligned with `dense_embeddings`.
            metadatas: Id and text of each document, aligned with the embeddings.

        Returns:
            The status of the upsert operation, converted to ``str``.

        Raises:
            ValueError: If the three input lists differ in length.
        """
        # zip() would silently drop the unmatched tail; fail loudly instead so a
        # misaligned batch never ends up partially written.
        if not (len(dense_embeddings) == len(sparse_embeddings) == len(metadatas)):
            raise ValueError(
                'dense_embeddings, sparse_embeddings and metadatas must have the same length, '
                f'got {len(dense_embeddings)}, {len(sparse_embeddings)} and {len(metadatas)}'
            )

        # NOTE(review): Qdrant point ids are expected to be unsigned integers or UUID
        # strings — confirm that `metadata.id` values conform.
        points = [
            PointStruct(
                id=metadata.id,
                vector={
                    self.dense_model_config.name: dense_embedding,
                    self.sparse_model_config.name: sparse_embedding
                },
                payload={
                    'text': metadata.text
                }
            )
            for dense_embedding, sparse_embedding, metadata
            in zip(dense_embeddings, sparse_embeddings, metadatas)
        ]

        result = self.qdrant_client.upsert(
            collection_name=collection_name,
            wait=True,  # block until the write is applied
            points=points,
        )

        return str(result.status)

    def search(
        self,
        collection_name: str,
        dense_embedding: Vector,
        sparse_embedding: SparseVector,
        limit: int,
        hnsw_ef: int = 128
    ) -> list[ScoredPoint]:
        """Run a hybrid (dense + sparse) search and return the fused top `limit` points.

        Each named vector is queried separately as a prefetch and the two candidate
        lists are merged with Reciprocal Rank Fusion.

        Args:
            collection_name: Collection to search.
            dense_embedding: Query vector for the dense named vector.
            sparse_embedding: Query vector for the sparse named vector.
            limit: Number of candidates fetched per vector and number of results returned.
            hnsw_ef: HNSW `ef` search parameter applied to the dense prefetch.

        Returns:
            Scored points with their payload.
        """
        # Imported locally so the method does not depend on names outside this class cell.
        from qdrant_client.models import Fusion, FusionQuery, Prefetch

        # Use the instance's client (not a notebook-level global) and the Query API:
        # `QdrantClient.search` takes a single query vector, not a dict of named vectors.
        response = self.qdrant_client.query_points(
            collection_name=collection_name,
            prefetch=[
                Prefetch(
                    query=dense_embedding,
                    using=self.dense_model_config.name,
                    params=SearchParams(hnsw_ef=hnsw_ef),
                    limit=limit,
                ),
                Prefetch(
                    query=sparse_embedding,
                    using=self.sparse_model_config.name,
                    limit=limit,
                ),
            ],
            query=FusionQuery(fusion=Fusion.RRF),
            limit=limit,
            with_payload=True,
        )

        return response.points

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue

# NOTE(review): the earlier note here read "finds items based on ids and filters,
# doesn't perform vector search". What `query_points` does depends on what is passed
# as `query` (still a `...` placeholder below) — confirm against the Qdrant Query API docs.
# The commented-out `query_filter` shows how a payload condition on 'text' would be added.
search_result = qdrant_client.query_points(
    collection_name=COLLECTION_NAME,
    query=...,
    #query_filter=Filter(
    #    must=[FieldCondition(key='text', match=MatchValue(value=...))]  # metadata
    #),
    with_payload=False,
    limit=3
).points

print(search_result)

In [None]:
from qdrant_client.models import SearchParams
from qdrant_client.http.models import SparseSearchParams    # TODO


# Hybrid (dense + sparse) search through the Query API.
# `QdrantClient.search` takes a single query vector and has no `query` / `params`
# keyword arguments, so each named vector is queried as a prefetch and the two
# candidate lists are fused with Reciprocal Rank Fusion.
from qdrant_client.models import Fusion, FusionQuery, Prefetch, SparseVector

result = qdrant_client.query_points(
    collection_name=COLLECTION_NAME,
    prefetch=[
        Prefetch(
            query=[0.1, 0.2, 0.3],  # Dense vector
            using="dense_vector",
            params=SearchParams(
                hnsw_ef=128,  # Controls recall for HNSW search
            ),
            limit=10,
        ),
        Prefetch(
            query=SparseVector(indices=[5, 8], values=[1.0, 0.8]),  # Sparse vector
            using="text",  # must match the key in sparse_vectors_config
            limit=10,
        ),
    ],
    query=FusionQuery(fusion=Fusion.RRF),
    limit=10,
    with_payload=True,  # Return payload (e.g., text, metadata)
).points

print(result)


In [18]:
from fastembed import SparseTextEmbedding

# Initialize the BM25 model with custom k1 and b parameters
model = SparseTextEmbedding(
    model_name="Qdrant/bm25",
    k1=1.5,  # Set your desired k1 value
    b=0.75   # Set your desired b value
)

# Example documents
# NOTE: this reassigns `documents`, replacing the FastEmbed sample sentences defined earlier.
documents = [
    "You should stay, study and sprint.",
    "History can only prepare us to be surprised yet again.",
]

# Generate embeddings
# NOTE: this overwrites `embeddings`, which previously held the dense vectors;
# from here on each item exposes `.values` / `.indices` (see the next cell).
embeddings = list(model.embed(documents))


In [8]:
# Show the non-zero weights of each sparse embedding next to their indices.
for sparse_embedding in embeddings:
    weights, token_indices = sparse_embedding.values, sparse_embedding.indices
    print(weights, token_indices)

[1.67868852 1.67868852 1.67868852] [1881538586  150760872 1932363795]
[1.66973021 1.66973021 1.66973021 1.66973021 1.66973021] [ 733618285 1849833631 1008800696 2090661150 1117393019]


In [4]:
from fastembed import SparseTextEmbedding, SparseEmbedding

# https://qdrant.tech/documentation/fastembed/fastembed-splade/

In [5]:
# List the sparse models FastEmbed can load (name, size, licence, Hugging Face source).
SparseTextEmbedding.list_supported_models()

[{'model': 'prithivida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'prithvida/Splade_PP_en_v1',
  'vocab_size': 30522,
  'description': 'Independent Implementation of SPLADE++ Model for English.',
  'license': 'apache-2.0',
  'size_in_GB': 0.532,
  'sources': {'hf': 'Qdrant/SPLADE_PP_en_v1'},
  'model_file': 'model.onnx'},
 {'model': 'Qdrant/bm42-all-minilm-l6-v2-attentions',
  'vocab_size': 30522,
  'description': 'Light sparse embedding model, which assigns an importance score to each token in the text',
  'license': 'apache-2.0',
  'size_in_GB': 0.09,
  'sources': {'hf': 'Qdrant/all_miniLM_L6_v2_with_attentions'},
  'model_file': 'model.onnx',
  'additional_files': ['stopwords.txt'],
  'requires_idf': True},
 {'model': 'Qdrant/bm25',
  'description': 'BM25 as sparse embedd

In [None]:
# SPLADE++ sparse model. Use the 'prithivida/...' spelling: the supported-models list
# contains both 'prithivida/Splade_PP_en_v1' and 'prithvida/Splade_PP_en_v1' with the
# same source, and the latter is the misspelled variant of the author name.
# NOTE: this reassigns `model` (previously the BM25 model).
model = SparseTextEmbedding(model_name='prithivida/Splade_PP_en_v1')

# Sample corpus for the SPLADE walkthrough.
# NOTE: this reassigns `documents` once more.
documents = [
    'Chandrayaan-3 is India\'s third lunar mission',
    'It aimed to land a rover on the Moon\'s surface - joining the US, China and Russia',
    'The mission is a follow-up to Chandrayaan-2, which had partial success',
    'Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)',
    'The estimated cost of the mission is around $35 million',
    'It will carry instruments to study the lunar surface and atmosphere',
    'Chandrayaan-3 landed on the Moon\'s surface on 23rd August 2023',
    'It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.',
    'The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit',
    'The mission used GSLV Mk III rocket for its launch',
    'Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota',
    'Chandrayaan-3 was launched earlier in the year 2023',
]

In [None]:
# Embed the SPLADE sample corpus in batches of 5 and materialise all results in a list.
sparse_embeddings = list(model.embed(documents, batch_size=5)) 

In [None]:
import json
from tokenizers import Tokenizer

# Tokenizer from the 'Qdrant/SPLADE_PP_en_v1' repo — the Hugging Face source listed for the
# SPLADE model above — used to turn sparse indices back into readable tokens.
tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1')

In [None]:
def get_tokens_and_weights(sparse_embedding: SparseEmbedding, tokenizer: Tokenizer) -> dict:
    """Return a ``{token: weight}`` mapping for a sparse embedding, heaviest weight first.

    Every entry of ``sparse_embedding.indices`` is decoded on its own with
    ``tokenizer.decode`` and paired with the value at the same position in
    ``sparse_embedding.values``. If two indices decode to the same token, the
    later weight replaces the earlier one.
    """
    weight_by_token = {}

    for position, token_id in enumerate(sparse_embedding.indices):
        decoded = tokenizer.decode([token_id])
        weight_by_token[decoded] = sparse_embedding.values[position]

    # Order the entries by descending weight; dicts keep insertion order.
    ranked_pairs = sorted(weight_by_token.items(), key=lambda pair: pair[1], reverse=True)

    return dict(ranked_pairs)

In [None]:
# Print the token -> weight breakdown of one document as indented JSON;
# change `index` to inspect a different entry of `sparse_embeddings`.
index = 0
print(json.dumps(get_tokens_and_weights(sparse_embeddings[index], tokenizer), indent=4))

In [None]:
# https://qdrant.tech/documentation/concepts/hybrid-queries/
# https://huggingface.co/datasets/microsoft/ms_marco

In [None]:
from datasets import load_dataset
from datasets import get_dataset_config_info
from pandas import DataFrame


def get_row_count(config_name: str):
    """Return the number of examples in the 'BeIR/hotpotqa' config named `config_name`.

    The count is read from the dataset's config info, using the split that has
    the same name as the config.
    """
    config_info = get_dataset_config_info('BeIR/hotpotqa', config_name=config_name)
    split_info = config_info.splits[config_name]

    return split_info.num_examples

def get_corpus_queries_qrels(corpus_count: int, queries_count: int) -> tuple[DataFrame, DataFrame, DataFrame]:
    """Load a mutually consistent subset of the BeIR/hotpotqa corpus, queries and qrels.

    The first `corpus_count` corpus rows and the first `queries_count` queries are
    loaded. The 'train' qrels are reduced to rows whose corpus id and query id are
    both in that subset, and the corpus and queries are then reduced to the ids
    that still appear in those qrels.

    Args:
        corpus_count: Number of leading corpus rows to load.
        queries_count: Number of leading query rows to load.

    Returns:
        ``(corpus_df, queries_df, qrels_df)`` as pandas DataFrames, after filtering.

    Raises:
        ValueError: If a count is negative or exceeds the size of the dataset.
    """
    # A negative count would turn the split slice into "all but the last N rows".
    if corpus_count < 0 or queries_count < 0:
        raise ValueError(
            f'corpus_count and queries_count must be non-negative, '
            f'got {corpus_count} and {queries_count}'
        )

    max_corpus_count = get_row_count('corpus')
    max_query_count = get_row_count('queries')

    if corpus_count > max_corpus_count or queries_count > max_query_count:
        raise ValueError(
            f'Requested {corpus_count} corpus rows and {queries_count} queries, but '
            f'BeIR/hotpotqa only has {max_corpus_count} corpus rows and {max_query_count} queries'
        )

    # load
    corpus = load_dataset('BeIR/hotpotqa', 'corpus', split=f'corpus[:{corpus_count}]')
    queries = load_dataset('BeIR/hotpotqa', 'queries', split=f'queries[:{queries_count}]')
    qrels = load_dataset('BeIR/hotpotqa-qrels')

    # filter
    query_ids_set = set(queries['_id'])
    corpus_ids_set = set(corpus['_id'])

    # Only the 'train' split of the qrels is considered.
    qrels_df = qrels['train'].to_pandas()
    # 'corpus-id' is cast to str before matching against the corpus `_id` values;
    # presumably it is stored as a non-string type in the qrels — confirm.
    filtered_qrels_df = qrels_df[
        qrels_df['corpus-id'].astype(str).isin(corpus_ids_set) &
        qrels_df['query-id'].isin(query_ids_set)
    ]

    # Keep only documents and queries that take part in at least one remaining qrel.
    unique_corpus_ids = set(filtered_qrels_df['corpus-id'].astype(str))
    unique_query_ids = set(filtered_qrels_df['query-id'])

    filtered_corpus = corpus.filter(lambda x: x['_id'] in unique_corpus_ids)
    filtered_queries = queries.filter(lambda x: x['_id'] in unique_query_ids)

    filtered_corpus_df = filtered_corpus.to_pandas()
    filtered_queries_df = filtered_queries.to_pandas()

    return filtered_corpus_df, filtered_queries_df, filtered_qrels_df

In [None]:
from fastembed.sparse.bm25 import Bm25