In [1]:
from decouple import config


# Qdrant connection settings are looked up by name through decouple's `config`,
# so the URL and API key are never written into the notebook itself.
QDRANT_URL = config('QDRANT_URL')
QDRANT_API_KEY = config('QDRANT_API_KEY')


In [None]:
# https://qdrant.tech/documentation/quickstart/

In [2]:
from qdrant_client import QdrantClient


# Client built from the QDRANT_URL / QDRANT_API_KEY settings.
# NOTE: this variable has the same name as the `qdrant_client` package it was
# imported from. Later `from qdrant_client.models import ...` statements still
# work, but a plain `qdrant_client.<attr>` now refers to this instance.
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY,
)

# Print the collections currently known to the server.
print(qdrant_client.get_collections())

collections=[]


In [4]:
from fastembed import TextEmbedding


# Two sample sentences; they are embedded and upserted by the cells below.
documents = [
    'FastEmbed is lighter than Transformers & Sentence-Transformers.',
    'FastEmbed is supported by and maintained by Qdrant.',
]

In [5]:
# Dense text-embedding model used to embed `documents` in the next cell.
embedding_model = TextEmbedding(model_name='BAAI/bge-small-en-v1.5')

Fetching 5 files: 100%|██████████| 5/5 [00:06<00:00,  1.32s/it]


In [None]:
# https://qdrant.tech/documentation/fastembed/fastembed-quickstart/

# The result of `embed` is collected into a list so the vectors can be indexed
# here and reused by later cells.
embeddings = list(embedding_model.embed(documents))
# Length of the first vector; the collection's vector size must match it.
len(embeddings[0])  

384

In [None]:
from qdrant_client.models import Distance, VectorParams


# Placeholder: replace the Ellipsis with the collection name (a string) before
# running this cell.
COLLECTION_NAME = ...

# Only create the collection when it is missing: the collection lives on the
# server, so an unconditional create fails on every run after the first and
# breaks "Restart Kernel -> Run All".
if not qdrant_client.collection_exists(collection_name=COLLECTION_NAME):
    qdrant_client.create_collection(
        collection_name=COLLECTION_NAME,
        # Derive the vector size from the embeddings computed above instead of
        # hard-coding it, so it stays correct if the embedding model changes.
        vectors_config=VectorParams(size=len(embeddings[0]), distance=Distance.COSINE),
    )

In [None]:
from qdrant_client.models import PointStruct
from uuid import uuid4

# Upsert one point per document: a random UUID as id, the dense vector as a
# plain list, and the source text stored in the payload.
operation_info = qdrant_client.upsert(
    collection_name=COLLECTION_NAME,
    wait=True,
    points=[
        PointStruct(
            # Pass the UUID in its string form: Qdrant documents point ids as
            # unsigned integers or UUID strings, and a raw `uuid.UUID` object
            # may be rejected by the client's id validation.
            id=str(uuid4()),
            vector=embedding.tolist(),
            payload={'text': document},
        )
        for document, embedding in zip(documents, embeddings)
    ],
)

print(operation_info)

In [None]:
from qdrant_client.models import Filter, FieldCondition, MatchValue


# Query the collection and keep only the returned points.
# `query=...` is a placeholder — presumably to be replaced with the query
# vector (or other supported query object) before running.
search_result = qdrant_client.query_points(
    collection_name=COLLECTION_NAME,
    query=...,
    # Optional payload filter, left disabled; its match value is a placeholder too.
    #query_filter=Filter(
    #    must=[FieldCondition(key='text', match=MatchValue(value=...))]  # metadata
    #),
    with_payload=False,  # stored payloads are not requested with the hits
    limit=3  # return at most three points
).points

print(search_result)

In [10]:
from fastembed import SparseTextEmbedding, SparseEmbedding

# https://qdrant.tech/documentation/fastembed/fastembed-splade/

In [None]:
# Display the sparse models that SparseTextEmbedding reports as supported.
SparseTextEmbedding.list_supported_models()

In [None]:
# Sparse embedding model used by the `sparse_embeddings` cell below.
model = SparseTextEmbedding(model_name='prithvida/Splade_PP_en_v1')

# NOTE: this rebinds `documents`, replacing the two-sentence list defined for
# the dense example. Re-running the dense upsert cell after this one would pair
# these texts with the old `embeddings`.
documents = [
    'Chandrayaan-3 is India\'s third lunar mission',
    'It aimed to land a rover on the Moon\'s surface - joining the US, China and Russia',
    'The mission is a follow-up to Chandrayaan-2, which had partial success',
    'Chandrayaan-3 will be launched by the Indian Space Research Organisation (ISRO)',
    'The estimated cost of the mission is around $35 million',
    'It will carry instruments to study the lunar surface and atmosphere',
    'Chandrayaan-3 landed on the Moon\'s surface on 23rd August 2023',
    'It consists of a lander named Vikram and a rover named Pragyan similar to Chandrayaan-2. Its propulsion module would act like an orbiter.',
    'The propulsion module carries the lander and rover configuration until the spacecraft is in a 100-kilometre (62 mi) lunar orbit',
    'The mission used GSLV Mk III rocket for its launch',
    'Chandrayaan-3 was launched from the Satish Dhawan Space Centre in Sriharikota',
    'Chandrayaan-3 was launched earlier in the year 2023',
]

In [None]:
# Collect the sparse embeddings of all documents into a list (batch_size=5).
sparse_embeddings = list(model.embed(documents, batch_size=5)) 

In [None]:
import json
from tokenizers import Tokenizer

# Tokenizer used below to decode sparse-vector indices back into token text.
# NOTE(review): loaded from 'Qdrant/SPLADE_PP_en_v1' while the embedding model
# is 'prithvida/Splade_PP_en_v1' — presumably the same vocabulary; confirm.
tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1')

In [None]:
def get_tokens_and_weights(sparse_embedding: SparseEmbedding, tokenizer: Tokenizer) -> dict:
    """Map each token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing parallel ``indices`` and ``values``
            sequences; every index is decoded on its own with ``tokenizer``.
        tokenizer: Object whose ``decode`` turns a one-element list of ids
            into the token text.

    Returns:
        A dict ``{token: weight}`` ordered by descending weight. Ids and
        weights are converted to built-in ``int``/``float`` so the result can
        be passed straight to ``json.dumps`` even when the embedding stores
        NumPy scalars. If several ids decode to the same text, the weight of
        the last one wins.
    """
    token_weights = {
        tokenizer.decode([int(token_id)]): float(weight)
        for token_id, weight in zip(sparse_embedding.indices, sparse_embedding.values)
    }

    # sorted() is stable, so tokens with equal weights keep their original order.
    return dict(sorted(token_weights.items(), key=lambda item: item[1], reverse=True))

In [None]:
# Print the token/weight breakdown of one document as indented JSON;
# `index` selects which sparse embedding is inspected.
index = 0
print(json.dumps(get_tokens_and_weights(sparse_embeddings[index], tokenizer), indent=4))

In [None]:
# https://qdrant.tech/documentation/concepts/hybrid-queries/
# https://huggingface.co/datasets/microsoft/ms_marco
# https://huggingface.co/datasets/BeIR/hotpotqa  (the dataset loaded by the cell below)

In [None]:
from datasets import load_dataset
from datasets import get_dataset_config_info
from pandas import DataFrame


def get_row_count(config_name: str):
    """Return the example count recorded for ``config_name`` of BeIR/hotpotqa.

    The split that is looked up carries the same name as the config.
    """
    splits = get_dataset_config_info('BeIR/hotpotqa', config_name=config_name).splits
    return splits[config_name].num_examples

def get_corpus_queries_qrels(corpus_count: int, queries_count: int) -> tuple[DataFrame, DataFrame, DataFrame]:
    """Load a mutually consistent slice of BeIR/hotpotqa as DataFrames.

    The first ``corpus_count`` corpus rows and the first ``queries_count``
    query rows are loaded. The 'train' relevance judgements are then restricted
    to pairs whose document and query are both inside those slices, and finally
    the corpus and queries are narrowed to the ids that still appear in a
    surviving judgement.

    Args:
        corpus_count: Number of leading corpus rows to consider.
        queries_count: Number of leading query rows to consider.

    Returns:
        ``(corpus_df, queries_df, qrels_df)`` after filtering.

    Raises:
        ValueError: If either count exceeds the number of rows available.
    """
    max_corpus_count = get_row_count('corpus')
    max_query_count = get_row_count('queries')

    if corpus_count > max_corpus_count or queries_count > max_query_count:
        # Say which limits were exceeded instead of raising an empty ValueError.
        raise ValueError(
            f'Requested corpus_count={corpus_count} and queries_count={queries_count}, '
            f'but only {max_corpus_count} corpus rows and {max_query_count} query rows are available.'
        )

    # load
    corpus = load_dataset('BeIR/hotpotqa', 'corpus', split=f'corpus[:{corpus_count}]')
    queries = load_dataset('BeIR/hotpotqa', 'queries', split=f'queries[:{queries_count}]')
    qrels_df = load_dataset('BeIR/hotpotqa-qrels')['train'].to_pandas()

    # filter
    corpus_ids = set(corpus['_id'])
    query_ids = set(queries['_id'])

    # The qrels corpus ids are compared as strings against the corpus `_id`
    # values; cast once and reuse the result for both the mask and the id set.
    qrels_corpus_ids = qrels_df['corpus-id'].astype(str)
    keep = qrels_corpus_ids.isin(corpus_ids) & qrels_df['query-id'].isin(query_ids)
    filtered_qrels_df = qrels_df[keep]

    unique_corpus_ids = set(qrels_corpus_ids[keep])
    unique_query_ids = set(filtered_qrels_df['query-id'])

    # Drop documents and queries that no surviving judgement refers to.
    filtered_corpus_df = corpus.filter(lambda row: row['_id'] in unique_corpus_ids).to_pandas()
    filtered_queries_df = queries.filter(lambda row: row['_id'] in unique_query_ids).to_pandas()

    return filtered_corpus_df, filtered_queries_df, filtered_qrels_df