In [1]:
from decouple import config
from qdrant_client import QdrantClient


# Qdrant connection settings are looked up by name through decouple's `config`
# instead of being written into the notebook.
QDRANT_URL = config('QDRANT_URL')
QDRANT_API_KEY = config('QDRANT_API_KEY')

In [None]:
from datasets import load_dataset
from datasets import get_dataset_config_info
from pandas import DataFrame


def get_row_count(config_name: str):
    """Return the example count recorded for one BeIR/hotpotqa config.

    The config info of ``BeIR/hotpotqa`` is requested for ``config_name`` and the
    split carrying that same name is looked up in it.

    Args:
        config_name: Name used both as the dataset config and as the split key
            (the callers in this notebook pass ``'corpus'`` and ``'queries'``).

    Returns:
        The ``num_examples`` value of the matching split.
    """
    split_table = get_dataset_config_info('BeIR/hotpotqa', config_name=config_name).splits
    matching_split = split_table[config_name]
    return matching_split.num_examples

def get_corpus_queries_qrels(corpus_count: int, queries_count: int) -> tuple[DataFrame, DataFrame, DataFrame]:
    """Load a mutually consistent slice of BeIR/hotpotqa as three DataFrames.

    The first ``corpus_count`` corpus rows and the first ``queries_count`` query
    rows are loaded. The relevance judgements (``train`` split of
    ``BeIR/hotpotqa-qrels``) are then restricted to pairs whose document *and*
    query both lie inside those slices, and documents/queries that no remaining
    judgement refers to are dropped.

    Args:
        corpus_count: Number of leading corpus rows to consider.
        queries_count: Number of leading query rows to consider.

    Returns:
        ``(corpus_df, queries_df, qrels_df)``: the filtered corpus, the filtered
        queries and the filtered relevance judgements.

    Raises:
        ValueError: If either count exceeds the number of rows reported for the
            corresponding config; the message names the offending argument.
    """
    max_corpus_count = get_row_count('corpus')
    max_query_count = get_row_count('queries')

    # Report each limit separately so the caller can see which argument is wrong.
    if corpus_count > max_corpus_count:
        raise ValueError(
            f'corpus_count={corpus_count} exceeds the {max_corpus_count} rows available in the corpus'
        )
    if queries_count > max_query_count:
        raise ValueError(
            f'queries_count={queries_count} exceeds the {max_query_count} rows available in the queries'
        )

    # load
    corpus = load_dataset('BeIR/hotpotqa', 'corpus', split=f'corpus[:{corpus_count}]')
    queries = load_dataset('BeIR/hotpotqa', 'queries', split=f'queries[:{queries_count}]')
    qrels = load_dataset('BeIR/hotpotqa-qrels')

    # filter
    query_ids_set = set(queries['_id'])
    corpus_ids_set = set(corpus['_id'])

    qrels_df = qrels['train'].to_pandas()
    # 'corpus-id' is compared in string form against the corpus '_id' values
    # (presumably the qrels store it as a non-string dtype — verify against the dataset).
    qrels_corpus_ids = qrels_df['corpus-id'].astype(str)
    keep = qrels_corpus_ids.isin(corpus_ids_set) & qrels_df['query-id'].isin(query_ids_set)
    filtered_qrels_df = qrels_df[keep]

    # Only documents and queries that still take part in a judgement are kept.
    unique_corpus_ids = set(qrels_corpus_ids[keep])
    unique_query_ids = set(filtered_qrels_df['query-id'])

    filtered_corpus = corpus.filter(lambda x: x['_id'] in unique_corpus_ids)
    filtered_queries = queries.filter(lambda x: x['_id'] in unique_query_ids)

    filtered_corpus_df = filtered_corpus.to_pandas()
    filtered_queries_df = filtered_queries.to_pandas()

    return filtered_corpus_df, filtered_queries_df, filtered_qrels_df

In [2]:
# Open a client against the Qdrant instance described by QDRANT_URL / QDRANT_API_KEY
# and print the collections it reports.
qdrant_client = QdrantClient(url=QDRANT_URL, api_key=QDRANT_API_KEY)

print(qdrant_client.get_collections())

collections=[]


In [7]:
from fastembed import (
    TextEmbedding, 
    SparseTextEmbedding, 
    LateInteractionTextEmbedding
)


# Two sample sentences; they are the input passed to every model's `embed` call
# further down in the notebook.
documents = [
    'FastEmbed is lighter than Transformers & Sentence-Transformers.',
    'FastEmbed is supported by and maintained by Qdrant.',
]

In [None]:
from qdrant_client.models import (
    Distance, 
    VectorParams, 
    SparseVectorParams, 
    MultiVectorConfig,
    MultiVectorComparator,
    Modifier
)

# NOTE(review): this is a relative import. Code executed as top-level notebook
# cells has no parent package, so Python is expected to reject it with
# "attempted relative import with no known parent package" — confirm how this
# cell is run, or switch to the absolute module path of the project.
from .models.config import (
    DenseModelConfig, 
    SparseModelConfig,
    RerankingModelConfig
)

# Ellipsis stand-in — presumably to be replaced with the real collection name before use.
COLLECTION_NAME = ...

# Dense retrieval model.
dense_model_name = 'BAAI/bge-small-en-v1.5'
dense_model = TextEmbedding(model_name=dense_model_name)

# Sparse (BM25) retrieval model.
sparse_model_name = 'Qdrant/bm25'
sparse_model = SparseTextEmbedding(
    model_name=sparse_model_name,
    # FastEmbed's BM25 names the term-frequency saturation parameter `k`.
    # An unrecognised keyword such as `k1` is absorbed by **kwargs and never
    # applied, which silently leaves the library default in place.
    k=1.5,
    b=0.75
)

# Late-interaction model used for reranking.
reranking_model_name = 'colbert-ir/colbertv2.0'
reranking_model = LateInteractionTextEmbedding(reranking_model_name)

dense_model_config = DenseModelConfig(
    name=dense_model_name,
    vector_params=VectorParams(
        size=384,  # must equal the output dimension of the dense model
        distance=Distance.COSINE,
    )
)

sparse_model_config = SparseModelConfig(
    name=sparse_model_name,
    sparse_vector_params=SparseVectorParams(modifier=Modifier.IDF)
)

reranking_model_config = RerankingModelConfig(
    name=reranking_model_name,
    vector_params=VectorParams(
        # colbert-ir/colbertv2.0 emits 128-dimensional token vectors; a
        # different size here makes Qdrant reject the vectors on upload.
        size=128,
        distance=Distance.COSINE,
        multivector_config=MultiVectorConfig(
            comparator=MultiVectorComparator.MAX_SIM,
        )
    )
)

In [None]:
# Embed the sample documents with each of the three models. Every result is
# wrapped in list() so that individual embeddings can be indexed afterwards
# (e.g. sparse_embeddings[index]).
dense_embeddings = list(dense_model.embed(documents))
sparse_embeddings = list(sparse_model.embed(documents))
reranking_embeddings = list(reranking_model.embed(documents))

In [None]:
import json
from fastembed.sparse import SparseEmbedding
from tokenizers import Tokenizer


def get_tokens_and_weights(sparse_embedding: SparseEmbedding, tokenizer: Tokenizer) -> dict:
    """Map each decoded token of a sparse embedding to its weight, heaviest first.

    Args:
        sparse_embedding: Object exposing parallel ``indices`` and ``values``
            sequences.
        tokenizer: Tokenizer whose ``decode`` turns a one-element id list into
            the token text.

    Returns:
        A dict of ``token -> weight`` ordered by descending weight. Weights are
        plain Python floats, so the result can be passed to ``json.dumps`` even
        when ``values`` holds NumPy scalars such as ``float32``. If several
        indices decode to the same text, the weight of the last one is kept.
    """
    token_weights = {}

    for token_id, weight in zip(sparse_embedding.indices, sparse_embedding.values):
        # Cast to built-in int/float: NumPy scalars are not reliably accepted
        # by JSON serialisation.
        token_weights[tokenizer.decode([int(token_id)])] = float(weight)

    # Sort the dictionary by weights
    return dict(sorted(token_weights.items(), key=lambda item: item[1], reverse=True))


# NOTE(review): `sparse_embeddings` were produced by the 'Qdrant/bm25' model, yet the
# tokenizer loaded here belongs to 'Qdrant/SPLADE_PP_en_v1'. Decoding only yields
# meaningful tokens if the embedding indices are ids from this tokenizer's
# vocabulary, which is presumably not the case for BM25 — TODO confirm, or embed
# with the SPLADE model before running this cell.
tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1')
index = 0  # which sparse embedding to inspect
print(json.dumps(get_tokens_and_weights(sparse_embeddings[index], tokenizer), indent=4))