## Demo

### Imports

In [1]:
%load_ext autoreload
%autoreload 2

from decouple import config
from fastembed import (
    TextEmbedding, 
    SparseTextEmbedding, 
    LateInteractionTextEmbedding
)
from openai import OpenAI
from qdrant_client import QdrantClient
from qdrant_client.models import (
    Distance,
    Fusion,
    VectorParams, 
    SparseVectorParams,
    SparseIndexParams,
    MultiVectorConfig,
    MultiVectorComparator,
    Modifier,
    HnswConfigDiff
)

from rag.models import (
    DenseModelConfig, 
    SparseModelConfig,
    RerankingModelConfig,
    Metadata
)
from rag.repositories import (
    DenseSearchRepository,
    SparseSearchRepository,
    HybridFusionSearchRepository, 
    HybridRerankingSearchRepository
)


QDRANT_URL = config('QDRANT_URL')
QDRANT_API_KEY = config('QDRANT_API_KEY')
NVIDIA_API_KEY = config('NVIDIA_API_KEY')
CUDA=config('CUDA', cast=bool)

### Client

In [2]:
qdrant_client = QdrantClient(
    url=QDRANT_URL, 
    api_key=QDRANT_API_KEY,
)

print(qdrant_client.get_collections())

collections=[CollectionDescription(name='sparse_collection2')]


### Models

In [3]:
dense_model_name = 'BAAI/bge-small-en-v1.5'
dense_model = TextEmbedding(model_name=dense_model_name)

sparse_model_name = 'Qdrant/bm25'
sparse_model = SparseTextEmbedding(
    model_name=sparse_model_name,
    k=1.5,
    b=0.75
)

reranking_model_name = 'colbert-ir/colbertv2.0'
reranking_model = LateInteractionTextEmbedding(reranking_model_name)

dense_model_config = DenseModelConfig(
    name=dense_model_name,
    vector_params=VectorParams(
        size=384,
        distance=Distance.COSINE,
        hnsw_config=HnswConfigDiff(
            m=16,
            ef_construct=128,
            on_disk=True
        ),
        on_disk=True
    )
)

sparse_model_config = SparseModelConfig(
    name=sparse_model_name,
    sparse_vector_params=SparseVectorParams(
        index=SparseIndexParams(
            on_disk=True
        ),
        modifier=Modifier.IDF
    )
)

reranking_model_config = RerankingModelConfig(
    name=reranking_model_name,
    vector_params=VectorParams(
        size=128,
        distance=Distance.COSINE,
        hnsw_config=HnswConfigDiff(
            m=0     # disable HNSW
        ),
        on_disk=True,
        multivector_config=MultiVectorConfig(
            comparator=MultiVectorComparator.MAX_SIM,
        )
    )
)

### Documents

In [4]:
documents = [
    'FastEmbed is lighter than Transformers & Sentence-Transformers.',
    'FastEmbed is supported by and maintained by Qdrant.',
]
metadatas = [
    Metadata(
        id=i + 1,
        text=document
    )
    for i, document in enumerate(documents)
]

dense_embeddings = list(dense_model.embed(documents))
sparse_embeddings = list(sparse_model.embed(documents))
reranking_embeddings = list(reranking_model.embed(documents))

### Indexing

In [5]:
DENSE_COLLECTION_NAME = 'dense_collection'
SPARSE_COLLECTION_NAME = 'sparse_collection'
FUSION_COLLECTION_NAME = 'fusion_collection'
RERANKING_COLLECTION_NAME = 'reranking_collection'

#### Dense

In [6]:
dense_search = DenseSearchRepository(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config
)

dense_search.create_collection(DENSE_COLLECTION_NAME)
dense_search.upload_points(
    collection_name=DENSE_COLLECTION_NAME,
    metadatas=metadatas,
    dense_embeddings=dense_embeddings
)

#### Sparse

In [7]:
sparse_search = SparseSearchRepository(
    qdrant_client=qdrant_client, 
    sparse_model_config=sparse_model_config
)

sparse_search.create_collection(SPARSE_COLLECTION_NAME)
sparse_search.upload_points(
    collection_name=SPARSE_COLLECTION_NAME,
    metadatas=metadatas,
    sparse_embeddings=sparse_embeddings
)

#### Hybrid - Fusion

In [8]:
fusion_search = HybridFusionSearchRepository(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config,
    sparse_model_config=sparse_model_config
)

fusion_search.create_collection(FUSION_COLLECTION_NAME)
fusion_search.upload_points(
    collection_name=FUSION_COLLECTION_NAME,
    metadatas=metadatas,
    dense_embeddings=dense_embeddings, 
    sparse_embeddings=sparse_embeddings
)

#### Hybrid - Reranking

In [9]:
reranking_search = HybridRerankingSearchRepository(
    qdrant_client=qdrant_client, 
    dense_model_config=dense_model_config,
    sparse_model_config=sparse_model_config,
    reranking_model_config=reranking_model_config
)

reranking_search.create_collection(RERANKING_COLLECTION_NAME)
reranking_search.upload_points(
    collection_name=RERANKING_COLLECTION_NAME,
    metadatas=metadatas,
    dense_embeddings=dense_embeddings, 
    sparse_embeddings=sparse_embeddings,
    reranking_embeddings=reranking_embeddings
)

### Queries

In [10]:
query_document = 'Who is lighter than Transformers?'

query_dense_embedding = next(iter(dense_model.embed(query_document)))
query_sparse_embedding = next(iter(sparse_model.embed(query_document)))
query_reranking_embedding = next(iter(reranking_model.embed(query_document)))

### Search

#### Dense

In [11]:
dense_scored_points = dense_search.search(
    collection_name=DENSE_COLLECTION_NAME,
    limit=1,
    dense_embedding=query_dense_embedding
)

print(*dense_scored_points, sep='\n')

id='90543e17-dd2d-4bcc-ac03-b4770e0a368a' version=0 score=0.7509401 payload={'id': 1, 'text': 'FastEmbed is lighter than Transformers & Sentence-Transformers.'} vector=None shard_key=None order_value=None


#### Sparse

In [12]:
sparse_scored_points = sparse_search.search(
    collection_name=SPARSE_COLLECTION_NAME,
    limit=1,
    sparse_embedding=query_sparse_embedding
)

print(*sparse_scored_points, sep='\n')

id='655efa35-a3be-4a4a-986a-b599bc5e4de4' version=0 score=4.1631217 payload={'id': 1, 'text': 'FastEmbed is lighter than Transformers & Sentence-Transformers.'} vector=None shard_key=None order_value=None


#### Hybrid - Fusion

In [13]:
fusion_scored_points = fusion_search.search(
    collection_name=FUSION_COLLECTION_NAME,
    limit=1,
    fusion_algorithm=Fusion.RRF,
    dense_embedding=query_dense_embedding,
    sparse_embedding=query_sparse_embedding
)

print(*fusion_scored_points, sep='\n')

id='4da639c2-2e0c-4133-bca1-b9ec7a561a8b' version=0 score=1.0 payload={'id': 1, 'text': 'FastEmbed is lighter than Transformers & Sentence-Transformers.'} vector=None shard_key=None order_value=None


#### Hybrid - Reranking

In [14]:
reranking_scored_points = reranking_search.search(
    collection_name=RERANKING_COLLECTION_NAME,
    limit=1,
    prefetch_limit=2,
    dense_embedding=query_dense_embedding,
    sparse_embedding=query_sparse_embedding,
    reranking_embedding=query_reranking_embedding
)

print(*reranking_scored_points, sep='\n')

id='bd55ad2b-b168-4a5b-8f29-56bbe3df29bc' version=0 score=5.8587246 payload={'id': 1, 'text': 'FastEmbed is lighter than Transformers & Sentence-Transformers.'} vector=None shard_key=None order_value=None


In [None]:
# NOTE: use with SPLADE

import json
from fastembed.sparse import SparseEmbedding
from tokenizers import Tokenizer


def get_tokens_to_weights(sparse_embedding: SparseEmbedding, tokenizer: Tokenizer) -> dict:
    token_weight_dict = {}
    
    for i in range(len(sparse_embedding.indices)):
        token = tokenizer.decode([sparse_embedding.indices[i]])
        weight = sparse_embedding.values[i]
        token_weight_dict[token] = weight

    # sort by weights
    token_weight_dict = dict(sorted(token_weight_dict.items(), key=lambda item: item[1], reverse=True))
    
    return token_weight_dict


index = 0
tokenizer = Tokenizer.from_pretrained('Qdrant/SPLADE_PP_en_v1')
tokens_to_weights = get_tokens_to_weights(sparse_embeddings[index], tokenizer)

print(json.dumps(tokens_to_weights, indent=4))

### LLM

In [21]:
client = OpenAI(
  base_url = 'https://integrate.api.nvidia.com/v1',
  api_key = NVIDIA_API_KEY
)

content = (
    'You are a helpful assistant that answers given question using ONLY PROVIDED CONTEXT.\n'
    'You are not allowed to use any previous knowledge.\n\n'

    'The output should be a well-formatted JSON object that conforms to the example below\n'
    '("answer" is either string or null):\n'
    '{"answer": "some answer"}\n\n'

    'If you don\'t know the answer, return:\n'
    '{"answer": null}\n\n'

    '<context_start>\n'
    f'{reranking_scored_points[0].payload['text']}\n'
    '<context_end>\n\n'

    '<question_start>\n'
    f'{query_document}\n'
    '<question_end>'
)

completion = client.chat.completions.create(
    model='meta/llama-3.1-405b-instruct',
    messages=[{
        'role': 'user', 
        'content': content
    }],
    temperature=0.2,
    max_tokens=1024,
    stream=False
)
print(completion)

ChatCompletion(id='chat-5f1ee1ebe77d4e9ca3fb01051219d96a', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='{"answer": "FastEmbed"}', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None), stop_reason=None)], created=1736019507, model='meta/llama-3.1-405b-instruct', object='chat.completion', service_tier=None, system_fingerprint=None, usage=CompletionUsage(completion_tokens=7, prompt_tokens=115, total_tokens=122, completion_tokens_details=None, prompt_tokens_details=None))
