In [1]:
from qdrant_client import QdrantClient, models

class QdrantVectorStore:
    def __init__(self, host: str):
        self.client = QdrantClient(url=host)

    def create_collection(self, collection_name: str, embedding_dimensionality: int):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(collection_name=collection_name,
                                        vectors_config=models.VectorParams(
                                            size=embedding_dimensionality,
                                            distance=models.Distance.COSINE
                                        )
                                        )
        else:
            print(f"Collection {collection_name} already exists.")

    def create_collection_sparse(self, collection_name: str):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                sparse_vectors_config={
                    "bm25": models.SparseVectorParams(
                        modifier=models.Modifier.IDF
                    )
                }
            )
        else:
            print(f"Collection {collection_name} already exists.")
            
    def create_collection_hybrid(self, collection_name: str, embedding_dimensionality: int):
        if not self.client.collection_exists(collection_name=collection_name):
            self.client.create_collection(
                collection_name=collection_name,
                vectors_config={
                    "jina-small": models.VectorParams(
                        size=embedding_dimensionality,
                        distance=models.Distance.COSINE
                    )
                },
                sparse_vectors_config={
                    "bm25": models.SparseVectorParams(
                        modifier=models.Modifier.IDF
                    )
                }
            )
        else:
            print(f"Collection {collection_name} already exists.")

In [15]:
import uuid
from qdrant_client import models

def upsert_sparse_dense_documents(client, collection_name: str, model_handle_dense: str, model_handle_sparse: str, documents: list):
    client.upsert(
        collection_name=collection_name,
        points=[
            models.PointStruct(
                id=uuid.uuid4().hex,
                vector={
                    "jina-small": models.Document(
                        text=doc["text"],
                        model=model_handle_dense
                        ),
                    "bm25": models.Document(
                        text=doc["text"],
                        model=model_handle_sparse
                        )
                    },
                payload={
                    "text": doc["text"],
                    "section": doc["section"],
                    "course": doc['course'],
                    "question": doc["question"],
                    "id": doc["document_id"]
                }
            )
        for doc in documents
        ]
    )

In [3]:
def search_multi_stage_sparse_and_dense(query: str, client, collection_name, limit: int) -> list[models.ScoredPoint]:
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                using='jina-small',
                limit=(limit*5)
            )
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25"
        ),
        limit=limit,
        using="bm25",
        with_payload=True
    )
    return results.points

def search_hybrid(query: str, client, collection_name: str, limit: int=1) -> list[models.ScoredPoint]:
    
    results = client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en"
                ),
                using='jina-small',
                limit=(limit*5)
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25"
                ),
                using="bm25",
                limit=(limit*5)
            )
        ],
        query=models.FusionQuery(
            fusion=models.Fusion.RRF
        ),
        with_payload=True,
    )
    return results.points

In [5]:
import json
with open('./documents-with-ids-updated.json', 'rt') as f_in:
    documents = json.load(f_in)

In [8]:
host = "http://localhost:6333"
collection_name = "nakul-zoomcamp-faq-collection"

In [10]:
qv = QdrantVectorStore(host=host)
qv.create_collection_hybrid(collection_name=collection_name, embedding_dimensionality=512)

In [16]:
# Adding documents to the vector store
upsert_sparse_dense_documents(client=qv.client,
                              collection_name=collection_name,
                              model_handle_dense="jinaai/jina-embeddings-v2-small-en",
                              model_handle_sparse="Qdrant/bm25",
                              documents=documents
                             )

In [21]:
data = search_hybrid("when does the course start", 
              client=qv.client, 
              collection_name=collection_name,
              limit=5)
for point in data:
    print(point.payload)

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.", 'section': 'General course-related questions', 'course': 'data-engineering-zoomcamp', 'question': 'Course - When will the course start?', 'id': '12ba9d1bc6ca27c5'}
{'text': 'The course videos are pre-recorded, you can start watching the course right now.\nWe will also occasionally have office hours - live sessions where we will answer your questions. The office hours sessions are recorded too.\nYou can see the office hours as well as the pre-recorded course videos in the course playlist on YouTube.', 'section': 'Gener

In [22]:
import pandas as pd

In [23]:
df_ground_truth = pd.read_csv('ground-truth-data-practice.csv')
ground_truth = df_ground_truth.to_dict(orient='records')

In [26]:
df_ground_truth.head()

Unnamed: 0,question,course,document
0,Could you let me know the specific date and ti...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
1,What is the starting date for the first live O...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
2,Is there a way to add the course schedule to m...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
3,When exactly should I register for the course ...,data-engineering-zoomcamp,12ba9d1bc6ca27c5
4,How do I stay updated with announcements regar...,data-engineering-zoomcamp,12ba9d1bc6ca27c5


In [25]:
from tqdm.auto import tqdm

In [27]:
relevance_total = []
for q in tqdm(ground_truth):
    doc_id = q['document']
    results = search_hybrid(query=q['question'], 
              client=qv.client, 
              collection_name=collection_name,
              limit=5)
    relevance = [point.payload['id'] == doc_id for point in results]
    relevance_total.append(relevance)

  0%|          | 0/4119 [00:00<?, ?it/s]

In [28]:
relevance_total

[[False, False, True, False, False, False, False, False, False, False],
 [False, True, False, False, False, False, False, False, False, False],
 [True, False, False, False, False, False, False, False, False, False],
 [False, True, False, False, False, False, False, False, False, False],
 [False, False, True, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, False, False, False, False],
 [False, False, False, False, True, False, False, False, False, False],
 [False, False, False, False, False, False, False, False, False, False],
 [False, False, False, False, False, False, False, False, False, False],
 [True, False, False, False, False, False, False, False, False, False],
 [False, False, True, False, False, False, False, False, False, False],
 [False, True, False, False, False, False, False, False, Fa

In [32]:
def hit_rate(relevance_total):
    counter = 0
    for ls in relevance_total:
        if sum(ls) > 0:
            counter += 1
    return counter/len(relevance_total)

In [33]:
hit_rate(relevance_total=relevance_total)

0.883709638261714

In [34]:
def mrr(relevance_total):
    score = 0.0
    for ls in relevance_total:
        for rank in range(len(ls)):
            if ls[rank]:
                score = score + (1/(rank+1))
    return score/len(relevance_total)

In [35]:
mrr(relevance_total)

0.7220038189266164

In [38]:
def evaluate(ground_truth, search_function, collection_name, client):
    relevance_total = []
    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(query=q['question'], 
                  client=qv.client, 
                  collection_name=collection_name,
                  limit=5)
        relevance = [point.payload['id'] == doc_id for point in results]
        relevance_total.append(relevance)

    return {'hit_rate': hit_rate(relevance_total=relevance_total),
            'mrr': mrr(relevance_total=relevance_total)}

In [39]:
# hybrid search
evaluate(
    ground_truth=ground_truth,
    search_function=search_hybrid,
    collection_name=collection_name,
    client=qv.client)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.8824957513959699, 'mrr': 0.7198882838722612}

In [40]:
# sparse and dense search
evaluate(
    ground_truth=ground_truth,
    search_function=search_multi_stage_sparse_and_dense,
    collection_name=collection_name,
    client=qv.client)

  0%|          | 0/4119 [00:00<?, ?it/s]

{'hit_rate': 0.7960670065549891, 'mrr': 0.6563445820182896}