In [1]:
import json
import random
import uuid

import requests
from qdrant_client import QdrantClient, models

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Download the course FAQ dump: a list of courses, each holding a list of documents.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# requests has no default timeout; set one so a stalled connection cannot hang the kernel.
docs_response = requests.get(docs_url, timeout=30)
# Surface HTTP errors directly instead of as a confusing JSON decoding failure.
docs_response.raise_for_status()
docs_raw = docs_response.json()

# Flatten to one record per FAQ entry, tagging each with its course name.
# A fresh dict is built per entry so `docs_raw` stays untouched and the cell is idempotent.
documents = [
    {**doc, 'course': course['course']}
    for course in docs_raw
    for doc in course['documents']
]

In [3]:
# Dense embedding model and the vector size declared for its collection field.
embedding_model_handle = 'jinaai/jina-embeddings-v2-small-en'
embedding_dim = 512

# Sparse (BM25) model used for keyword-style retrieval.
sparse_model_handle = 'Qdrant/bm25'

# Collection names: one sparse-only, one holding both sparse and dense vectors.
sparse_collection_name = 'zoomcamp-sparse'
sparseAndDense_collection_name = 'zoomcamp-sparse-and-dense'

Let's download Qdrant’s Docker image, then run Qdrant locally, expose its APIs, and make sure stored data survives container restarts.

```bash
docker pull qdrant/qdrant

docker run -p 6333:6333 -p 6334:6334 \
   -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
   qdrant/qdrant
```

In [5]:
# Connect to the locally running Qdrant instance over HTTP on port 6333.
client = QdrantClient('http://localhost:6333')
# Listing the collections doubles as a connectivity check; the result is the cell output.
client.get_collections()

CollectionsResponse(collections=[CollectionDescription(name='zoomcamp-sparse'), CollectionDescription(name='zoomcamp-faq'), CollectionDescription(name='zoomcamp-rag')])

In [7]:
# Drop any previous copy first so the collection is rebuilt from scratch on every run.
client.delete_collection(collection_name=sparse_collection_name)

# Sparse-only collection: a single named sparse vector, 'bm25', configured with the IDF modifier.
bm25_sparse_config = {
    'bm25': models.SparseVectorParams(modifier=models.Modifier.IDF),
}
client.create_collection(
    collection_name=sparse_collection_name,
    sparse_vectors_config=bm25_sparse_config,
)

True

```python
id=uuid.uuid4().hex
```

* `uuid` is a Python module that generates **universally unique identifiers**.
* `uuid.uuid4()` generates a **random UUID** (version 4).
* `.hex` converts it to a **32-character hexadecimal string** (128 bits long; a version-4 UUID reserves 6 of those bits for the version and variant, leaving 122 random bits → there are 2¹²² possible UUIDs, so the probability of generating the same UUID twice is astronomically low, essentially zero for any practical application).

Each point you send to the Qdrant collection gets a **unique ID**, so Qdrant can track it individually.

In [8]:
# Build one point per FAQ entry: a random UUID as id, the entry text under the
# 'bm25' sparse vector name, and the whole entry as payload.
sparse_points = []
for faq_entry in documents:
    bm25_document = models.Document(text=faq_entry['text'], model=sparse_model_handle)
    sparse_points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={'bm25': bm25_document},
            payload=faq_entry,
        )
    )

# Kept as the last expression so the UpdateResult is shown as the cell output.
client.upsert(collection_name=sparse_collection_name, points=sparse_points)

Fetching 18 files: 100%|██████████| 18/18 [00:00<00:00, 28.97it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [9]:
def search(query, limit=1):
    """Query the sparse-only collection using the 'bm25' vector.

    Args:
        query: Text to search for.
        limit: Maximum number of points to return (default 1).

    Returns:
        The `points` list of the query response, payload included.
    """
    bm25_query = models.Document(text=query, model=sparse_model_handle)
    response = client.query_points(
        collection_name=sparse_collection_name,
        query=bm25_query,
        using='bm25',
        limit=limit,
        with_payload=True,
    )
    return response.points

In [10]:
# Keyword query that returned no hits in this run (the output is an empty list).
search('Qdrant')

[]

Scores returned by BM25 are not calculated with cosine similarity, but with the BM25 formula. 

They are not bounded to a specific range, but are virtually unbounded. 

In [11]:
# Single-keyword query; with the default limit=1 only the top-scoring point is returned.
search('Kafka')

[ScoredPoint(id='9342ea3e-4ea3-4f4b-97dc-c19e6ea4fca2', version=0, score=8.2663765, payload={'text': "✅SOLUTION: pip install confluent-kafka[avro].\nFor some reason, Conda also doesn't include this when installing confluent-kafka via pip.\nMore sources on Anaconda and confluent-kafka issues:\nhttps://github.com/confluentinc/confluent-kafka-python/issues/590\nhttps://github.com/confluentinc/confluent-kafka-python/issues/1221\nhttps://stackoverflow.com/questions/69085157/cannot-import-producer-from-confluent-kafka", 'section': 'Module 6: streaming with kafka', 'question': "ModuleNotFoundError: No module named 'avro'", 'course': 'data-engineering-zoomcamp'}, vector=None, shard_key=None, order_value=None)]

In [21]:
# Pick a reproducible sample document and use its question as the query.
random.seed(1)
random_doc = random.choice(documents)
print(json.dumps(random_doc, indent=2))

# search() can return an empty list; report that instead of raising IndexError.
bm25_hits = search(random_doc['question'])
top_bm25_text = bm25_hits[0].payload['text'] if bm25_hits else '<no results>'
print(f"\nBM25 Search Result:\n{top_bm25_text}")

{
  "text": "Failed to save '<file>': Unable to write file 'vscode-remote://ssh-remote+de-zoomcamp/home/<user>/data_engineering_course/week_2/airflow/dags/<file>' (NoPermissions (FileSystemError): Error: EACCES: permission denied, open '/home/<user>/data_engineering_course/week_2/airflow/dags/<file>')\nYou need to change the owner of the files you are trying to edit via VS Code. You can run the following command to change the ownership.\nssh\nsudo chown -R <user> <path to your directory>",
  "section": "Module 1: Docker and Terraform",
  "question": "GCP VM - Error while saving the file in VM via VS Code",
  "course": "data-engineering-zoomcamp"
}

BM25 Search Result:
My SSH connection to AWS cannot last more than a few minutes, whether via terminal or VS code.
My config:
# Copy Configuration in local nano editor, then Save it!
Host mlops-zoomcamp                                         # ssh connection calling name
User ubuntu                                             # username AWS

BAD !!! The top BM25 hit (an AWS SSH timeout answer) is not the document the question came from. We need to combine this traditional BM25 search with semantic search, creating **hybrid search**.

**Hybrid Search** combines multiple retrieval methods (dense + sparse) in a **multi-stage pipeline**. Often **sequential**:
  1. Retrieve candidates using one method (dense or sparse).
  2. Rerank using another method.

```ascii
┌─────────────┐           ┌─────────────┐
│             │           │             │
│  Retrieval  │ ────────► │  Reranking  │
│             │           │             │
└─────────────┘           └─────────────┘
```

**Example:**

```text
Dense retrieval → top 100 candidates → BM25 rerank → top 10 results
```

In [13]:
# Drop any previous copy first, mirroring the sparse-only collection cell, so this
# cell can be re-run and a later re-upsert (which uses fresh random ids) does not
# pile duplicate points onto old data.
if client.collection_exists(collection_name=sparseAndDense_collection_name):
    client.delete_collection(collection_name=sparseAndDense_collection_name)

# One collection holding both representations of each document:
# a dense 'jina-small' vector (cosine distance) and a sparse 'bm25' vector (IDF modifier).
client.create_collection(
    collection_name=sparseAndDense_collection_name, 
    vectors_config={
        'jina-small': models.VectorParams(
            size=embedding_dim, 
            distance=models.Distance.COSINE
        )
    }, 
    sparse_vectors_config={
        'bm25': models.SparseVectorParams(
            modifier=models.Modifier.IDF
        )
    }
)

True

In [17]:
# Build one point per FAQ entry carrying both named vectors, each derived from
# the same entry text: 'jina-small' (dense model) and 'bm25' (sparse model).
hybrid_points = []
for faq_entry in documents:
    entry_text = faq_entry['text']
    named_vectors = {
        'jina-small': models.Document(text=entry_text, model=embedding_model_handle),
        'bm25': models.Document(text=entry_text, model=sparse_model_handle),
    }
    hybrid_points.append(
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector=named_vectors,
            payload=faq_entry,
        )
    )

# Kept as the last expression so the UpdateResult is shown as the cell output.
client.upsert(collection_name=sparseAndDense_collection_name, points=hybrid_points)

Fetching 5 files: 100%|██████████| 5/5 [00:09<00:00,  1.82s/it]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [18]:
def multi_stage_search(query, limit=1, prefetch_multiplier=10):
    """Two-stage search: dense prefetch, then a 'bm25' query over the candidates.

    Args:
        query: Text to search for.
        limit: Maximum number of points to return (default 1).
        prefetch_multiplier: How many dense candidates to prefetch per requested
            result; the prefetch limit is `prefetch_multiplier * limit`.
            Defaults to 10, the value previously hard-coded.

    Returns:
        The `points` list of the query response, payload included.
    """
    # Stage 1: candidate set from the dense 'jina-small' vector.
    dense_candidates = models.Prefetch(
        query=models.Document(text=query, model=embedding_model_handle),
        using='jina-small',
        limit=prefetch_multiplier * limit,
    )
    # Stage 2: final query with the sparse 'bm25' vector, fed by the prefetch.
    results = client.query_points(
        collection_name=sparseAndDense_collection_name,
        prefetch=[dense_candidates],
        query=models.Document(text=query, model=sparse_model_handle),
        using='bm25',
        limit=limit,
        with_payload=True,
    )
    return results.points

In [None]:
# Same seeded sample document as the BM25 demo, so the results are comparable.
random.seed(1)
random_doc = random.choice(documents)
print(json.dumps(random_doc, indent=2))

# Report an empty result list instead of raising IndexError on [0].
hybrid_hits = multi_stage_search(random_doc['question'])
top_hybrid_text = hybrid_hits[0].payload['text'] if hybrid_hits else '<no results>'
print(f"\nHybrid Search Result:\n{top_hybrid_text}")

{
  "text": "Failed to save '<file>': Unable to write file 'vscode-remote://ssh-remote+de-zoomcamp/home/<user>/data_engineering_course/week_2/airflow/dags/<file>' (NoPermissions (FileSystemError): Error: EACCES: permission denied, open '/home/<user>/data_engineering_course/week_2/airflow/dags/<file>')\nYou need to change the owner of the files you are trying to edit via VS Code. You can run the following command to change the ownership.\nssh\nsudo chown -R <user> <path to your directory>",
  "section": "Module 1: Docker and Terraform",
  "question": "GCP VM - Error while saving the file in VM via VS Code",
  "course": "data-engineering-zoomcamp"
}

Hybrid Search Result:
Failed to save '<file>': Unable to write file 'vscode-remote://ssh-remote+de-zoomcamp/home/<user>/data_engineering_course/week_2/airflow/dags/<file>' (NoPermissions (FileSystemError): Error: EACCES: permission denied, open '/home/<user>/data_engineering_course/week_2/airflow/dags/<file>')
You need to change the owner of

**Fusion Search** combines results from multiple retrieval methods by **merging their scores** into a final ranking.
* Usually **parallel retrieval**: dense and sparse both return results independently, then a formula combines their scores.
* Focus is on **score aggregation**, not sequential reranking.

The **Reciprocal Rank Fusion (RRF)** formula is:

$$
\text{RRF}(d) = \sum_{r \in \text{rankers}} \frac{1}{k + \text{rank}_r(d)}
$$

Where:

* $d$ = document
* $\text{rank}_r(d)$ = the position (rank) of document $d$ in the results from ranker $r$ (1 = best)
* $k$ = a constant (commonly $k = 60$) that ensures even low positions still get a small nonzero contribution.

For the table below, with **two rankers** (dense and sparse) the formula is:

$$
\text{RRF}(d) = \frac{1}{k + \text{rank}_{\text{dense}}(d)} + \frac{1}{k + \text{rank}_{\text{sparse}}(d)}
$$

**Example** (with $k = 60$):

| Document | Dense ranking | Sparse ranking | RRF score | Final ranking |
|----------|---------------|----------------|-----------|---------------|
| D1       | **1**             | 5              | 0.03178   | 2             |
| D2       | 2             | 4              | 0.03175   | 4             |
| D3       | 3             | 2              | **0.03200**  | **1**             |
| D4       | 4             | 3              | 0.03150   | 5             |
| D5       | 5             | **1**              | 0.03178   | 2             |

<br>

→ **All fusion can be hybrid (in the broad sense)**, but **not all hybrid is fusion**.

In [25]:
def fusion_search(query, limit=1, prefetch_multiplier=5):
    """Fusion search: dense and sparse prefetches combined with RRF.

    Args:
        query: Text to search for.
        limit: Maximum number of fused points to return (default 1).
        prefetch_multiplier: How many candidates each prefetch retrieves per
            requested result; each prefetch limit is `prefetch_multiplier * limit`.
            Defaults to 5, the value previously hard-coded.

    Returns:
        The `points` list of the query response, payload included.
    """
    candidate_limit = prefetch_multiplier * limit
    # Two prefetches over the same query text, one per named vector.
    dense_prefetch = models.Prefetch(
        query=models.Document(text=query, model=embedding_model_handle),
        using='jina-small',
        limit=candidate_limit,
    )
    sparse_prefetch = models.Prefetch(
        query=models.Document(text=query, model=sparse_model_handle),
        using='bm25',
        limit=candidate_limit,
    )
    # The final query is a fusion of the two prefetched lists using RRF.
    results = client.query_points(
        collection_name=sparseAndDense_collection_name,
        prefetch=[dense_prefetch, sparse_prefetch],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=limit,
        with_payload=True,
    )
    return results.points

In [30]:
# Same seeded sample document as the earlier demos, so the results are comparable.
random.seed(1)
random_doc = random.choice(documents)
print(json.dumps(random_doc, indent=2))

# Report an empty result list instead of raising IndexError on [0].
fusion_hits = fusion_search(random_doc['question'])
top_fusion_text = fusion_hits[0].payload['text'] if fusion_hits else '<no results>'
print(f"\nFusion Search Result:\n{top_fusion_text}")

{
  "text": "Failed to save '<file>': Unable to write file 'vscode-remote://ssh-remote+de-zoomcamp/home/<user>/data_engineering_course/week_2/airflow/dags/<file>' (NoPermissions (FileSystemError): Error: EACCES: permission denied, open '/home/<user>/data_engineering_course/week_2/airflow/dags/<file>')\nYou need to change the owner of the files you are trying to edit via VS Code. You can run the following command to change the ownership.\nssh\nsudo chown -R <user> <path to your directory>",
  "section": "Module 1: Docker and Terraform",
  "question": "GCP VM - Error while saving the file in VM via VS Code",
  "course": "data-engineering-zoomcamp"
}

Fusion Search Result:
Failed to save '<file>': Unable to write file 'vscode-remote://ssh-remote+de-zoomcamp/home/<user>/data_engineering_course/week_2/airflow/dags/<file>' (NoPermissions (FileSystemError): Error: EACCES: permission denied, open '/home/<user>/data_engineering_course/week_2/airflow/dags/<file>')
You need to change the owner of