In [1]:
import json
import pandas as pd
from tqdm.auto import tqdm
from sentence_transformers import SentenceTransformer
from qdrant_client import QdrantClient
from qdrant_client.http import models

In [2]:
# Load the dataset
with open('./../dataset/medical_qa_documents_with_id.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [3]:
documents = []

for docs_info in docs_raw:
    for doc in docs_info['documents']:
        documents.append(doc)

## Sparse vector search with BM25

In [4]:
# Connect Qdrant Client

qd_client = QdrantClient(
    url="http://localhost:6333",
    timeout=60  # seconds (1 minutes)
)

collection_name = "medical-faq-sparse"

In [7]:
from qdrant_client import models

# Create the collection with specified sparse vector parameters
qd_client.create_collection(
    collection_name=collection_name,
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

True

In [21]:
from tqdm import tqdm
import uuid


BATCH_SIZE = 200  # adjust depending on dataset size

all_points = [
    models.PointStruct(
        id=uuid.uuid4().hex,
        vector={
            "bm25": models.Document(
                text=doc["answer"],
                model="Qdrant/bm25",
            ),
        },
        payload={
            "answer": doc["answer"],
            "question": doc["question"],
            "qtype": doc["qtype"],
            "id": doc["id"]
        }
    )
    for docs_info in docs_raw
    for doc in docs_info['documents']
]

# Send in chunks
for i in tqdm(range(0, len(all_points), BATCH_SIZE)):
    batch = all_points[i:i+BATCH_SIZE]
    qd_client.upsert(collection_name=collection_name, points=batch)


100%|███████████████████████████████████████████| 75/75 [00:25<00:00,  2.99it/s]


## Running sparse vector search with BM25

In [9]:
def bm25_search(query: str, limit: int = 4):
    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return results.points


In [10]:
results = bm25_search("chroma")
results

[]

In [11]:
results = bm25_search("hiv")
for r in results:
    print(f"Score: {r.score:.3f} | answer: {r.payload['answer'][:120]}...\n")


Score: 9.776 | answer: Summary : In the early 1980s, when the HIV/AIDS epidemic began, patients rarely lived longer than a few years. But today...

Score: 9.756 | answer: Summary : HIV, the human immunodeficiency virus, kills or damages cells of the body's immune system. The most advanced s...

Score: 9.415 | answer: HIV stands for human immunodeficiency virus. It kills or damages the body's immune system cells. AIDS stands for acquire...

Score: 9.409 | answer: Infection with HIV is serious. But the outlook for people with HIV/AIDS is improving. If you are infected with HIV, ther...



### Natural language like queries


In [12]:
import random
import json

random.seed(12001)

docs_records = random.choice(docs_raw)
docs_record = random.choice(docs_records["documents"])
print(json.dumps(docs_record, indent=2))

{
  "answer": "Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause health complications such as apnea, obesity, recurrent ear infections, and lordosis of the spine. Achondroplasia is caused by mutations in the FGFR3 gene. It is inherited in an autosomal dominant fashion.",
  "question": "What is (are) Achondroplasia?",
  "qtype": "information",
  "id": "aed895b1"
}


In [13]:
results = bm25_search(docs_record["question"])
print(results[0].payload["answer"])

These resources address the diagnosis or management of achondroplasia: - Gene Review: Gene Review: Achondroplasia - GeneFacts: Achondroplasia: Diagnosis - GeneFacts: Achondroplasia: Management - Genetic Testing Registry: Achondroplasia - MedlinePlus Encyclopedia: Achondroplasia - MedlinePlus Encyclopedia: Hydrocephalus - MedlinePlus Encyclopedia: Lordosis - MedlinePlus Encyclopedia: Spinal Stenosis These resources from MedlinePlus offer information about the diagnosis and management of various health conditions: - Diagnostic Tests - Drug Therapy - Surgery and Rehabilitation - Genetic Counseling - Palliative Care


## Sparse and Dense Embedding Vector search with BM25 and embedding model


##### Embedding model

In [14]:
EMBEDDING_DIMENSIONALITY=384

# load embedding model
model_name = 'multi-qa-MiniLM-L6-cos-v1'
embed_model = SentenceTransformer(model_name)

In [15]:
collection_name = "medical-faq-sparse-and-dense"

qd_client.recreate_collection(
    collection_name=collection_name,
    vectors_config={
        "dense-vector": models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,
            distance=models.Distance.COSINE
        )
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)


### Generate Embeeding Vector ONLY if it's not already generated

In [None]:
# no need to run embedding is already generated and locally saved
vectors = []

for doc in tqdm(documents):
    question = doc['question']
    answer = doc['answer']
    vector = embed_model.encode(question + ' ' + answer)
    vectors.append(vector)

In [None]:
# Save
with open("./../dataset/documents-vectors-multi-qa-MiniLM-L6-cos-v1.pkl", "wb") as f:
    pickle.dump(vectors, f)

print("Saved vectors locally")

### Load  the embedding vectors if it's already generated

In [16]:
import pickle
with open("./../dataset/documents-vectors-multi-qa-MiniLM-L6-cos-v1.pkl", "rb") as f:
    vectors = pickle.load(f)

print(f"Loaded {len(vectors)} vectors and {len(documents)} documents")

Loaded 14979 vectors and 14979 documents


#### embedding and sparse vector ingestion in qdrant

In [30]:
from tqdm import tqdm
import uuid

BATCH_SIZE = 200  # adjust depending on dataset size

# Prepare all points
all_points = []
vector_index = 0  # track embedding index globally across all docs

for docs_info in docs_raw:
    for doc in docs_info["documents"]:
        # make sure each document gets the correct vector
        dense_vector = vectors[vector_index]
        vector_index += 1

        all_points.append(
            models.PointStruct(
                id=uuid.uuid4().hex,  # generate a unique valid UUID for each doc
                vector={
                    # dense embedding (note: typo fixed from "desnse-vector")
                    "dense-vector": dense_vector.tolist(),
                    
                    # sparse embedding
                    "bm25": models.Document(
                        text=doc["answer"],
                        model="Qdrant/bm25",
                    ),
                },
                payload={
                    "answer": doc["answer"],
                    "question": doc["question"],
                    "qtype": doc["qtype"],
                    "id": doc["id"]
                }
            )
        )


In [31]:
for i in tqdm(range(0, len(all_points), BATCH_SIZE), desc="Uploading to Qdrant"):
    batch = all_points[i:i + BATCH_SIZE]
    qd_client.upsert(collection_name=collection_name, points=batch)

Uploading to Qdrant: 100%|██████████████████████| 75/75 [00:49<00:00,  1.50it/s]


#### Running multi-stage search with embedding vector and BM25

In [17]:
def multi_stage_search(query: str, limit: int = 1):
    results = qd_client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query= embed_model.encode([query])[0].tolist(),
                using="dense-vector",
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return results.points


In [18]:
print(json.dumps(docs_record, indent=2))

{
  "answer": "Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause health complications such as apnea, obesity, recurrent ear infections, and lordosis of the spine. Achondroplasia is caused by mutations in the FGFR3 gene. It is inherited in an autosomal dominant fashion.",
  "question": "What is (are) Achondroplasia?",
  "qtype": "information",
  "id": "aed895b1"
}


In [19]:
results = multi_stage_search(docs_record["question"])
print(results[0].payload["answer"])

Achondroplasia is a form of short-limbed dwarfism. The word achondroplasia literally means "without cartilage formation." Cartilage is a tough but flexible tissue that makes up much of the skeleton during early development. However, in achondroplasia the problem is not in forming cartilage but in converting it to bone (a process called ossification), particularly in the long bones of the arms and legs. Achondroplasia is similar to another skeletal disorder called hypochondroplasia, but the features of achondroplasia tend to be more severe. All people with achondroplasia have short stature. The average height of an adult male with achondroplasia is 131 centimeters (4 feet, 4 inches), and the average height for adult females is 124 centimeters (4 feet, 1 inch). Characteristic features of achondroplasia include an average-size trunk, short arms and legs with particularly short upper arms and thighs, limited range of motion at the elbows, and an enlarged head (macrocephaly) with a prominen

### Building Hybrid Search

#### 1.Fusion

In [20]:
def rrf_search(query: str, limit: int = 3):
    results = qd_client.query_points(
        collection_name=collection_name,
        prefetch=[
            models.Prefetch(
                query= embed_model.encode([query])[0].tolist(),
                using="dense-vector",
                limit=(10 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )
    return results.points


In [21]:
print(json.dumps(docs_record, indent=2))

{
  "answer": "Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause health complications such as apnea, obesity, recurrent ear infections, and lordosis of the spine. Achondroplasia is caused by mutations in the FGFR3 gene. It is inherited in an autosomal dominant fashion.",
  "question": "What is (are) Achondroplasia?",
  "qtype": "information",
  "id": "aed895b1"
}


In [22]:
results = rrf_search(docs_record["question"])
print(results[0].payload["answer"])

Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause health complications such as apnea, obesity, recurrent ear infections, and lordosis of the spine. Achondroplasia is caused by mutations in the FGFR3 gene. It is inherited in an autosomal dominant fashion.


#### 2. Reranking

In [23]:
# Load the reranker model
from sentence_transformers import CrossEncoder

reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
print("Reranker model loaded successfully.")


Reranker model loaded successfully.


In [24]:
def rerank_results(query: str, results, top_k: int = 5):
    """
    Rerank results from hybrid search using cross-encoder.
    Args:
        query: str
        results: list of models.ScoredPoint (from rrf_search)
        top_k: int
    Returns:
        List of tuples: (payload_dict, rerank_score)
    """
    # Extract candidate payloads
    candidates = [res.payload for res in results]
    pairs = [[query, c["answer"]] for c in candidates]

    # Compute relevance scores from reranker
    scores = reranker.predict(pairs)

    # Combine payloads with scores
    reranked = list(zip(candidates, scores))

    # Sort by rerank score in descending order
    reranked = sorted(reranked, key=lambda x: x[1], reverse=True)

    return reranked[:top_k]


In [25]:
docs_record

{'answer': 'Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause health complications such as apnea, obesity, recurrent ear infections, and lordosis of the spine. Achondroplasia is caused by mutations in the FGFR3 gene. It is inherited in an autosomal dominant fashion.',
 'question': 'What is (are) Achondroplasia?',
 'qtype': 'information',
 'id': 'aed895b1'}

In [26]:
# RUN Hybrid + ReRanking

query = docs_record["question"]

# Step 1: Hybrid search
hybrid_results = rrf_search(query)

# Step 2: Rerank top 3 retrieved documents
reranked = rerank_results(query, hybrid_results, top_k=3)


print(f"Query: {query}\n")
for i, (payload, score) in enumerate(reranked, 1):
    print(f"Top {i} — Rerank Score: {score:.4f}\n{payload['answer'][:300]}...\nid: {payload['id']}\n")


Query: What is (are) Achondroplasia?

Top 1 — Rerank Score: 8.9235
Achondroplasia is a disorder of bone growth that prevents the changing of cartilage (particularly in the long bones of the arms and legs) to bone. It is characterized by dwarfism, limited range of motion at the elbows, large head size, small fingers, and normal intelligence. Achondroplasia can cause...
id: aed895b1

Top 2 — Rerank Score: 7.5208
Achondroplasia is the most common type of short-limbed dwarfism. The condition occurs in 1 in 15,000 to 40,000 newborns....
id: 182ff118

Top 3 — Rerank Score: 6.8525
Achondroplasia is a form of short-limbed dwarfism. The word achondroplasia literally means "without cartilage formation." Cartilage is a tough but flexible tissue that makes up much of the skeleton during early development. However, in achondroplasia the problem is not in forming cartilage but in co...
id: 0d911b0e



In [27]:
# - Evaluate reranking
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np


# Embed query and reranked result
query_emb = embed_model.encode([query])
reranked_embs = embed_model.encode([answer for answer, _ in reranked])
similarities = cosine_similarity(query_emb, reranked_embs)[0]

print("Cosine similarities of reranked results:")
for i, sim in enumerate(similarities, 1):
    print(f"Doc {i}: {sim:.4f}")


Cosine similarities of reranked results:
Doc 1: 0.7260
Doc 2: 0.5901
Doc 3: 0.6373


## Evaluate Hybrid Search and Reranking

In [28]:
df_ground_truth = pd.read_csv('./../dataset/search_ground-truth-data.csv')

In [29]:
ground_truth_dict = df_ground_truth.to_dict(orient='records')

In [30]:
def hit_rate(relevance_total):
    cnt = 0

    for line in relevance_total:
        if True in line:
            cnt = cnt + 1

    return cnt / len(relevance_total)

In [31]:
def mrr(relevance_total):
    total_score = 0.0

    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1)

    return total_score / len(relevance_total)

In [32]:
def evaluate_search(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        payloads = [p.payload for p in results]
        relevance = [d['id'] == doc_id for d in payloads]
        relevance_total.append(relevance)

    return {
        'hit_rate': hit_rate(relevance_total),
        'mrr': mrr(relevance_total),
    }

In [33]:
def evaluate_rerank(ground_truth, hybrid_search_fn, rerank_fn, top_k=1):
    relevance_total = []

    for q in tqdm(ground_truth):
        query = q["question"]
        doc_id = q["document"]

        # Step 1: Run hybrid search (RRF)
        hybrid_results = hybrid_search_fn(query)

        # Step 2: Rerank results
        reranked = rerank_fn(query, hybrid_results, top_k=top_k)

        # Step 3: Extract payloads only (ignore score)
        payloads = [payload for payload, score in reranked]

        # Step 4: Check relevance: True if ground-truth doc matches
        relevance = [p["id"] == doc_id for p in payloads]
        relevance_total.append(relevance)

    return {
        "hit_rate": hit_rate(relevance_total),
        "mrr": mrr(relevance_total),
    }


In [38]:
# Qdrant multi-stage Evaluation
evaluate_search(ground_truth_dict, lambda q: multi_stage_search(q['question'], limit=5))

100%|█████████████████████████████████████████| 200/200 [00:11<00:00, 17.57it/s]


{'hit_rate': 0.73, 'mrr': 0.5594999999999999}

In [36]:
# Qdrant HybridSearch Evaluation
evaluate_search(ground_truth_dict, lambda q: rrf_search(q['question'], limit=5))

100%|█████████████████████████████████████████| 200/200 [00:14<00:00, 14.00it/s]


{'hit_rate': 0.885, 'mrr': 0.6524047619047616}

In [37]:
evaluate_rerank(
    ground_truth=ground_truth_dict,
    hybrid_search_fn=lambda q: rrf_search(q, limit=10),
    rerank_fn=rerank_results,
    top_k=3
)

100%|█████████████████████████████████████████| 200/200 [38:31<00:00, 11.56s/it]


{'hit_rate': 0.755, 'mrr': 0.6483333333333334}

In [58]:
evaluate_rerank(
    ground_truth=ground_truth_dict,
    hybrid_search_fn=lambda q: rrf_search(q, limit=10),
    rerank_fn=rerank_results,
    top_k=5
)

100%|█████████████████████████████████████████| 200/200 [31:04<00:00,  9.32s/it]


{'hit_rate': 0.83, 'mrr': 0.6660833333333334}