In [None]:
# To reach localhost:6333 from your own machine, the Codespace must be opened in desktop VS Code:
# open the Codespace in the browser, press F1, then run "Codespaces: Open in VS Code".

# JSON + Hybrid search

In [None]:
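#!pip install openai python-dotenv tqdm requests beautifulsoup4
#!pip install --upgrade pinecone
# The cells below import qdrant_client and embed text via models.Document;
# that presumably needs the fastembed extra installed -- verify for your environment:
#!pip install "qdrant-client[fastembed]"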

# Step 1: Connect to Qdrant

In [None]:
import requests
from tqdm import tqdm  # Progress bars (e.g., looping through files)
from qdrant_client import QdrantClient, models
# Connect to the Qdrant instance running locally on port 6333.
client = QdrantClient(location="http://localhost:6333")
# Listing the collections doubles as a quick connectivity check.
client.get_collections()

# Step 2: Sparse vector search with BM25

In [None]:
# Download the course documents (a JSON list of courses, each holding a
# "documents" list) from GitHub.
docs_url = 'https://raw.githubusercontent.com/Mamdouh-Muhammad/llm/refs/heads/main/rk.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on 4xx/5xx instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

In [None]:
# for course in documents_raw:
#         for doc in course['documents']:
#             print(type(doc["text"]), doc["text"])


In [None]:
from qdrant_client import models

# Create the BM25 sparse-vector collection only when it does not exist yet,
# so this cell can be re-run safely (create_collection fails if the
# collection is already present).
if not client.collection_exists("llm2-sparse"):
    client.create_collection(
        collection_name="llm2-sparse",
        sparse_vectors_config={
            # Named sparse vector "bm25" with the IDF modifier enabled.
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            )
        }
    )

In [None]:
import uuid

# Build one point per document and upload them to the "llm2-sparse" collection.
points = []

for course_idx, course in enumerate(documents_raw):
    for doc_idx, doc in enumerate(course["documents"]):
        text = doc["text"]
        # The embedding model needs a single string; the text may be stored
        # either as a string or as a list of strings.
        if isinstance(text, str):
            text_str = text
        elif isinstance(text, list):
            text_str = " ".join(text)
        else:
            raise TypeError(f"Unexpected type for text: {type(text)}")

        point = models.PointStruct(
            # Deterministic ID derived from the document's position in the
            # dataset: re-running this cell overwrites the same points
            # instead of inserting duplicates (random uuid4 IDs would
            # duplicate every document on each run).
            id=uuid.uuid5(
                uuid.NAMESPACE_URL,
                f"llm2-sparse/{course_idx}/{doc_idx}",
            ).hex,
            vector={
                "bm25": models.Document(
                    text=text_str,
                    model="Qdrant/bm25"
                )
            },
            # The payload keeps the text in its original form (string or list).
            payload={
                "text": text,
                "section": doc["section"],
                "course": course["course"]
            }
        )
        points.append(point)

# Send all points in a single upsert request.
client.upsert(
    collection_name="llm2-sparse",
    points=points
)


# Step 3: Running sparse vector search with BM25

In [None]:
def search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Query the "llm2-sparse" collection using its "bm25" sparse vector.

    Args:
        query: Free-text query, embedded with the "Qdrant/bm25" model.
        limit: Maximum number of points to return.

    Returns:
        The points of the query response, with payloads included.
    """
    bm25_query = models.Document(text=query, model="Qdrant/bm25")
    response = client.query_points(
        collection_name="llm2-sparse",
        query=bm25_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Single-keyword query against the sparse collection; print the text of the first hit.
results = search(query="Tutor")
print(results[0].payload["text"])

In [None]:
# Display the score of the first hit currently held in `results`.
results[0].score

In [None]:
import random
import json

# Fixed seed so the same sample is drawn each time this cell runs.
random.seed(22)

# Pick a random course, then a random document inside that course,
# and print the chosen document as indented JSON.
course = random.choice(documents_raw)
course_piece = random.choice(course["documents"])
print(json.dumps(course_piece, indent=2))

In [None]:
# Use the sampled document's question as the query and print the text of the first hit.
results = search(query=course_piece["question"])
print(results[0].payload["text"])

In [None]:
# Create the collection with both vector types, but only when it does not
# exist yet, so this cell can be re-run safely (create_collection fails if
# the collection is already present).
if not client.collection_exists("llm-sparse-and-dense"):
    client.create_collection(
        collection_name="llm-sparse-and-dense",
        vectors_config={
            # Named dense vector for jinaai/jina-embeddings-v2-small-en
            "jina-small": models.VectorParams(
                size=512,
                distance=models.Distance.COSINE,
            ),
        },
        sparse_vectors_config={
            # Named sparse vector "bm25" with the IDF modifier enabled.
            "bm25": models.SparseVectorParams(
                modifier=models.Modifier.IDF,
            )
        }
    )

In [None]:
import uuid
from qdrant_client import QdrantClient, models

def _hybrid_point(course_idx, doc_idx, course_name, doc):
    """Build one point carrying both a dense and a BM25 vector for `doc`.

    Args:
        course_idx: Position of the course in `documents_raw`.
        doc_idx: Position of the document inside its course.
        course_name: Value stored under the "course" payload key.
        doc: Document dict; its "text" and "section" keys are read.

    Returns:
        A `models.PointStruct` ready to be upserted.
    """
    raw_text = doc["text"]
    # Both embedding models need a single string; non-string text is joined
    # with spaces. Computed once and shared by both vectors.
    text_str = raw_text if isinstance(raw_text, str) else " ".join(raw_text)
    return models.PointStruct(
        # Deterministic ID derived from the document's position in the
        # dataset: re-running the upsert overwrites the same points instead
        # of inserting duplicates (random uuid4 IDs would duplicate every
        # document on each run).
        id=uuid.uuid5(
            uuid.NAMESPACE_URL,
            f"llm-sparse-and-dense/{course_idx}/{doc_idx}",
        ).hex,
        vector={
            "jina-small": models.Document(
                text=text_str,
                model="jinaai/jina-embeddings-v2-small-en",
            ),
            "bm25": models.Document(
                text=text_str,
                model="Qdrant/bm25",
            ),
        },
        # The payload keeps the text in its original form.
        payload={
            "text": raw_text,
            "section": doc["section"],
            "course": course_name,
        },
    )


# A comprehension (rather than a top-level loop) keeps the loop variables
# out of the notebook namespace, so the sampled `course` is not overwritten.
client.upsert(
    collection_name="llm-sparse-and-dense",
    points=[
        _hybrid_point(course_idx, doc_idx, course["course"], doc)
        for course_idx, course in enumerate(documents_raw)
        for doc_idx, doc in enumerate(course["documents"])
    ]
)


In [None]:
def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Two-stage query on "llm-sparse-and-dense": dense prefetch, then BM25.

    Args:
        query: Free-text query, embedded once per model.
        limit: Maximum number of points to return.

    Returns:
        The points of the query response, with payloads included.
    """
    # Prefetch ten times more candidates than we intend to return,
    # so the second stage has something to rerank.
    dense_candidates = models.Prefetch(
        query=models.Document(
            text=query,
            model="jinaai/jina-embeddings-v2-small-en",
        ),
        using="jina-small",
        limit=(10 * limit),
    )
    bm25_query = models.Document(text=query, model="Qdrant/bm25")

    response = client.query_points(
        collection_name="llm-sparse-and-dense",
        prefetch=[dense_candidates],
        query=bm25_query,
        using="bm25",
        limit=limit,
        with_payload=True,
    )
    return response.points

In [None]:
# Print the sampled document again as indented JSON.
print(json.dumps(course_piece, indent=2))

In [None]:
# Run the two-stage search with the sampled question and print the text of the first hit.
results = multi_stage_search(query=course_piece["question"])
print(results[0].payload["text"])

In [None]:
def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    """Hybrid query on "llm-sparse-and-dense" fused with Reciprocal Rank Fusion.

    A dense prefetch ("jina-small") and a sparse prefetch ("bm25") each fetch
    `5 * limit` candidates; the two lists are then fused with RRF.

    Args:
        query: Free-text query, embedded once per model.
        limit: Maximum number of fused points to return.

    Returns:
        The points of the query response, with payloads included.
    """
    results = client.query_points(
        collection_name="llm-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        # Honour the caller's limit; without it the fused query falls back to
        # the default result count and the `limit` argument is ignored.
        limit=limit,
        with_payload=True,
    )

    return results.points

In [None]:
# Run the RRF hybrid search with the sampled question, then print the sampled
# document followed by the text of the first hit.
results = rrf_search(query=course_piece["question"])
print(json.dumps(course_piece, indent=2))
print(results[0].payload["text"])