# Json + Hybrid search

In [None]:
#!pip install openai python-dotenv tqdm requests beautifulsoup4
#!pip install --upgrade pinecone

# Step 1: Connect to Qdrant

In [3]:
import requests
from tqdm import tqdm  # Progress bars (e.g., looping through files)
from qdrant_client import QdrantClient, models
client = QdrantClient("http://localhost:6333") #connecting to local Qdrant instance
client.get_collections()

  from .autonotebook import tqdm as notebook_tqdm


CollectionsResponse(collections=[CollectionDescription(name='llm-sparse-and-dense'), CollectionDescription(name='llm-rag'), CollectionDescription(name='llm2-sparse')])

# Step 2: Sparse vector search with BM25

In [4]:
docs_url = 'https://raw.githubusercontent.com/Mamdouh-Muhammad/llm/refs/heads/main/rk.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [5]:
# for course in documents_raw:
#         for doc in course['documents']:
#             print(type(doc["text"]), doc["text"])


<class 'list'> ['Reserve the I7 CIP room (tell Stephanie).', 'Check the expected number of students that will enroll.', 'Plan the number of tutors and time slots based on expected student count.', 'Contact students to join as tutors, starting with former tutors.', 'Email previous year students based on RK grades and notify Erika for paperwork.', 'Create StudOn content for the RK semester including Parallelgruppen and Vierergruppen.', 'Use previous year examples as templates (e.g., SS24).', 'Adjust Parallelgruppe registration via StudOn: Übungen → Übungsbereich → Anmeldung zu den Übungen.', 'Edit the Zeitplan to reflect Feiertage and Vorlesungsfrei Tage.', 'Schedule and email tutors about the first Zoom meeting, using DFN Terminplaner.', 'Edit the RK_0_Org file in the lectures folder.', 'Request StudOn access from I7admin for RK lectures.', 'Create rkXYint, rkXY, rkXYabgabe email addresses (deprecated from RK25 onwards).', 'Set correct sender/receiver access for mailing lists.']
<class 

In [6]:
from qdrant_client import models

# Create the collection with specified sparse vector parameters
client.create_collection(
    collection_name="llm2-sparse",
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `llm2-sparse` already exists!"},"time":0.000408052}'

In [7]:
import uuid

points = []

for course in documents_raw:
    for doc in course["documents"]:
        text = doc["text"]
        if isinstance(text, str):
            text_str = text
        elif isinstance(text, list):
            text_str = " ".join(text)
        else:
            raise TypeError(f"Unexpected type for text: {type(text)}")

        point = models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "bm25": models.Document(
                    text=text_str,
                    model="Qdrant/bm25"
                )
            },
            payload={
                "text": text,
                "section": doc["section"],
                "course": course["course"]
            }
        )
        points.append(point)

# ✅ Now send the points
client.upsert(
    collection_name="llm2-sparse",
    points=points
)


Fetching 30 files: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 34.02it/s]


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

# Step 3: Running sparse vector search with BM25

In [8]:
def search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = client.query_points(
        collection_name="llm2-sparse",
        query=models.Document(
            text=query,
            model="Qdrant/bm25",
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results.points

In [9]:
results = search("Tutor")
print(results[0].payload["text"])

['Attend tutor meeting (usually Monday); assign slots and gather Benutzer IDs.', 'Fill and submit Zeiterfassung-Formular monthly.', 'Insert weekly tutor meetings and own slots into SoGo calendar.', 'Attend the lecture and introduce exercises, encourage registration on Campo for both exams and exercises.', 'Provide tutors access to exercise folder.', 'Ensure whiteboard markers and erasers are available.', 'Test room cables/projector/HDMI connection.', 'Find suitable date for weekly tutor meeting.', 'Add tutors as course tutors on StudOn.', 'Upload Zeitplan on StudOn for students.', 'Print room schedule from Campo and hang in front of the respective room.', 'Check lecture slides for updates and upload to StudOn if changed.', 'Use the Excel sheet from SVN for homework evaluations if needed.', 'Send tutors the Zeiterfassung form to track their time.', 'Provide tutors with exercise videos as refreshers.', 'Check availability of lecture videos for students (at end or with buffer).']


In [None]:
results[0].score

In [10]:
import random
import json

random.seed(22)

course = random.choice(documents_raw)
course_piece = random.choice(course["documents"])
print(json.dumps(course_piece, indent=2))

{
  "section": "After the Semester",
  "question": "What actions are necessary after the semester ends?",
  "text": [
    "Collect Zeiterfassung forms from tutors.",
    "Ask Peter to create and share two Overleaf .tex files (exam and solution).",
    "Export student status from StudOn and upload to Campo.",
    "Ensure no 'pending' status, and notify students via email for confirmation.",
    "Announce Fragestunde date on StudOn (typically 1-2 weeks before the exam).",
    "Determine how many students registered for the exam to plan hall and supervision.",
    "Check availability of enough paper cartons/sheets for the exams.",
    "Verify toner levels for printing via printer screen or online portal.",
    "Get exam approval via email from the professor before printing."
  ]
}


In [41]:
results = search(course_piece["question"])
print(results[0].payload["text"])

['Attend tutor meeting (usually Monday); assign slots and gather Benutzer IDs.', 'Fill and submit Zeiterfassung-Formular monthly.', 'Insert weekly tutor meetings and own slots into SoGo calendar.', 'Attend the lecture and introduce exercises, encourage registration on Campo for both exams and exercises.', 'Provide tutors access to exercise folder.', 'Ensure whiteboard markers and erasers are available.', 'Test room cables/projector/HDMI connection.', 'Find suitable date for weekly tutor meeting.', 'Add tutors as course tutors on StudOn.', 'Upload Zeitplan on StudOn for students.', 'Print room schedule from Campo and hang in front of the respective room.', 'Check lecture slides for updates and upload to StudOn if changed.', 'Use the Excel sheet from SVN for homework evaluations if needed.', 'Send tutors the Zeiterfassung form to track their time.', 'Provide tutors with exercise videos as refreshers.', 'Check availability of lecture videos for students (at end or with buffer).']


In [11]:
# Create the collection with both vector types
client.create_collection(
    collection_name="llm-sparse-and-dense",
    vectors_config={
        # Named dense vector for jinaai/jina-embeddings-v2-small-en
        "jina-small": models.VectorParams(
            size=512,
            distance=models.Distance.COSINE,
        ),
    },
    sparse_vectors_config={
        "bm25": models.SparseVectorParams(
            modifier=models.Modifier.IDF,
        )
    }
)

UnexpectedResponse: Unexpected Response: 409 (Conflict)
Raw response content:
b'{"status":{"error":"Wrong input: Collection `llm-sparse-and-dense` already exists!"},"time":0.000212722}'

In [12]:
import uuid
from qdrant_client import QdrantClient, models

client.upsert(
    collection_name="llm-sparse-and-dense",
    points=[
        models.PointStruct(
            id=uuid.uuid4().hex,
            vector={
                "jina-small": models.Document(
                    text=(
                        doc["text"]
                        if isinstance(doc["text"], str)
                        else " ".join(doc["text"])
                    ),
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                "bm25": models.Document(
                    text=(
                        doc["text"]
                        if isinstance(doc["text"], str)
                        else " ".join(doc["text"])
                    ),
                    model="Qdrant/bm25",
                ),
            },
            payload={
                "text": doc["text"],
                "section": doc["section"],
                "course": course["course"],
            }
        )
        for course in documents_raw
        for doc in course["documents"]
    ]
)


Fetching 5 files: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:01<00:00,  3.23it/s]


UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [45]:
def multi_stage_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = client.query_points(
        collection_name="llm-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                # Prefetch ten times more results, then
                # expected to return, so we can really rerank
                limit=(10 * limit),
            ),
        ],
        query=models.Document(
            text=query,
            model="Qdrant/bm25", 
        ),
        using="bm25",
        limit=limit,
        with_payload=True,
    )

    return results.points

In [46]:
print(json.dumps(course_piece, indent=2))

{
  "section": "After the Semester",
  "question": "What actions are necessary after the semester ends?",
  "text": [
    "Collect Zeiterfassung forms from tutors.",
    "Ask Peter to create and share two Overleaf .tex files (exam and solution).",
    "Export student status from StudOn and upload to Campo.",
    "Ensure no 'pending' status, and notify students via email for confirmation.",
    "Announce Fragestunde date on StudOn (typically 1-2 weeks before the exam).",
    "Determine how many students registered for the exam to plan hall and supervision.",
    "Check availability of enough paper cartons/sheets for the exams.",
    "Verify toner levels for printing via printer screen or online portal.",
    "Get exam approval via email from the professor before printing."
  ]
}


In [47]:
results = multi_stage_search(course_piece["question"])
print(results[0].payload["text"])

['Attend tutor meeting (usually Monday); assign slots and gather Benutzer IDs.', 'Fill and submit Zeiterfassung-Formular monthly.', 'Insert weekly tutor meetings and own slots into SoGo calendar.', 'Attend the lecture and introduce exercises, encourage registration on Campo for both exams and exercises.', 'Provide tutors access to exercise folder.', 'Ensure whiteboard markers and erasers are available.', 'Test room cables/projector/HDMI connection.', 'Find suitable date for weekly tutor meeting.', 'Add tutors as course tutors on StudOn.', 'Upload Zeitplan on StudOn for students.', 'Print room schedule from Campo and hang in front of the respective room.', 'Check lecture slides for updates and upload to StudOn if changed.', 'Use the Excel sheet from SVN for homework evaluations if needed.', 'Send tutors the Zeiterfassung form to track their time.', 'Provide tutors with exercise videos as refreshers.', 'Check availability of lecture videos for students (at end or with buffer).']


In [14]:
def rrf_search(query: str, limit: int = 1) -> list[models.ScoredPoint]:
    results = client.query_points(
        collection_name="llm-sparse-and-dense",
        prefetch=[
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="jinaai/jina-embeddings-v2-small-en",
                ),
                using="jina-small",
                limit=(5 * limit),
            ),
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="bm25",
                limit=(5 * limit),
            ),
        ],
        # Fusion query enables fusion on the prefetched results
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        with_payload=True,
    )

    return results.points

In [15]:
results = rrf_search(course_piece["question"])
print(json.dumps(course_piece, indent=2))
print(results[0].payload["text"])

{
  "section": "After the Semester",
  "question": "What actions are necessary after the semester ends?",
  "text": [
    "Collect Zeiterfassung forms from tutors.",
    "Ask Peter to create and share two Overleaf .tex files (exam and solution).",
    "Export student status from StudOn and upload to Campo.",
    "Ensure no 'pending' status, and notify students via email for confirmation.",
    "Announce Fragestunde date on StudOn (typically 1-2 weeks before the exam).",
    "Determine how many students registered for the exam to plan hall and supervision.",
    "Check availability of enough paper cartons/sheets for the exams.",
    "Verify toner levels for printing via printer screen or online portal.",
    "Get exam approval via email from the professor before printing."
  ]
}
['Reserve the I7 CIP room (tell Stephanie).', 'Check the expected number of students that will enroll.', 'Plan the number of tutors and time slots based on expected student count.', 'Contact students to join as 