In [None]:
import time

# Keep-alive loop: prints a heartbeat message, sleeps, and repeats
# (presumably to stop a hosted session from being reclaimed as idle -- confirm).
# NOTE: the loop has no exit condition. While it runs the kernel stays busy, so
# no later cell can execute until this cell is interrupted manually.
while True:
    print("Keeping the notebook alive...")
    time.sleep(1500)  # 1500 s = 25 minutes (not 5; 5 minutes would be 300) -- TODO confirm intended interval


In [None]:
# docker run -p 6333:6333 -p 6334:6334 \
#    -v "$(pwd)/qdrant_storage:/qdrant/storage:z" \
#     qdrant/qdrant

# JSON + Qdrant + OpenAI API

In [None]:
import requests
from tqdm import tqdm  # Progress bars (e.g., looping through files)

# Connect to Qdrant

In [None]:
from qdrant_client import QdrantClient, models
# Shared client used by every later cell (collection creation, upsert, queries).
client = QdrantClient("http://localhost:6333")  # HTTP endpoint of the local Qdrant instance

# Study the Dataset

In [None]:
# Download the dataset (a JSON document) that the rest of the notebook indexes.
docs_url = 'https://raw.githubusercontent.com/Mamdouh-Muhammad/llm/refs/heads/main/rk.json'

# A timeout keeps the cell from hanging indefinitely if the host is unreachable.
docs_response = requests.get(docs_url, timeout=30)

# Fail loudly on an HTTP error (e.g. 404) instead of surfacing it later as a
# confusing JSON decoding error.
docs_response.raise_for_status()

documents_raw = docs_response.json()

In [None]:
# Display the first top-level entry to inspect its structure; the ingestion
# cell further down reads the 'course' and 'documents' keys of each entry.
documents_raw[0]

# Choosing the Embedding Model with FastEmbed

In [None]:
from fastembed import TextEmbedding
# TextEmbedding.list_supported_models()

In [None]:
# import json

# EMBEDDING_DIMENSIONALITY = 512

# for model in TextEmbedding.list_supported_models():
#     if model["dim"] == EMBEDDING_DIMENSIONALITY:
#         print(json.dumps(model, indent=2))

In [None]:
# Name of the embedding model. The search helpers defined later pass this same
# handle when they submit query text, so it is the single source of truth for
# the query-side model.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# FastEmbed text-embedding model instance for that handle.
model = TextEmbedding(model_name=model_handle)

# Create a Collection

In [None]:
# Define the collection name
collection_name = "llm-rag"

# Vector size produced by "jinaai/jina-embeddings-v2-small-en".
# Defined here because the only other definition in this notebook is inside a
# commented-out cell, which made this cell raise NameError on a fresh kernel.
EMBEDDING_DIMENSIONALITY = 512

# Create the collection only when it is missing, so re-running this cell (or the
# whole notebook) does not fail because the collection already exists.
if not client.collection_exists(collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=models.VectorParams(
            size=EMBEDDING_DIMENSIONALITY,  # Dimensionality of the vectors
            distance=models.Distance.COSINE  # Distance metric for similarity search
        )
    )

# Create, Embed & Insert Points into the Collection

In [None]:
# Build the embedding model from `model_handle` (instead of repeating the model
# name literal) so indexed vectors and query vectors always use the same model.
embedding_model = TextEmbedding(model_name=model_handle)

# Flatten the nested course -> documents structure into (text, payload) pairs.
records = []
for course in documents_raw:
    for doc in course['documents']:
        text_raw = doc["text"]
        # The text field may be a single string or a list of strings; lists are
        # joined with spaces so one string is embedded per document.
        text_str = text_raw if isinstance(text_raw, str) else " ".join(text_raw)
        records.append((
            text_str,
            {
                "text": doc['text'],
                "section": doc['section'],
                "course": course['course']
            },  # save all needed metadata fields
        ))

# Embed every text in one batched call rather than one model call per document.
vectors = embedding_model.embed([text for text, _ in records])

# Sequential integer ids starting at 0; `enumerate` avoids a hand-maintained
# counter named `id`, which shadowed the builtin. `.tolist()` turns each numpy
# vector into a plain list of Python floats for the client.
points = [
    models.PointStruct(id=point_id, vector=vector.tolist(), payload=payload)
    for point_id, ((_, payload), vector) in enumerate(zip(records, vectors))
]

In [None]:
# Send all prepared points to the collection in a single upsert request.
client.upsert(
    collection_name=collection_name,
    points=points
)

# 6. Running a Similarity Search

In [None]:
def search(query, limit=1):
    """Run a similarity search over the whole collection.

    Args:
        query: Free-text query string.
        limit: Maximum number of closest matches to return (default 1).

    Returns:
        The response object from ``client.query_points``; matches are read
        from its ``points`` attribute, each carrying its stored payload.
    """
    # Wrapping the text in a Document hands embedding over to the client, which
    # uses the model named by `model_handle`.
    query_document = models.Document(text=query, model=model_handle)

    return client.query_points(
        collection_name=collection_name,
        query=query_document,
        limit=limit,        # top closest matches
        with_payload=True,  # include stored metadata in the results
    )

In [None]:
# `json` is imported here because no other active cell imports it; without this
# the json.dumps call below raises NameError on a fresh kernel.
import json
import random

# Pick a random course, then a random document from it, and pretty-print it.
course = random.choice(documents_raw)
course_piece = random.choice(course['documents'])
print(json.dumps(course_piece, indent=2))

In [None]:
# Use the sampled document's own question as the query (default limit=1) and
# display the raw response object.
result = search(course_piece['question'])
result

In [None]:
# Compare the top retrieved text with the text of the sampled document.
# NOTE: result.points[0] raises IndexError if the search returned no points.
print(f"Question:\n{course_piece['question']}\n")
print("Top Retrieved Answer:\n{}\n".format(result.points[0].payload['text']))
print("Original Answer:\n{}".format(course_piece['text']))

In [None]:
# Ad-hoc query: print the payload text of the single closest match.
print(search("What should be done toward the end of the semester?").points[0].payload['text'])

# 7. Running a Similarity Search with Filters

In [None]:
# Index the "course" payload field; the filtered search defined below matches
# on this same field.
client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword" # exact matching on string metadata fields
)

In [None]:
def search_in_course(query, course="rechnerkommunikation-preparation-guide", limit=1):
    """Run a similarity search restricted to one course.

    Args:
        query: Free-text query string.
        course: Value the "course" payload field must equal for a point to be
            considered.
        limit: Maximum number of closest matches to return (default 1).

    Returns:
        The response object from ``client.query_points``; matches are read
        from its ``points`` attribute, each carrying its stored payload.
    """
    # Only points whose "course" payload equals the requested course qualify.
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )

    return client.query_points(
        collection_name=collection_name,
        # Wrapping the text in a Document hands embedding over to the client,
        # which uses the model named by `model_handle`.
        query=models.Document(text=query, model=model_handle),
        query_filter=models.Filter(must=[course_condition]),
        limit=limit,        # top closest matches
        with_payload=True,  # include stored metadata in the results
    )

In [None]:
# Same ad-hoc query as before, restricted to one course; prints the payload text
# of the single closest match.
print(search_in_course("What should be done toward the end of the semester?", "rechnerkommunikation-preparation-guide").points[0].payload['text'])