In [3]:
from qdrant_client import QdrantClient, models

In [18]:
client = QdrantClient(
    url="http://localhost:6333")

In [5]:
import requests

docs_url = "https://raw.githubusercontent.com/alexeygrigorev/llm-rag-workshop/refs/heads/main/notebooks/documents.json"
doc_response = requests.get(docs_url)
documents_raw = doc_response.json()



In [None]:
from fastembed import TextEmbedding

list_of_all_models = TextEmbedding.list_supported_models()
print("Available models:")
for model in list_of_all_models:
    print(f"- {model}")

In [8]:
EMBEDDING_DIMENSION = 512

In [None]:
for model in list_of_all_models:
    if model['dim'] == EMBEDDING_DIMENSION:
       print("Models with dimension 512:")
       print(f"- {model}")

In [None]:
# embedding_model = TextEmbedding(model='jinaai/jina-embeddings-v2-small-en')

Fetching 5 files: 100%|██████████| 5/5 [00:00<00:00,  5.12it/s]


In [14]:
model_handle = "jinaai/jina-embeddings-v2-small-en" # we will use this handle later in point creation.

In [None]:
# this is the qdrant collection, i.e. imagine like it is a vectorstore.
collection_name = "zoomcamp_rag"

client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSION,
        distance=models.Distance.COSINE
    )
)

# right now the collection is empty, so we can insert points..

True

In [None]:
# we will be iterating through each course and each document in the course and only use the 'text' i.e. answer to make it into a point.
# we also store additional information in the payload such as text, course and section.
points = []
id= 0

for course in documents_raw:
    for documents in course['documents']:
        text = documents['text']
        point = models.PointStruct(
            id=id,
            vector=models.Document(text=text, model=model_handle),
            payload={
                "text":documents['text'],
                "course":course['course'],
                "section":documents['section']
            }
        )
        points.append(point)

        id += 1

In [23]:
# we now have all the point and we need to insert these into the collection.

client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|██████████| 5/5 [00:02<00:00,  2.17it/s]


UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
# now that we have our collection ready, it is time to query it.
# we define a small function for it.

def search(query, limit=1):
    result = client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle
        ),
        limit=limit,
        with_payload=True
    )

    return result

In [None]:
res = search("what if i submit my homework late?", limit=5)
for point in res.points:
    print(f"similarity score:{point.score}")
    print(point.payload['text'])
    print()