In [1]:
from qdrant_client import QdrantClient, models
from fastembed import TextEmbedding

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Client for the Qdrant server at localhost:6333; every collection call below goes through it.
qd_client = QdrantClient('http://localhost:6333')

In [3]:
import json

# JSON is UTF-8 by specification; pass the encoding explicitly so the load does
# not depend on the platform's locale default (the FAQ text contains non-ASCII
# punctuation such as curly quotes).
with open('documents.json', 'r', encoding='utf-8') as documents_file:
    documents_raw = json.load(documents_file)

# Flatten the per-course groups into one list, tagging each FAQ entry with the
# course it belongs to. A new dict is built per entry, so documents_raw is left
# exactly as it was loaded.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in documents_raw
    for doc in course_dict['documents']
]
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# Embedding model handle passed to models.Document when indexing and when querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"
# Name of the Qdrant collection used by every cell in this notebook.
collection_name = "zoomcamp-faq"
# Vector size the collection is created with; presumably the output size of
# the model named in model_handle — TODO confirm against the model card.
EMBEDDING_DIMENSIONALITY = 512

In [8]:
# Remove the collection (if any) so that the create_collection call that follows starts from scratch.
qd_client.delete_collection(collection_name=collection_name)

False

In [9]:
# Create the collection: vectors have EMBEDDING_DIMENSIONALITY components and
# are compared with cosine distance.
qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size=EMBEDDING_DIMENSIONALITY,
        distance=models.Distance.COSINE
    )
)

True

In [5]:
# One point per FAQ entry: the list position is the point id, the question and
# answer text joined by a space is what gets embedded (via model_handle), and
# the whole entry is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=entry['question'] + ' ' + entry['text'],
            model=model_handle,
        ),
        payload=entry,
    )
    for position, entry in enumerate(documents)
]

In [10]:
# Write all prepared points to the collection with a single upsert call.
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [47]:
from google import genai
from google.genai import types

# genai.Client() is constructed without arguments: the client gets the API key
# from the environment variable `GEMINI_API_KEY`, so no secret lives in the notebook.
genai_client = genai.Client()

In [25]:
# Index the `course` payload field as a keyword; vector_search filters on this field.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=6, status=<UpdateStatus.COMPLETED: 'completed'>)

In [37]:
# Example query parameters.
# NOTE(review): the calls visible in this notebook pass literals instead of
# these names — confirm whether they are still needed.
question = 'I just discovered this course. Can I still join it?'
limit = 5
course = 'data-engineering-zoomcamp'

In [40]:
def vector_search(question, limit=1, course='data-engineering-zoomcamp'):
    """Return the payloads of the FAQ entries closest to `question`.

    Args:
        question: Query text. It is wrapped in models.Document with
            model_handle, the same handle used when the points were built.
        limit: Maximum number of matches to return.
        course: Only points whose `course` payload equals this value are
            considered.

    Returns:
        A list with the payload of each returned point, in the order the
        query returned them.
    """
    course_condition = models.FieldCondition(
        key="course",
        match=models.MatchValue(value=course),
    )
    response = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(text=question, model=model_handle),
        query_filter=models.Filter(must=[course_condition]),
        limit=limit,
        with_payload=True,  # include the stored metadata with every hit
    )
    return [hit.payload for hit in response.points]

In [42]:
# Check the retrieval step on its own: top five matches for a sample question
# (the default course filter of vector_search applies).
results = vector_search(question='How do I run Kafka?', limit=5)
results

[{'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'For example, when running JsonConsumer.java, got:\nConsuming form kafka started\nRESULTS:::0\nRESULTS:::0\nRESULTS:::0\nOr when running JsonProducer.java, got:\nException in thread "main" java.util.concurrent.ExecutionException: org.apache.kafka.common.errors.SaslAuthenticationException: Authentication failed\nSolution:\nMake sure in the scripts in src/main/java/org/example/ that you are running (e.g. JsonConsumer.java, JsonProducer.java), the StreamsConfig.BOOTSTRAP_SERVERS_CONFIG is the correct server url (e.g. europe-west3 from example vs europe-west2)\nMake sure cluster key and secrets are updated in src/main/java/org/example/Secrets.java (KAFKA_CLUSTER_KEY and KA

In [48]:
def build_prompt(query, search_results):
    """Build the LLM prompt for `query` from the retrieved FAQ entries.

    Args:
        query: The user's question; inserted after "QUESTION:".
        search_results: Iterable of dicts that each provide the 'section',
            'question' and 'text' keys.

    Returns:
        The filled-in prompt string. Entries in the CONTEXT part are
        separated by a blank line; with no entries the CONTEXT part is empty.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT:
{context}
    """.strip()

    # Join with a blank line. Plain concatenation glued one entry's answer
    # directly onto the next entry's "section:" label, blurring the boundaries
    # between FAQ entries in the prompt.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context)

In [49]:
def llm(prompt):
    """Send `prompt` to the Gemini model as one user message.

    Args:
        prompt: Complete prompt text.

    Returns:
        The `text` attribute of the generate_content response.
    """
    user_message = types.Content(
        role='user',
        parts=[types.Part.from_text(text=prompt)],
    )
    return genai_client.models.generate_content(
        model="gemini-2.5-flash",
        contents=user_message,
    ).text

In [52]:
def rag(query):
    """Answer `query` end to end: retrieve, build the prompt, ask the LLM.

    Args:
        query: The user's question.

    Returns:
        Whatever llm() returns for the prompt built from the five closest
        FAQ entries.
    """
    retrieved = vector_search(question=query, limit=5)
    return llm(build_prompt(query, retrieved))

In [54]:
# Run the full pipeline on the same sample question used for the retrieval check.
rag('How do I run Kafka?')

'To run Kafka:\n\n1.  **Ensure Kafka Broker is running**: If you encounter a `NoBrokersAvailable` error, it likely means your Kafka broker Docker container is not working. Use `docker ps` to confirm, then navigate to the docker compose yaml file folder and run `docker compose up -d` to start all instances.\n2.  **Run Java Kafka client applications (e.g., producer/consumer)**: In the project directory, run:\n    `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java`\n    (Replace `<jar_name>` with the appropriate jar name, and `JsonProducer.java` with the specific script you want to run).\n3.  **Run Python Kafka client applications (e.g., producer.py)**: Create and activate a virtual environment, then run your Python files within it.\n    *   To create and install packages (run once):\n        `python -m venv env`\n        `source env/bin/activate` (or `env/Scripts/activate` on Windows)\n        `pip install -r ../requirements.txt`\n    *   To 