In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py --no-check-certificate

--2025-06-17 13:08:14--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/refs/heads/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.109.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
  Unable to locally verify the issuer's authority.
HTTP request sent, awaiting response... 200 OK
Length: 4073 (4.0K) [text/plain]
Saving to: ‘minsearch.py.4’


2025-06-17 13:08:14 (5.12 MB/s) - ‘minsearch.py.4’ saved [4073/4073]



In [2]:
import json
from mistralai import Mistral
from mistralai.models import UserMessage
import requests 
import minsearch



In [3]:
import os
from dotenv import load_dotenv


In [4]:
from qdrant_client import QdrantClient, models
import requests
from fastembed import TextEmbedding
import json


In [6]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [7]:
documents[5]

{'text': "There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:\nData-Engineering (Jan - Apr)\nMLOps (May - Aug)\nMachine Learning (Sep - Jan)\nThere's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.\nThey follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.",
 'section': 'General course-related questions',
 'question': 'Course - how many Zoomcamps in a year?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
load_dotenv()

True

In [9]:
api_key = os.getenv("API_KEY")

In [10]:
mis_client = Mistral(api_key = api_key, )

In [11]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x735ed6c97920>

In [12]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [13]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [14]:
def llm(prompt):
    response = mis_client.chat.complete(
        model = "mistral-medium-latest", 
        messages = [UserMessage(content=prompt)]
    )
    return response.choices[0].message.content


In [15]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [16]:
rag('how do I run kafka?')

"To run Kafka, you can follow these steps based on the context provided:\n\n1. **For Java Kafka**:\n   In the project directory, run the following command in the terminal:\n   ```bash\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **For Python Kafka**:\n   - Create and activate a virtual environment:\n     ```bash\n     python -m venv env\n     source env/bin/activate  # On Windows, use `env\\Scripts\\activate`\n     pip install -r ../requirements.txt\n     ```\n   - Ensure Docker images are up and running before executing the Python files.\n   - If you encounter the error `ModuleNotFoundError: No module named 'kafka.vendor.six.moves'`, use the following command to install the Kafka Python package:\n     ```bash\n     pip install kafka-python-ng\n     ```\n\n3. **For Permission Issues**:\n   If you encounter a `Permission denied` error when running `./build.sh`, execute:\n   ```bash\n   chmod +x build.sh\n   ```"

# RAG with Vector Search

In [17]:
#connecting to local Qdrant instance
qd_client = QdrantClient(url="http://localhost:6333")

In [18]:
EMBEDDING_DIMENSIONALITY = 512
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [19]:
collection_name = "zoomcamp-faq"


In [20]:
# Delete a connection 
qd_client.delete_collection(collection_name=collection_name)

True

In [21]:
#Create the collection

qd_client.create_collection(
    collection_name=collection_name,
    vectors_config=models.VectorParams(
        size = EMBEDDING_DIMENSIONALITY, 
        distance=models.Distance.COSINE 
    )
)


True

In [22]:
qd_client.create_payload_index(
    collection_name=collection_name, 
    field_name="course",
    field_schema="keyword" )

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [23]:
points = []

for i, doc in enumerate(documents):
    text=doc['question'] + '' + doc['text']  # for the index
    vector=models.Document(text=text, model=model_handle)
    point = models.PointStruct(
        id=i,
        vector=vector,
        payload=doc
    )
    points.append(point)


In [24]:
points[20]

PointStruct(id=20, vector=Document(text='Environment - Should I use my local machine, GCP, or GitHub Codespaces for my environment?You can set it up on your laptop or PC if you prefer to work locally from your laptop or PC.\nYou might face some challenges, especially for Windows users. If you face cnd2\nIf you prefer to work on the local machine, you may start with the week 1 Introduction to Docker and follow through.\nHowever, if you prefer to set up a virtual machine, you may start with these first:\nUsing GitHub Codespaces\nSetting up the environment on a cloudV Mcodespace\nI decided to work on a virtual machine because I have different laptops & PCs for my home & office, so I can work on this boot camp virtually anywhere.', model='jinaai/jina-embeddings-v2-small-en', options=None), payload={'text': 'You can set it up on your laptop or PC if you prefer to work locally from your laptop or PC.\nYou might face some challenges, especially for Windows users. If you face cnd2\nIf you pref

In [25]:
qd_client.upsert(
    collection_name=collection_name,
    points=points
)


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [26]:
question= 'I just discovered the course. Can I still join it?'

In [27]:
def vector_search(question, course, limit):

    course = course
    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document( 
            text=question,
            model=model_handle
        ),
        query_filter=models.Filter(
            must=[
                models.FieldCondition(
                    key="course",
                    match=models.MatchValue(value=course)
                )
            ]
        ),
        limit=limit, 
        with_payload=True 
    )
    
    results = []
    
    for point in query_points.points:
        results.append(point.payload)

    return results 

In [28]:
vector_search('how do I run kafka', 'data-engineering-zoomcamp', 5)

[{'text': 'For example, when running JsonConsumer.java, got:\nConsuming form kafka started\nRESULTS:::0\nRESULTS:::0\nRESULTS:::0\nOr when running JsonProducer.java, got:\nException in thread "main" java.util.concurrent.ExecutionException: org.apache.kafka.common.errors.SaslAuthenticationException: Authentication failed\nSolution:\nMake sure in the scripts in src/main/java/org/example/ that you are running (e.g. JsonConsumer.java, JsonProducer.java), the StreamsConfig.BOOTSTRAP_SERVERS_CONFIG is the correct server url (e.g. europe-west3 from example vs europe-west2)\nMake sure cluster key and secrets are updated in src/main/java/org/example/Secrets.java (KAFKA_CLUSTER_KEY and KAFKA_CLUSTER_SECRET)',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: When running the producer/consumer/etc java scripts, no results retrieved or no message sent',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SN

In [29]:
def rag(query):
    search_results = vector_search(query, 'data-engineering-zoomcamp', 5)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [30]:
rag('how do I run kafka?')

'To run Kafka, follow these steps based on the provided context:\n\n1. **For Java Kafka scripts**:\n   - Ensure the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` in your script (e.g., `JsonProducer.java`, `JsonConsumer.java`) has the correct server URL.\n   - Update the cluster key and secrets in `src/main/java/org/example/Secrets.java` (`KAFKA_CLUSTER_KEY` and `KAFKA_CLUSTER_SECRET`).\n   - Run the script from the project directory using:\n     ```bash\n     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n     ```\n\n2. **For Python Kafka scripts**:\n   - Create and activate a virtual environment:\n     ```bash\n     python -m venv env\n     source env/bin/activate  # Use `env/Scripts/activate` on Windows\n     pip install -r ../requirements.txt\n     ```\n   - Ensure Docker containers are running (`docker ps` to check, `docker compose up -d` to start).\n\n3. **If encountering broker issues**:\n   - Verify Kafka broker Docker containers are 