In [None]:
# import requests 

# docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# docs_response = requests.get(docs_url)
# documents_raw = docs_response.json()

# documents = []

# for course in documents_raw:
#     course_name = course['course']

#     for doc in course['documents']:
#         doc['course'] = course_name
#         documents.append(doc)

In [29]:
import requests
import os
import json
from groq import Groq

In [2]:
# Load the FAQ dump. The encoding is pinned to UTF-8 instead of being left to
# the platform's locale default, so the file decodes the same way everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

# Flatten the per-course grouping into a single list, tagging every document
# with the name of the course it came from.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [3]:
# Spot-check one flattened record, including the 'course' key added above.
documents[2]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# import minsearch

# index = minsearch.Index(
#     text_fields=["question", "text", "section"],
#     keyword_fields=["course"]
# )

# index.fit(documents)

In [5]:
# from openai import OpenAI

# openai_client = OpenAI()

In [6]:
# def search(query):
#     boost = {'question': 3.0, 'section': 0.5}

#     results = index.search(
#         query=query,
#         filter_dict={'course': 'data-engineering-zoomcamp'},
#         boost_dict=boost,
#         num_results=5
#     )

#     return results

In [7]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt for `query` from retrieved FAQ entries.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each providing the keys
            'section', 'question' and 'text'.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.

    Raises:
        KeyError: If an entry lacks one of the required keys.
    """
    # Same text as a stripped triple-quoted template; note the trailing
    # space after "CONTEXT:" which is part of the prompt.
    instructions = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One blank-line-terminated entry per retrieved document.
    entries = (
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    filled = instructions.format(question=query, context="".join(entries))
    return filled.strip()

In [33]:
# Groq API client. The key is read from the GROQ_API_KEY environment variable
# (os.environ.get yields None when it is unset) rather than being hardcoded.
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

In [34]:
def llm(prompt):
    """Send `prompt` as a single user message to the Groq chat API.

    Args:
        prompt: Full prompt text to submit.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}

    completion = client.chat.completions.create(
        messages=[user_message],
        model="llama-3.3-70b-versatile",
        max_tokens=1000,  # cap on the length of the generated answer
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [8]:
# def llm(prompt):
#     response = openai_client.chat.completions.create(
#         model='gpt-4o-mini',
#         messages=[{"role": "user", "content": prompt}]
#     )
    
#     return response.choices[0].message.content

In [None]:
# def rag(query):
#     search_results = search(query)
#     prompt = build_prompt(query, search_results)
#     answer = llm(prompt)
#     return answer

In [10]:
#rag('how do I run kafka?')

In [11]:
#rag('the course has already started, can I still enroll?')

## RAG with Vector Search

In [17]:
from tqdm.auto import tqdm

In [None]:
from qdrant_client import QdrantClient, models

In [19]:
# Qdrant client pointed at http://localhost:6333; a Qdrant instance is
# expected to be reachable at that address before the cells below are run.
qd_client = QdrantClient("http://localhost:6333")

In [20]:
# Vector size used when the collection is created. Presumably the output
# dimensionality of the embedding model named below — confirm against the
# model's documentation before changing either value.
EMBEDDING_DIMENSIONALITY = 512
# Embedding model handle passed to models.Document for indexing and querying.
model_handle = "jinaai/jina-embeddings-v2-small-en"

In [21]:
# Name of the Qdrant collection used by every collection-level call below.
collection_name = "zoomcamp-faq"

In [22]:
# Drop the collection before it is re-created below so the notebook can be
# re-run; the recorded output was False on the run captured here.
qd_client.delete_collection(collection_name=collection_name)

False

In [23]:
# Vector space configuration: cosine distance over vectors of
# EMBEDDING_DIMENSIONALITY components.
vector_params = models.VectorParams(
    distance=models.Distance.COSINE,
    size=EMBEDDING_DIMENSIONALITY,
)

qd_client.create_collection(
    vectors_config=vector_params,
    collection_name=collection_name,
)

True

In [24]:
# Index the "course" payload field as a keyword; vector_search filters on
# this field with an exact-match condition.
qd_client.create_payload_index(
    collection_name=collection_name,
    field_name="course",
    field_schema="keyword"
)

UpdateResult(operation_id=1, status=<UpdateStatus.COMPLETED: 'completed'>)

In [25]:
# One point per FAQ document. The point id is the document's position in
# `documents`, the embedded text is "<question> <answer text>", and the whole
# document dict is stored as the payload.
points = [
    models.PointStruct(
        id=position,
        vector=models.Document(
            text=' '.join((faq['question'], faq['text'])),
            model=model_handle,
        ),
        payload=faq,
    )
    for position, faq in enumerate(documents)
]

In [26]:
# Send all points to the collection in a single call. The recorded output
# shows files being fetched at this step — presumably the embedding model
# being downloaded on first use (TODO confirm).
qd_client.upsert(
    collection_name=collection_name,
    points=points
)

Fetching 5 files: 100%|██████████| 5/5 [00:02<00:00,  2.31it/s]


UpdateResult(operation_id=2, status=<UpdateStatus.COMPLETED: 'completed'>)

In [27]:
# Sample user question for trying out the search interactively.
question = 'I just discovered the course. Can I still join it?'

In [28]:
def vector_search(question, course='data-engineering-zoomcamp', limit=5):
    """Retrieve the FAQ entries closest to `question` within one course.

    Args:
        question: Free-text query; wrapped in models.Document together with
            `model_handle` so it is embedded with the same model as the
            indexed documents.
        course: Value the "course" payload field must match exactly.
            Defaults to the previously hard-coded course.
        limit: Maximum number of hits to return. Defaults to the previously
            hard-coded value of 5.

    Returns:
        A list with the payload of each matching point, in the order the
        points are returned by the query.
    """
    # Kept on purpose: the notebook output uses this line to show which
    # retrieval path handled the query.
    print('vector_search is used')

    course_filter = models.Filter(
        must=[
            models.FieldCondition(
                key="course",
                match=models.MatchValue(value=course)
            )
        ]
    )

    query_points = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=question,
            model=model_handle
        ),
        query_filter=course_filter,
        limit=limit,
        with_payload=True  # payloads are what gets returned, so request them
    )

    return [point.payload for point in query_points.points]

In [31]:
def rag(query):
    """Answer `query` by retrieving FAQ entries and passing them to the LLM.

    The steps are: vector_search for retrieval, build_prompt to turn the
    hits into a prompt, then llm to generate the answer.

    Args:
        query: The user's question.

    Returns:
        Whatever llm returns for the assembled prompt.
    """
    retrieved_docs = vector_search(query)
    return llm(build_prompt(query, retrieved_docs))

In [36]:
# End-to-end check of the vector-search RAG pipeline on a sample question.
print(rag('how do I run kafka?'))

vector_search is used
To run Kafka, you need to follow these steps:

1. Make sure your Kafka broker docker container is running. If not, use `docker ps` to confirm and then run `docker compose up -d` in the docker compose yaml file folder to start all instances.
2. Ensure you are using the correct server URL in the `StreamsConfig.BOOTSTRAP_SERVERS_CONFIG` and that your cluster key and secrets are updated in `Secrets.java` (for Java).
3. For Python, create a virtual environment, run `requirements.txt`, and then activate the virtual environment before running your Python files.
4. In the project directory, you can run Kafka-related Java scripts (like `JsonProducer.java` or `JsonConsumer.java`) using the command: `java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java` (replace `<jar_name>` with your actual jar name). 

Make sure to address any potential errors, such as `NoBrokersAvailable` or authentication issues, by checking your Kafka broker st

In [37]:
def search(query, limit=1):
    """Run an unfiltered vector search over the FAQ collection.

    Args:
        query: Free-text query; wrapped in models.Document together with
            `model_handle` so the text is embedded with that model.
        limit: Maximum number of closest matches to return.

    Returns:
        The response object from query_points; the matched points are
        available under its `.points` attribute.
    """
    # Must be the Qdrant client: `client` in this notebook is the Groq API
    # client, which has no query_points method.
    results = qd_client.query_points(
        collection_name=collection_name,
        query=models.Document(
            text=query,
            model=model_handle
        ),
        limit=limit,  # top closest matches
        with_payload=True  # include the stored document in each hit
    )

    return results