In [2]:
%pip install python-dotenv

Note: you may need to restart the kernel to use updated packages.


In [3]:
from dotenv import load_dotenv

In [4]:
load_dotenv()

True

In [5]:
from openai import OpenAI

openai_client = OpenAI()

In [6]:
def llm(user_prompt, instructions=None, model="gpt-4o-mini"):
    """Send one user prompt to the model and return the reply text.

    Args:
        user_prompt: Content of the "user" message.
        instructions: Optional content for a leading "system" message.
            Falsy values (``None``, ``""``) mean no system message is sent.
        model: Model name forwarded to ``openai_client.responses.create``.

    Returns:
        The ``output_text`` attribute of the response object.
    """
    user_turn = {"role": "user", "content": user_prompt}

    if instructions:
        system_turn = {"role": "system", "content": instructions}
        conversation = [system_turn, user_turn]
    else:
        conversation = [user_turn]

    reply = openai_client.responses.create(model=model, input=conversation)
    return reply.output_text

In [7]:
llm('When does the course start?')

"Could you please specify which course you're referring to?"

In [8]:
def rag(question):
    """Answer ``question`` by retrieving context and asking the LLM.

    Pipeline: ``search`` -> ``build_prompt`` -> ``llm``. The helpers
    ``search`` and ``build_prompt`` and the ``instructions`` string are
    defined in later cells and must exist before this function is called.

    Returns:
        Whatever ``llm`` returns for the assembled prompt.
    """
    search_results = search(question)
    user_prompt = build_prompt(question, search_results)
    # Return the answer so callers receive it instead of an implicit None.
    return llm(user_prompt, instructions=instructions)
    

In [9]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# Bound the wait and surface HTTP errors explicitly, so a hung connection or
# an error page fails here rather than as an obscure JSON decoding error.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list of documents, tagging each
# document with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [10]:
documents[11]

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [11]:
len(documents)

948

In [12]:
!uv add minsearch

[2mResolved [1m151 packages[0m [2min 2ms[0m[0m
[2mAudited [1m131 packages[0m [2min 418ms[0m[0m


In [13]:
from minsearch import Index

In [14]:
# Text index over the FAQ documents. "question", "text" and "section" are the
# text fields (search() boosts "question" and "section"); "course" is a keyword
# field, which search() uses in its filter_dict.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Left as the cell's last expression, so the notebook displays its return value.
index.fit(documents)

<minsearch.minsearch.Index at 0x1e28c1bb620>

In [15]:
def search(question):
    """Query the text ``index`` for ``question``.

    The "question" field is boosted by 3.0 and "section" by 0.3, results are
    filtered to the 'data-engineering-zoomcamp' course, and five results are
    requested.
    """
    field_boosts = {'question': 3.0, 'section': 0.3}
    course_filter = {'course': 'data-engineering-zoomcamp'}

    return index.search(
        question,
        boost_dict=field_boosts,
        filter_dict=course_filter,
        num_results=5,
    )

In [16]:
question = 'I just discovered the course, can I join now?'

In [17]:
search_results = search(question)

In [18]:
# Instruction text that rag() passes to llm() via its `instructions` argument.
# .strip() removes the leading/trailing newlines introduced by the triple quotes.
instructions = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.
""".strip()

# User-prompt skeleton; build_prompt() fills the {question} and {context}
# placeholders with str.format().
prompt_template = """
<QUESTION>
{question}
</QUESTION>

<CONTEXT>
{context}
</CONTEXT>
""".strip()




In [19]:
import json

In [20]:
def build_prompt(question, search_results):
    """Render ``prompt_template`` for one question.

    ``search_results`` is serialized with ``json.dumps`` and substituted for
    ``{context}``; ``question`` is substituted for ``{question}``.
    """
    context_json = json.dumps(search_results)
    filled_prompt = prompt_template.format(context=context_json, question=question)
    return filled_prompt

In [21]:
def rag(question):
    """Answer ``question`` using results from ``search`` as context.

    The search results are turned into a prompt by ``build_prompt`` and sent
    to ``llm`` together with the module-level ``instructions``.
    """
    retrieved = search(question)
    return llm(
        build_prompt(question, retrieved),
        instructions=instructions,
    )

In [22]:
rag(question)

'Yes, you can still join the course even after it has started, and you are eligible to submit the homework. However, be aware that there will be deadlines for turning in the final projects, so it’s best not to leave everything for the last minute.'

In [23]:
!uv add sentence-transformers

[2mResolved [1m151 packages[0m [2min 2ms[0m[0m
[2mAudited [1m131 packages[0m [2min 80ms[0m[0m


In [24]:
from sentence_transformers import SentenceTransformer
embedding_model = SentenceTransformer('multi-qa-distilbert-cos-v1')

In [25]:
# Two differently worded questions and one candidate answer. Each text is
# wrapped in its own one-element list, so encode() receives a list rather
# than a bare string.
docs = [
        ["I just discovered the course, can I still join?"],
        ['I just found out about this program. Can I still enroll?'],
        ["you can join the course at any point of time"]
]

# One encode() call per entry; the results are unwrapped with [0] in a later cell.
vectors = []
for d in docs:
    v = embedding_model.encode(d)
    vectors.append(v)

In [26]:
q1, q2, d = vectors

In [27]:
# Each entry of `vectors` came from encoding a one-element list, so take
# element 0 to get that single text's embedding
# (presumably a 1-D vector — TODO confirm).
q1 = q1[0]
q2 = q2[0]
d = d[0]

In [28]:
q1.dot(q2)

np.float32(0.6203526)

In [29]:
q1.dot(d)

np.float32(0.7205938)

In [30]:
q2.dot(d)

np.float32(0.48765683)

In [31]:
!uv add tqdm

[2mResolved [1m151 packages[0m [2min 2ms[0m[0m
[2mAudited [1m131 packages[0m [2min 79ms[0m[0m


In [32]:
from tqdm.auto import tqdm

In [33]:
d = documents[11]

In [34]:
d

{'text': "No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running.",
 'section': 'General course-related questions',
 'question': 'Certificate - Can I follow the course in a self-paced mode and get a certificate?',
 'course': 'data-engineering-zoomcamp'}

In [35]:
text = d['question'] + ' ' + d['text']
text

"Certificate - Can I follow the course in a self-paced mode and get a certificate? No, you can only get a certificate if you finish the course with a “live” cohort. We don't award certificates for the self-paced mode. The reason is you need to peer-review capstone(s) after submitting a project. You can only peer-review projects at the time the course is running."

In [36]:
import numpy as np

In [37]:


# Embed every FAQ entry: the question and answer text are joined with a space
# and encoded one document at a time, with tqdm reporting progress.
# NOTE: the loop reuses the names `d`, `text` and `v`, overwriting the values
# assigned to them in earlier cells.
embeddings = []

for d in tqdm(documents):
    text = d['question'] + ' ' + d['text']
    v = embedding_model.encode(text)
    embeddings.append(v)

# Convert the list of per-document vectors into one array; entry i corresponds
# to documents[i] because the loop follows the order of `documents`.
embeddings = np.array(embeddings)

  0%|          | 0/948 [00:00<?, ?it/s]

In [38]:
embeddings.shape

(948, 768)

In [39]:
from minsearch import VectorSearch

In [40]:
# Vector index built from the document embeddings and the documents themselves.
# "course" is declared as a keyword field; vector_search() filters on it
# through filter_dict.
vindex = VectorSearch(keyword_fields=['course'])
# Left as the cell's last expression, so the notebook displays its return value.
vindex.fit(embeddings, documents)

<minsearch.vector.VectorSearch at 0x1e2911fe270>

In [41]:
vindex.search(q1, filter_dict={'course': 'data-engineering-zoomcamp'}, num_results=5)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
  'section': 'General course-related questions',
  'question': 'How can we contribute to the course?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  's

In [44]:
def vector_search(question):
    """Embed ``question`` and query the vector index ``vindex`` with it.

    Results are filtered to the 'data-engineering-zoomcamp' course and five
    results are requested.
    """
    query_vector = embedding_model.encode(question)
    course_filter = {'course': 'data-engineering-zoomcamp'}
    return vindex.search(query_vector, filter_dict=course_filter, num_results=5)

In [45]:
question = 'I just found out about this program. Can I still enrol?'
vector_search(question)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

In [46]:
def rag(question):
    """Answer ``question`` using results from ``vector_search`` as context.

    The retrieved documents are turned into a prompt by ``build_prompt`` and
    sent to ``llm`` together with the module-level ``instructions``.
    """
    retrieved = vector_search(question)
    prompt = build_prompt(question, retrieved)
    answer = llm(prompt, instructions=instructions)
    return answer

In [47]:
rag(question)

"Yes, you can join the course even if you missed the registration date. You are still eligible to submit homeworks. However, keep in mind that there will be deadlines for turning in final projects, so it's best not to delay your work."

In [None]:
def hybrid_search(question):
    """Combine text-index and vector-index results for ``question``.

    Runs ``search`` and ``vector_search`` and returns their results as one
    list: text-index hits first, then vector-index hits. A document returned
    by both searches is kept only once, at its first position, so the same
    entry is not repeated in the context built from this list.
    """
    r1 = search(question)
    r2 = vector_search(question)

    merged = []
    for doc in r1 + r2:
        # Results are dicts (unhashable), so compare by value. Each search
        # requests only five results, so the linear membership test is cheap.
        if doc not in merged:
            merged.append(doc)
    return merged