In [26]:
import os

from dotenv import load_dotenv

# Load the keys before anything needs them (presumably the credentials the
# OpenAI client further down relies on — confirm).
load_dotenv()

In [4]:
import requests

# Location of the course FAQ dump that the next cell loads from disk.
url = "https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json"

# Always pass a timeout: without one, requests waits indefinitely on a stalled
# connection and the cell never finishes.
response = requests.get(url, timeout=30)

if response.status_code == 200:
    # Write the body as raw bytes so the file is saved exactly as served.
    with open("documents.json", "wb") as f:
        f.write(response.content)
    print("File downloaded successfully.")
else:
    print("Failed to download the file.")


File downloaded successfully.


In [16]:
import json

# JSON is UTF-8 by specification; name the encoding explicitly so the read does
# not depend on the platform's locale default (the FAQ text contains non-ASCII
# punctuation such as curly quotes).
with open('./documents.json', 'rt', encoding='utf-8') as f_in:
    documents_file = json.load(f_in)

# Flatten the per-course grouping into a single list, tagging every document
# with the name of the course it came from.
documents = []

for course in documents_file:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)


In [17]:
# Spot-check the first flattened record.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [18]:
# Spot-check the second flattened record.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

# Retrieval-Augmented Generation


In [6]:
!curl -O https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py


  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0  3832    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100  3832  100  3832    0     0   5934      0 --:--:-- --:--:-- --:--:--  5959


In [8]:
import minsearch
import json

text_fields=["question", "text", "section"]
Purpose: Specifies the fields in your data that should be treated as text fields.
Text Fields: These are typically fields containing large chunks of text that you want to be searchable. Full-text search capabilities (such as tokenization, stemming, and relevance ranking) are usually applied to these fields.
Specified Fields:
"question": A field that likely contains questions or queries.
"text": A general text field.
"section": Another text field, possibly indicating sections or segments of documents.
When these fields are indexed, the search engine will treat them as full-text fields, making them searchable for keywords and phrases.

keyword_fields=["course"]
Purpose: Specifies the fields in your data that should be treated as keyword fields.
Keyword Fields: These fields are typically used for exact match searches. They might contain categorical data or metadata where you want to filter or match the exact terms rather than performing a full-text search.
Specified Field:
"course": A field that likely contains course identifiers, names, or types.
When this field is indexed, the search engine will treat it as a keyword field, which means it will be used for filtering and exact matching rather than full-text searching.
It is the same as saying:
SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';



In [11]:

# "course" is indexed as a keyword field (exact-match filtering); the three
# free-text fields are indexed for full-text search.
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

In [13]:
# Sample user question, reused as the search query and as the raw LLM prompt.
q = 'Can I still enroll in the course?'

In [19]:
# Build the search index over the flattened FAQ documents.
index.fit(documents)

<minsearch.Index at 0x1bbe7b21d30>

In [22]:
# Give matches in the question field more importance (boost 3.0) and matches
# in the section field less (boost 0.5).
boost = {'question': 3.0, 'section': 0.5}

# Search only within one course and keep the five best hits.
result = index.search(
    query=q,
    num_results=5,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
)
    

In [23]:
# Show the retrieved documents for the sample question.
result

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# LLMs

In [27]:
from openai import OpenAI

# No credentials are passed explicitly — presumably the client picks up the
# API key from the environment populated by load_dotenv(); confirm.
client = OpenAI()

In [28]:
# Baseline: send the bare question to the model with no retrieved context.
# NOTE: this rebinds `response`, which earlier held the HTTP download response.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": q}],
    model='gpt-4o',
)

In [31]:
# Text of the first (index 0) choice in the completion response.
response.choices[0].message.content

"To determine if you can still enroll in a course, you'll need to check with the specific institution or organization offering the course. Here are some steps you can take:\n\n1. **Visit the Website**: Check the official website of the institution or platform offering the course. They often list deadlines and other enrollment details.\n\n2. **Contact the Admissions Office**: If the information isn't clear online, contact the admissions office or the department offering the course. They can provide you with the most up-to-date information.\n\n3. **Check for Late Enrollment Policies**: Some institutions allow for late enrollment under certain conditions. These policies will usually be listed in the course catalog or on the institution's website.\n\n4. **Look for Online or Self-Paced Options**: If traditional enrollment deadlines have passed, you might find online or self-paced versions of the course that have more flexible enrollment options.\n\n5. **Consider Next Term or Session**: If y

In [None]:
# Prompt skeleton with {question} and {context} placeholders. It instructs the
# model to answer using only the supplied CONTEXT.
# NOTE(review): the code that fills the placeholders is not visible in this
# part of the notebook — presumably str.format(); confirm.
prompt_template = """
You are a course teaching assistant.
Use only the facts from the CONTEXT when answering the QUESTION

QUESTION:{question}

CONTEXT:{context}


"""

In [33]:
# Render every retrieved document as a "section / question / answer" stanza
# followed by a blank line, and concatenate them into one context string.
# str.join builds the result in a single pass instead of re-copying the growing
# string on every iteration; an empty `result` still yields "".
context = "".join(
    f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    for doc in result
)


In [34]:
# print() renders the embedded newlines so the assembled context is readable.
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.

section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start wit