In [1]:
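# One-time setup: download minsearch.py next to this notebook. Kept commented out
# because the file is already present (re-running saves a duplicate 'minsearch.py.1').
# !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py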

--2024-06-24 12:47:07--  https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py.1’


2024-06-24 12:47:07 (12.7 MB/s) - ‘minsearch.py.1’ saved [3832/3832]



In [1]:
import minsearch
import json
from openai import OpenAI
import os

In [3]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the records
# contain non-ASCII punctuation (curly quotes), and the platform default
# encoding used by open() is not UTF-8 on every system.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course groups into a single list of FAQ records, tagging each
# record with the course it belongs to. A fresh dict is built for every record
# so that docs_raw is left unmodified and the cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [5]:
# Spot-check one flattened record to confirm it carries text/section/question/course
# (index 500 looks like an arbitrary sample — no special meaning is visible here).
documents[500]

{'text': 'Q: “In lesson 2.8 why is y_pred different from y? After all, we trained X_train to get the weights that when multiplied by X_train should give exactly y, or?”\nA: linear regression is a pretty simple model, it neither can nor should fit 100% (nor any other model, as this would be the sign of overfitting). This picture might illustrate some intuition behind this, imagine X is a single feature:\nAs our model is linear, how would you draw a line to fit all the "dots"?\nYou could "fit" all the "dots" on this pic using something like scipy.optimize.curve_fit (non-linear least squares) if you wanted to, but imagine how it would perform on previously unseen data.\nAdded by Andrii Larkin',
 'section': '2. Machine Learning for Regression',
 'question': 'Why linear regression doesn’t provide a “perfect” fit?',
 'course': 'machine-learning-zoomcamp'}

In [6]:
# Free-text fields are searched by relevance; `course` is registered as a keyword
# field, which is what search() filters on.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"],
)

In [18]:
# Fit the minsearch index on the flattened FAQ records; the echoed output shows
# the call returns a minsearch.Index object.
index.fit(documents)

<minsearch.Index at 0x77ed83ab8d60>

In [7]:
# No credentials are passed explicitly. NOTE(review): presumably the SDK reads the
# API key from the environment (e.g. OPENAI_API_KEY) — confirm it is set before running.
client = OpenAI()

In [10]:
# Sample learner question, sent to the model below on its own (no course context).
q = 'Can I enroll for the course if it has already started?'

In [11]:
# Baseline: ask the model the bare question, without any retrieved FAQ context.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role': 'user', 'content': q}],
)

In [12]:
# Display the reply text of the first returned choice.
response.choices[0].message.content

"The ability to enroll in a course that has already started typically depends on the specific policies of the educational institution or platform offering the course. Here are some common scenarios:\n\n1. **Grace Period**: Some institutions offer a grace period during which you can still enroll in a course after it has begun. This period is often clearly specified.\n   \n2. **Self-Paced Courses**: If the course is self-paced, you can usually enroll at any time and complete the work according to your own schedule.\n\n3. **Instructor Permission**: For more structured courses, you might have the option to contact the instructor directly to seek permission to join late.\n\n4. **Platform-Specific Guidelines**: If you're enrolling through an online platform like Coursera, edX, or Udacity, check their specific guidelines. Many of these platforms have flexible enrollment policies.\n\n5. **Missed Content**: Be aware that if you enroll late, you might have to catch up on missed lectures, assignm

In [13]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve FAQ records matching ``query`` from the minsearch index.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to the
            course that was previously hard-coded.
        num_results: Maximum number of records requested from the index.

    Returns:
        The result of ``index.search``; callers in this notebook iterate it as
        records with ``section``, ``question`` and ``text`` keys.
    """
    # Matches in the question are weighted highest; section titles are down-weighted.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict={'course': course},
    )

In [14]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ records.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of records with ``section``, ``question`` and
            ``text`` keys; each becomes one entry of the CONTEXT section.

    Returns:
        The prompt string with surrounding whitespace stripped.
    """
    # Spelled out line by line so the exact bytes (including the trailing spaces
    # before some newlines) are visible rather than hidden in a triple-quoted block.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One entry per record, each terminated by a blank line.
    context = "".join(
        f"section:{record['section']}\nquestion: {record['question']}\nanswer: {record['text']}\n\n"
        for record in search_results
    )

    return template.format(question=query, context=context).strip()

In [15]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` as a single user message and return the model's reply text.

    Args:
        prompt: Full prompt text to send.
        model: Chat model name. Defaults to the model that was previously
            hard-coded, so existing ``llm(prompt)`` calls are unaffected.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{'role': 'user', 'content': prompt}],
    )
    return response.choices[0].message.content

In [19]:

def rag(query):
    """Answer ``query`` via retrieve -> build prompt -> LLM, using the minsearch index."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))


query = "How do I enroll in the course after it has started?"
rag(query)

"Yes, you can still join the course after the start date. Even if you don't register initially, you're still eligible to submit the homework assignments. However, keep in mind that there will be deadlines for turning in the final projects, so it's important not to procrastinate."

#### Indexing with Elasticsearch

In [20]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

In [21]:
# Rebuild the flat list of FAQ records for the Elasticsearch section. Each record
# is copied into a new dict with its course attached, so docs_raw is not mutated
# and re-running the cell always yields the same list.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

documents[4]

{'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
 'section': 'General course-related questions',
 'question': 'Course - What can I do before the course starts?',
 'course': 'data-engineering-zoomcamp'}

In [22]:
# Client for the Elasticsearch node expected at localhost:9200 (plain HTTP, no
# credentials passed); the node must already be running for the cells below.
es_client = Elasticsearch('http://localhost:9200')

In [23]:
# Single-shard, no-replica index. The three free-text fields are analysed as
# `text`; `course` is a `keyword` so it can be matched exactly by a term filter.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = 'course-questions'

# Creating an index that already exists raises an error, which made this cell
# fail on every re-run. Drop any leftover index first (a missing index is ignored)
# so the notebook always starts from an empty index and the indexing loop that
# follows does not produce duplicate documents.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [24]:
# Send the FAQ records to Elasticsearch one request per record; tqdm renders the
# progress bar.
for faq_record in tqdm(documents):
    es_client.index(index=index_name, document=faq_record)

  0%|          | 0/948 [00:00<?, ?it/s]

In [39]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve FAQ records matching ``query`` from the Elasticsearch index.

    Args:
        query: Free-text question to search for.
        course: Exact value the ``course`` field must have. Defaults to the
            course that was previously hard-coded.
        num_results: Maximum number of hits to request (the ``size`` of the search).

    Returns:
        A list with the ``_source`` document of every hit, in the order returned
        by Elasticsearch.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # `^3` boosts matches in the question field threefold.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # The filter clause restricts hits to one course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [42]:

def rag(query):
    """Answer ``query`` via retrieve -> build prompt -> LLM, using Elasticsearch.

    Note: this rebinds the name ``rag``; from this cell on, the earlier
    minsearch-based definition is no longer reachable.
    """
    context_docs = elastic_search(query)
    rendered_prompt = build_prompt(query, context_docs)
    return llm(rendered_prompt)


query = "What will I learn in this course?"
rag(query)

'The CONTEXT provided does not contain specific information about what you will learn in the course. For details about the specific topics and skills you will acquire, please refer to the course syllabus or the introductory materials provided by the course instructors.'