In [1]:
import openai

In [6]:
from openai import OpenAI

In [7]:
client = OpenAI()

OpenAIError: The api_key client option must be set either by passing api_key to the client or by setting the OPENAI_API_KEY environment variable

In [4]:
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{"role": "user", "content": "is it to late to join the course?"}]
)

In [10]:
response.choices[0].message.content

"It depends on the course and the institution offering it. Some courses may have specific enrollment deadlines, while others may allow students to join late with permission from the instructor or department. It's best to contact the course coordinator or admissions office to inquire about late enrollment options."

In [13]:
import minsearch
import json

In [14]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [15]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [16]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [17]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [18]:
q = 'the course has already started, can I still enroll?'

In [19]:
index.fit(documents)

<minsearch.Index at 0x7b7cd5f2ab70>

In [20]:
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{"role": "user", "content": q}]
)

response.choices[0].message.content

"It depends on the policies of the institution offering the course. Some institutions may allow late enrollment with permission from the instructor, while others may have strict enrollment deadlines. It's best to contact the institution or instructor directly to inquire about late enrollment options."

In [22]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-3.5-turbo',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [23]:
query = 'how do I run kafka?'

rag(query)

'To run Kafka, you can follow these steps:\nFor Java Kafka - In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\nFor Python Kafka - Create a virtual environment and run requirements.txt and the python files in that environment. Use the provided installation command to install the \'dlt[duckdb]\' package if necessary. Additionally, to fix permission denied error, run chmod +x build.sh in the appropriate directory. If you encounter the "ModuleNotFoundError: No module named \'kafka.vendor.six.moves\'" error, consider using pip install kafka-python-ng instead.'

In [24]:
from elasticsearch import Elasticsearch

In [25]:
es_client = Elasticsearch('http://localhost:9200')

In [26]:
es_client.info()

ObjectApiResponse({'name': '8419c2d669ee', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'd5vUA0YdQx60iNYGIltqzw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [27]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

In [29]:
index_name = "course-questions"
es_client.indices.create(index=index_name, body= index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [31]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document = doc)

100%|██████████████████████████████████████| 948/948 [00:16<00:00, 56.92it/s]


In [44]:
def elastic_search(query):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    rsponse = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in rsponse['hits']['hits']:
        result_docs.append(hit['_source'])
    return result_docs

In [45]:
elastic_search(query)

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAP

In [46]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [47]:
rag(query)

'To run Kafka, you can use Java commands in the terminal. For example, in the project directory, you can run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java'

In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents_homework = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents_homework.append(doc)

In [2]:
documents_homework[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [3]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
es_client = Elasticsearch('http://localhost:9200')

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions-homework-week-1"

es_client.indices.create(index=index_name, body= index_settings)



BadRequestError: BadRequestError(400, 'resource_already_exists_exception', 'index [course-questions-homework-week-1/KKR0C3RFT7CCoJaxLtWGzA] already exists')

In [5]:
for doc in tqdm(documents_homework):
    es_client.index(index=index_name, document = doc)

100%|██████████████████████████████████████| 948/948 [00:16<00:00, 58.19it/s]


In [18]:
def elastic_search_homework(query):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    rsponse = es_client.search(index=index_name, body=search_query)
    return rsponse 
    #result_docs = []
    #for hit in rsponse['hits']['hits']:
     #   result_docs.append(hit['_source'])
    #return result_docs

In [19]:
prueba = elastic_search_homework('How do I execute a command in a running docker container?')

In [20]:
prueba['hits']['hits'][2]

{'_index': 'course-questions-homework-week-1',
 '_id': 'dY8UtZAB29i-kKywW7WF',
 '_score': 51.134113,
 '_source': {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani",
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from my local machine to docker container?',
  'course': 'machine-learning-zoomcamp'}}

In [65]:
es_client.info()

ObjectApiResponse({'name': '8419c2d669ee', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'd5vUA0YdQx60iNYGIltqzw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [18]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
    
)

In [14]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

def llm(prompt):
    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [19]:
llm("write test passed")

" Test Passed\n\nThis is a simple instruction that signifies the completion of some form of testing, indicating all assertions or checks were correct and no errors are present in the code under evaluation. The message should be concise yet clear to anyone reading it on their end. Here's how you might document such an outcome:\n\n```markdown\n# Test Results\n\n## Status Report for ExampleFunctionTestCase\n- **Test Case**: Check if 'example()' function returns true when given positive integer inputs of 1 and 2.\n\n| Input | Expected Result | Actual Result    | Pass/Fail | Comments              |\n|-------|-----------------|------------------|-----------|-----------------------|\n| [1]   | True            | True             | Pass      | Test passed           |\n| [2]   end_of_table\n\n**Test Summary: All test cases have been executed successfully. No issues detected with the example function in question as of this report's compilation on March 23, 2023. Further testing may be required to

In [12]:
client.info()

AttributeError: 'OpenAI' object has no attribute 'info'