In [1]:
import openai
from openai import OpenAI
import os

# 1.1 GPT API Quick Start

In [2]:
client = OpenAI(api_key="")

In [None]:
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "system", "content": "You are a poetic assistant, skilled in explaining complex programming concepts with creative flair."},
        {"role": "user", "content": "Compose a poem that explains the concept of recursion in programming."}
    ]
)

print(completion.choices[0].message)

# 1.3 - Retrieval and Search (Local Search Engine)

In [3]:
import minsearch
import json

import requests

In [4]:
# 使用documents.json文件中的文档进行搜索

# with open('documents.json', 'rt') as f_in:
#     docs_raw = json.load(f_in)


In [5]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [6]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [7]:
print(documents[0].keys())

dict_keys(['text', 'section', 'question', 'course'])


In [8]:
# Create an instance of the Index class, specifying the text and keyword fields.
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course']
)

q1 = 'the course has already started, can I still enroll?'

index.fit(documents) # 训练模型，传入文档，其实就是把文档的内容放到模型中，以便后续搜索

# 这个boost的作用是，如果搜索的关键词在question字段中，那么这个字段的权重是3.0，如果在section字段中，那么这个字段的权重是0.5
# 也就意味着 question字段的权重是section字段的6倍
boost = {
    "question": 3.0,
    'section':0.5,
}

result = index.search(
    query=q1,
    boost_dict=boost,
    num_results=6 # 返回多少个结果
)

result

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the cour

在上面的结果里，每一个回答都和q1的问题有相关性。

至少在结果中，前三个是基本都在正确回答q1的问题的。后面的三个虽然有关，但是不是很多。

In [9]:
# 完整版，这里天街了一个filter_dict，用来过滤搜索结果，只返回course字段为'data-engineering-zoomcamp'的结果
minsearch_index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

query = 'the course has already started, can I still enroll?'
num_result = 3

minsearch_index.fit(documents)

boost_dict = {"question": 3.0, "section": 0.5}
filter_dict = {"course": "data-engineering-zoomcamp"}

filtered_boosted_results = minsearch_index.search(
    query=query,
    filter_dict=filter_dict,
    boost_dict=boost_dict,
    num_results=num_result
)

filtered_boosted_results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# 1.4 GPT 生成回答

In [10]:
# 准备一个模板，用于生成根据给定问题和上下文生成回答的提示，并将这个提示输出

prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: 
{question}

CONTEXT: 
{context} 
""".strip()

# .strip() 方法在这里的作用是移除字符串首尾的空白字符，包括空格、制表符和换行符

In [11]:
# 列表包含了一些常见课程相关问题的字典，每个字典包含一个问题（question）、回答（text）、部分（section）和课程名（course）
result 

[{'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.',
  'section': 'General course-related questions',
  'question': 'The course has already started. Can I still join it?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the cour

In [12]:
context = ""

# 遍历 result 中的每个字典，将其 section、question 和 text 信息添加到 context 字符串中
for doc in result:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

print(context)

section: General course-related questions
question: The course has already started. Can I still join it?
answer: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you c

In [13]:
q1

'the course has already started, can I still enroll?'

In [14]:
# 生成并打印提示
# 该作用是生成最终的提示字符串
# q1 是要回答的问题，context 是从 result 列表中构建的上下文
prompt = prompt_template.format(question=q1, context=context).strip()

print(prompt_template.format(question=q1, context=context).strip())

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: 
the course has already started, can I still enroll?

CONTEXT: 
section: General course-related questions
question: The course has already started. Can I still join it?
answer: Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.
In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be de

In [15]:
completion = client.chat.completions.create(
    model="gpt-4o",
    messages=[
        {"role": "user", "content": prompt}
    ]
)

print(completion.choices[0].message)

ChatCompletionMessage(content='Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course. In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', role='assistant', function_call=None, tool_calls=None)


In [16]:
completion.choices[0].message.content

'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course. In order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.'

In [17]:
completion.choices[0].message.role

'assistant'

# 1.5 代码集成

In [18]:
# 用本地的documents.json文件进行搜索
def search(query):
    # 这个boost的作用是，如果搜索的关键词在question字段中，那么这个字段的权重是3.0，如果在section字段中，那么这个字段的权重是0.5
    # 也就意味着 question字段的权重是section字段的6倍
    boost = {
        "question": 3.0,
        'section':0.5,
    }
    
    filter_dict = {"course": "data-engineering-zoomcamp"}
    
    result = index.search(
        query=query,
        filter_dict=filter_dict,
        boost_dict=boost,
        num_results=10 # 返回多少个结果
    )
    return result

In [19]:
result = search('how do i run kafka?')
result

[{'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': "Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you'll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on MacOS, Linux and Windows - but for Windows the path is slightly different (it's env/Scripts/activate)\nAlso the virtual environment should be created only to run the python file. Docker images should first all be up and running.",
  'section': 'Module 6: streaming wi

In [20]:
# 用prompt的方式生成回答
def build_prompt(query, research_results):
    """
    query: str, the question to answer
    research_results: list of dicts, the search results from the index, here is the one we built with the documents.json file 即我们自己构建的索引和其对应的文档（回答）
    """
    
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: 
    {question}

    CONTEXT: 
    {context} 
    """.strip()
    
    context = ""
    
    # 遍历 result 中的每个字典，将其 section、question 和 text 信息添加到 context 字符串中
    for doc in research_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    # 该作用是生成最终的提示字符串
    # q1 是要回答的问题，context 是从 result 列表中构建的上下文
    prompt = prompt_template.format(question=query, context=context).strip()
    
    return prompt

# result = search('how do i run kafka?') # 这个在上面已经执行过了
test_prompt = build_prompt('how do i run kafka?', result)
print(test_prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT. 
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: 
    how do i run kafka?

    CONTEXT: 
    section: Module 6: streaming with kafka
question: Java Kafka: How to run producer/consumer/kstreams/etc in terminal
answer: In the project directory, run:
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java

section: Module 6: streaming with kafka
question: Module “kafka” not found when trying to run producer.py
answer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.
To create a virtual env and install packages (run only once)
python -m venv env
source env/bin/activate
pip install -r ../requirements.txt
To activate it (you'll need to run it every time you need the virtual env):
source env/bin/activate
To deactivate it:
deactivate
This works on MacOS, Linux and Windows - 

In [21]:
def llm(prompt):
    completion = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "user", "content": prompt}
        ]
    )
    
    return completion.choices[0].message.content

In [22]:
# 第一步，通过问题，配合documents.json文件进行搜索，得到相应（较为符合条件）的文档
query = 'how do i run kafka?'
search_results = search(query)

# 第二部分，将搜索结果和问题组合成一个prompt，即有了三个主体：
# role - 表示设定的角色
# question - 问题
# context - 通过search_results得到的上下文
prompt = build_prompt(query, search_results)
# print(prompt)

# 第三部分，将prompt传入llm函数，得到由gpt-4o生成的回答，不过是基于给定的上下文和问题
answer = llm(prompt)

In [23]:
print(answer)

To run Kafka, follow these steps as provided in the course content:

1. **Java Kafka**:
   - Navigate to the project directory.
   - Run the following command in the terminal:
     ```sh
     java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
     ```

2. **Python Kafka**:
   - Ensure you have a virtual environment set up and the necessary dependencies installed.
   - To create and activate a virtual environment, run:
     ```sh
     python -m venv env
     source env/bin/activate
     pip install -r ../requirements.txt
     ```
     For Windows, activate the virtual environment with:
     ```sh
     env\Scripts\activate
     ```
   - To deactivate the virtual environment when done:
     ```sh
     deactivate
     ```

3. **Docker for Kafka environment**:
   - If you encounter `kafka.errors.NoBrokersAvailable: NoBrokersAvailable`, check if your Kafka broker docker container is running with:
     ```sh
     docker ps
     ```
   - If it's not

In [24]:
# 第四步，将上述的所有步骤封装成一个函数
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [25]:
rag_result = rag('how do i run docker?')
print(rag_result)

To run Docker, you can follow these steps depending on what you need to do. Below are some relevant scenarios and solutions from the context provided:

1. **Running a Docker Container on Windows:**
   If you encounter the error message "the input device is not a TTY," append `winpty` before your `docker run` command. For example:
   ```bash
   winpty docker run -it ubuntu bash
   ```

   You can also create an alias to avoid typing `winpty` every time:
   ```bash
   echo "alias docker='winpty docker'" >> ~/.bashrc
   OR
   echo "alias docker='winpty docker'" >> ~/.bash_profile
   ```

2. **Connecting to Docker Daemon:**
   If you get the error "Cannot connect to Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?", ensure the Docker daemon is started. If using WSL, update it with the following command in PowerShell:
   ```bash
   wsl --update
   ```

3. **Pulling Docker Images:**
   To pull a Docker image, ensure you are using the correct repository name. For ex

# 1.6 Search with Elasticsearch

我们要用documents.json文件中的文档，然后将其放入到elasticsearch中

In [27]:
from elasticsearch import Elasticsearch

In [28]:
es_client = Elasticsearch('http://localhost:9200')
es_client.info()
# 这一步是检查是否连接成功

ObjectApiResponse({'name': '85ce1f368f18', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'qYpo-1qLRIGmAN7rAG_izA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [30]:
# 创建index，
# text: 用于全文搜索
# keyword 类型，用于结构化数据的精确搜索
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

# 创建索引名称
index_name = 'course-questions'

# 创建实例并指定索引名称和设置，最后打印返回结果
response = es_client.indices.create(index=index_name, body=index_settings)
print(response)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [31]:
from tqdm.auto import tqdm

In [35]:
len(documents) # document的总体长度为948，这里包含了zoomcamp课程中所有可能出现的问答

948

In [32]:
# 将一批文档插入到已经创建好的 Elasticsearch 索引中
for doc in tqdm(documents):
    es_client.index(index=index_name, body=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [36]:
query = 'I just discovered the course. Can I still join it?'

# query the data
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": { # 这里的must是用来搜索的，返回的结果中，必须包含query字段中的内容
                "multi_match": { # 这里的multi_match是用来多字段搜索的，这里的fields字段指定了搜索的字段
                    "query": query, # 这里的query是搜索的关键词
                    "fields": ["question^3", "text", "section"],   # 这里的fields是指定搜索的字段, question^3表示question字段的权重是3, text和section字段的权重是1
                    "type": "best_fields" # 这里的type是指定搜索的类型，这里是best_fields，表示只要有一个字段匹配成功，就返回
                }
            },
            "filter": { # 这里的filter是用来过滤的，只返回course字段为'data-engineering-zoomcamp'的结果
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [40]:
search_result = es_client.search(index=index_name, body=search_query)
search_result

ObjectApiResponse({'took': 26, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 405, 'relation': 'eq'}, 'max_score': 72.849266, 'hits': [{'_index': 'course-questions', '_id': '5kpfgJABYXo4f2CmtNhv', '_score': 72.849266, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course-questions', '_id': '60pfgJABYXo4f2CmtNjt', '_score': 54.057133, '_source': {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next 

In [43]:
result_docs = []

for hit in search_result['hits']['hits']:
    result_docs.append(hit['_source'])

result_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [44]:
# 结合在一块
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

query = 'I just discovered the course. Can I still join it?'
search_result = elastic_search(query)
search_result

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [45]:
# 将search_result传入build_prompt函数，生成prompt
def rag_improved(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [46]:
query = 'I just discovered the course. Can I still join it?'
rag_result = rag_improved(query)

In [47]:
rag_result

"Yes, you can still join the course. Even if you haven't registered, you are eligible to submit the homeworks. However, make sure to adhere to the deadlines for turning in the final projects."