In [1]:
import minsearch
import json
from openai import OpenAI

## Get Q&A data

In [2]:
# Downloading the data
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
docs_raw = docs_response.json()


In [3]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)
documents[0]


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Index Json into vectors

In [4]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [5]:
q = 'the course has already started, can I still enroll?'

In [6]:
index.fit(documents)

<minsearch.Index at 0x7fdd101e7640>

In [33]:
client = OpenAI()

## Define some functions for RAG

**Retrival**  - Search function

In [8]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [9]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [11]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content


In [12]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [16]:
rag(query)

UnicodeEncodeError: 'ascii' codec can't encode character '\u201c' in position 7: ordinal not in range(128)

## Elastic search 
persistent : save the data in the database

In [13]:
from elasticsearch import Elasticsearch

In [14]:
es_client = Elasticsearch('http://localhost:9200') 

In [15]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [16]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [17]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████| 948/948 [00:25<00:00, 37.28it/s]


In [18]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [19]:
elastic_search(query)

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java',
  'section': 'Module 6: streaming with kafka',
  'question': 'Java Kafka: How to run producer/consumer/kstreams/etc in terminal',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repositor

## Home work 1

In [23]:
q2 = "How do I execute a command in a running docker container?"

In [24]:
search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": q2,
                        "fields": ["question^4", "text",],
                        "type": "best_fields"
                    }
                },
                # "filter": {
                #     "term": {
                #         "course": "data-engineering-zoomcamp"
                #     }
                # }
            }
        }
    }

response = es_client.search(index=index_name, body=search_query)
for hit in response['hits']['hits']:
        print(hit['_score'])  

84.050095
75.54128
72.08518
51.04628
49.938507


In [25]:
search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": q2,
                        "fields": ["question^4", "text",],
                        "type": "best_fields"
                    }
                },
                 "filter": {
                     "term": {
                         "course": "machine-learning-zoomcamp"
                     }
                 }
            }
        }
    }

response = es_client.search(index=index_name, body=search_query)
for hit in response['hits']['hits']:
        print(hit['_source']['question']) 

How do I debug a docker container?
How do I copy files from my local machine to docker container?
How do I copy files from a different folder into docker container’s working directory?


In [26]:
# quesion 4 : 
def elastic_search_q4(query):
    search_query = {
            "size": 3,
            "query": {
                "bool": {
                    "must": {
                        "multi_match": {
                            "query": query,
                            "fields": ["question^4", "text",],
                            "type": "best_fields"
                        }
                    },
                     "filter": {
                         "term": {
                             "course": "machine-learning-zoomcamp"
                         }
                     }
                }
            }
        }
    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [27]:
def build_prompt_q4(query, search_results):
    context_template = """
    Q: {question}
    A: {text}
    """.strip()
    
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()

    context = ""

    for doc in search_results:
        context = context + context_template.format(question=doc['question'], text=doc['text']) + "\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [28]:
search_results = elastic_search_q4(q2)
prompt = build_prompt_q4(q2, search_results)

In [29]:
len(prompt)

1498

In [36]:
import tiktoken

encoding = tiktoken.get_encoding("cl100k_base")


# Tokenize the query
tokens = encoding.encode(prompt)

# Calculate the number of tokens
num_tokens = len(tokens)

print(f"Number of tokens: {num_tokens}")


Number of tokens: 334


## generate an answer

In [37]:
print(llm(prompt))

To execute a command in a running Docker container, you can follow these steps:

1. Identify the running container's ID by listing all running containers:
   ```bash
   docker ps
   ```

2. Use the container ID to execute a command within the running container. For example, to start a bash shell in the container:
   ```bash
   docker exec -it <container-id> bash
   ```

This allows you to run commands inside the Docker container interactively.


In [39]:
(150 * 0.005) + (250 * 0.015)

4.5

In [40]:
answer = "To execute a command in a running Docker container, you can follow these steps: \n\n \
1. Identify the running container's ID by listing all running containers: \n \
   ```bash \n \
   docker ps \n \
   ```\\n\n \
2. Use the container ID to execute a command within the running container. For example, to start a bash shell in the container: \n \
   ```bash \n \
   docker exec -it <container-id> bash\n\
   ``` \n\n\
This allows you to run commands inside the Docker container interactively."

In [41]:
# Tokenize the query
tokens = encoding.encode(answer)

# Calculate the number of tokens
num_tokens = len(tokens)

print(f"Number of tokens: {num_tokens}")

Number of tokens: 106


In [42]:
(334 * 0.005) + (106 * 0.015)

3.26