In [1]:
import requests
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from openai import OpenAI
import tiktoken

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [3]:
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
len(documents)

948

In [5]:
documents[-1]

{'text': 'Problem description\nInfrastructure created in AWS with CD-Deploy Action needs to be destroyed\nSolution description\nFrom local:\nterraform init -backend-config="key=mlops-zoomcamp-prod.tfstate" --reconfigure\nterraform destroy --var-file vars/prod.tfvars\nAdded by Erick Calderin',
 'section': 'Module 6: Best practices',
 'question': 'How to destroy infrastructure created via GitHub Actions',
 'course': 'mlops-zoomcamp'}

# Q1: "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73"

In [6]:
image = !curl localhost:9200

In [7]:
image

['  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current',
 '                                 Dload  Upload   Total   Spent    Left  Speed',
 '',
 '  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0',
 '100   539  100   539    0     0   109k      0 --:--:-- --:--:-- --:--:--  131k{',
 '  "name" : "64ba05140871",',
 '  "cluster_name" : "docker-cluster",',
 '  "cluster_uuid" : "rYwY329rQr-pLJYC5fNJjA",',
 '  "version" : {',
 '    "number" : "8.4.3",',
 '    "build_flavor" : "default",',
 '    "build_type" : "docker",',
 '    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",',
 '    "build_date" : "2022-10-04T07:17:24.662462378Z",',
 '    "build_snapshot" : false,',
 '    "lucene_version" : "9.3.0",',
 '    "minimum_wire_compatibility_version" : "7.17.0",',
 '    "minimum_index_compatibility_version" : "7.0.0"',
 '  },',
 '  "tagline" : "You Know, for Search"',
 '}',
 '']

In [8]:
matching_elements = list(filter(lambda x: "build_hash" in x, image))
matching_elements

['    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",']

# Q2: index

In [9]:
es_client = Elasticsearch("http://localhost:9200")

In [10]:
es_client.info()

ObjectApiResponse({'name': '64ba05140871', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'rYwY329rQr-pLJYC5fNJjA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

In [12]:
index_name = "course-questions"

In [13]:
from elasticsearch import Elasticsearch
from elasticsearch import NotFoundError

try:
    es_client.indices.get(index=index_name)
    print(f"{index_name} already exists")
    es_client.indices.delete(index=index_name, ignore=[400, 404]) # uncomment if you want to remove the index.
    response = es_client.indices.create(index=index_name, body=index_settings)
except NotFoundError:
    response = es_client.indices.create(index=index_name, body=index_settings)
    print(response)

course-questions already exists


  es_client.indices.delete(index=index_name, ignore=[400, 404]) # uncomment if you want to remove the index.


In [14]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████| 948/948 [00:57<00:00, 16.56it/s]


# Q3: Top ranking result score: 84.050095

In [15]:
query = "How do I execute a command in a running docker container?"
query

'How do I execute a command in a running docker container?'

In [16]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                # "filter": {
                #     "term": {
                #         "course": "data-engineering-zoomcamp"
                #     }
                # }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])

    # Get the score of the top-ranking result
    if response["hits"]["hits"]:
        top_score = response["hits"]["hits"][0]["_score"]
        print("Top ranking result score:", top_score)
    else:
        top_score = None
        print("No results found")

    return result_docs, top_score


In [17]:
search = elastic_search(query)
search[1]

Top ranking result score: 84.050095


84.050095

# Q4: How do I copy files from a different folder into docker container’s working directory?

In [18]:
def elastic_search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    print(response)
    
    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])

    # Get the score of the top-ranking result
    if response["hits"]["hits"]:
        top_score = response["hits"]["hits"][0]["_score"]
        print("Top ranking result score:", top_score)
    else:
        top_score = None
        print("No results found")

    return result_docs, top_score


In [19]:
search = elastic_search(query)
# search[0]

{'took': 4, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 345, 'relation': 'eq'}, 'max_score': 84.050095, 'hits': [{'_index': 'course-questions', '_id': '0mQaOpABDPWUvUvKpl3S', '_score': 84.050095, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions', '_id': '8WQaOpABDPWUvUvKrV3G', '_score': 51.04628, '_source': {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or dire

# Q5: 588

In [20]:
client = OpenAI()

In [21]:
def elastic_search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    # print(response)
    
    result_docs = []

    for hit in response["hits"]["hits"]:
        result_docs.append(hit["_source"])

    # Get the score of the top-ranking result
    # if response["hits"]["hits"]:
    #     top_score = response["hits"]["hits"][0]["_score"]
    #     print("Top ranking result score:", top_score)
    # else:
    #     top_score = None
    #     print("No results found")

    return result_docs


In [22]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT:
    {context}
    """.strip()

    context = ""

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

    prompt = prompt_template.format(
        question=query,
        context=context
    ).strip()

    # print(prompt)
    
    return prompt

In [23]:
def llm(prompt):
    response = client.chat.completions.create(
        model="gpt-4o",
        # model="gpt-3.5-turbo",
        messages=[
            
                {
                    "role": "user",
                    "content": prompt
                }
            ]
        )
    
    return response.choices[0].message.content

In [24]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [25]:
answer = rag(query)
answer

'To execute a command in a running Docker container, you can use the `docker exec` command. Here are the steps to do so:\n\n1. First, find the container ID of the running container using `docker ps`:\n    ```bash\n    docker ps\n    ```\n2. Then, execute a command in the specific container using the container ID. For example, to run a bash shell, use:\n    ```bash\n    docker exec -it <container-id> bash\n    ```\n\nThis will start an interactive bash session inside the running container.'

In [26]:
len(answer)

481

# Q6: 711

In [27]:
search_results = elastic_search(query)
prompt = build_prompt(query, search_results)
prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n    \n    QUESTION: How do I execute a command in a running docker container?\n    \n    CONTEXT:\n    section: 5. Deploying Machine Learning Models\nquestion: How do I debug a docker container?\nanswer: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\nsection: 5. Deploying Machine Learning Models\nquestion: How do I copy files from my local machine to docker container?\nanswer: You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nTo copy a file or directory from your local machi

In [28]:
# Assuming this is your query
# query = "How do I execute a command in a running docker container?"
query = prompt

# Get the encoding for the model
encoding = tiktoken.encoding_for_model("gpt-4o")

# Encode the query to get the tokens
tokens = encoding.encode(query)

# Count the number of tokens
num_tokens = len(tokens)

print(f"Number of tokens in the query: {num_tokens}")


Number of tokens in the query: 361


In [29]:
encoding.decode_single_token_bytes(63842)

b"You're"