#### Q1. Running Elastic
Run Elastic Search 8.4.3, and get the cluster information

In [4]:
import requests
import json
import os
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm
from openai import OpenAI


In [1]:
!curl localhost:9200

{
  "name" : "ffd9154b40fb",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "-YF-0yqhS-eCkU9H5luwWw",
  "version" : {
    "number" : "8.4.3",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "42f05b9372a9a4a470db3b52817899b99a76ee73",
    "build_date" : "2022-10-04T07:17:24.662462378Z",
    "build_snapshot" : false,
    "lucene_version" : "9.3.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}


In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [9]:
client = OpenAI()

In [6]:
es_client = Elasticsearch("http://localhost:9200")

In [71]:
index_settings ={
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions-3'
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-3'})

In [72]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [73]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                # "filter": {
                #     "term": {
                #         "course": "data-engineering-zoomcamp"
                #     }
                # }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    result_docs = []
    result_scores = []
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        result_scores.append(hit['_score'])

    return result_docs, result_scores

In [74]:
query = "How do I execute a command in a running docker container?"
search_results, search_scores = elastic_search(query)

In [76]:
search_scores[0]

84.050095

In [77]:
def llm(prompt):
    response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user', 'content':prompt}],
        
    )
    return response.choices[0].message.content

Q4. Filtering

In [87]:
def elastic_search(query):
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        

    return result_docs

In [88]:
query = "How do I execute a command in a running docker container?"
search_results = elastic_search(query)

In [89]:
search_results[2]

{'text': 'You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nIn the Dockerfile, you can provide the folder containing the files that you want to copy over. The basic syntax is as follows:\nCOPY ["src/predict.py", "models/xgb_model.bin", "./"]\t\t\t\t\t\t\t\t\t\t\tGopakumar Gopinathan',
 'section': '5. Deploying Machine Learning Models',
 'question': 'How do I copy files from a different folder into docker container’s working directory?',
 'course': 'machine-learning-zoomcamp'}

In [103]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}
    
CONTEXT:
{context}
""".strip()
    
    context=""
    for doc in search_results:    
        context= context + f"Q: {doc['question']}\nA: {doc['text']}\n\n".strip()
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [104]:
query = "How do I execute a command in a running docker container?"
search_results = elastic_search(query)
prompt = build_prompt(query, search_results)
len(prompt)

1462

In [105]:
prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: How do I execute a command in a running docker container?\n    \nCONTEXT:\nQ: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)Q: How do I copy files from my local machine to docker container?\nA: You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_dir

In [106]:
import tiktoken

In [107]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [109]:
tokens = encoding.encode(prompt)
len(tokens)

322

In [114]:
token_1 = encoding.decode_single_token_bytes(63842)
token_1

b"You're"

In [115]:
original_prompt = encoding.decode(tokens)
original_prompt

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\nUse only the facts from the CONTEXT when answering the QUESTION.\n\nQUESTION: How do I execute a command in a running docker container?\n    \nCONTEXT:\nQ: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)Q: How do I copy files from my local machine to docker container?\nA: You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_dir

### Bonus: generating the answer (ungraded)

In [124]:
query = "How do I execute a command in a running docker container?"
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer
answer = rag(query)
answer

'To execute a command in a running Docker container, you can use the `docker exec` command. First, find the container ID of the running container by using the `docker ps` command. Then, execute your desired command within the container. Here’s how you can do it:\n\n1. List your running containers to find the container ID:\n   ```\n   docker ps\n   ```\n\n2. Execute a command in the specific container using its container ID:\n   ```\n   docker exec -it <container-id> <command>\n   ```\n\nFor example, to start a bash session in the running container:\n```\ndocker exec -it <container-id> bash\n```\n\nReplace `<container-id>` with the actual ID of your running container.'

### Bonus: calculating the costs (ungraded)

In [119]:
def calculate_cost(input_token_count, output_token_count):
    input_cost = (input_token_count/1000) * 0.005
    output_cost =(output_token_count/1000)* 0.015
    
    return input_cost+output_cost

In [122]:
input_token_count = 150
output_token_count= 250

cost_1000_req  = 1000*calculate_cost(input_token_count, output_token_count)
print (f"Cost of 1000 request at an average of {input_token_count} input tokens and {output_token_count} output tokens is ${cost_1000_req}")

cost of 1000 request at an average of 150 input tokens and 250 output tokens is $4.5


In [126]:
input_tokens = encoding.encode(prompt)
output_tokens = encoding.encode(answer)

input_token_count = len(input_tokens)
output_token_count= len(output_tokens)

cost_1000_req  = 1000*calculate_cost(input_token_count, output_token_count)
print (f"Cost of 1000 request at an average of {input_token_count} input tokens and {output_token_count} output tokens is ${cost_1000_req}")


Cost of 1000 request at an average of 322 input tokens and 150 output tokens is $3.86
