# Prepare Open AI client object

In [1]:
from openai import OpenAI

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
client=OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

In [4]:
client

<openai.OpenAI at 0x7fb74de68370>

# Prepare documents for information retrieval

In [5]:
import minsearch

In [6]:
import json

In [7]:
with open('documents.json', 'rt') as f_in:
    docs_raw=json.load(f_in)

In [8]:
documents=[]

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [9]:
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course']
)
index.fit(documents)

In [27]:
query ='the course has already started, can I still enroll?'

In [24]:
def search(query):
    
    boost={'question': 3.0, 'section': 0.5 } # By default, all parameters have value of 1.
    #Increasing or decreasing this value will change its relevance during the search process.

    results = index.search(
        query=query,
        boost_dict=boost,
        num_results=5,
        filter_dict={'course': 'data-engineering-zoomcamp'}
    )
    return results

In [28]:
search_results = search(query)

# Generate answers from OpenAI GPT

In [38]:
def build_prompt(query, search_results):
    
    prompt_template="""
    You are a course teaching assistant. Answer the QUESTION based on the CONTEXT.
    Use only the facts of the CONTEXT in the answer. 
    If the CONTEXT doesn't contain the answer, output NONE.

    QUESTION: {question}

    CONTEXT: {context}

    """.strip()
    
    context=''

    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer:{doc['text']}\n\n"
        
    prompt = prompt_template.format(question=query, context=context).strip()
        
    return prompt

In [84]:
def llm(prompt):
    
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

Test generation without context

In [14]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user', 'content':q}]
)

In [15]:
response.choices[0].message.content

'Yes, whether you can still enroll in a course after it has started typically depends on the specific policies of the institution or platform offering the course. Here are some steps you can take:\n\n1. **Check the Course Policy**: Look at the enrollment guidelines provided by the institution or on the course website. Some courses have deadlines, while others may allow late enrollment.\n\n2. **Contact the Instructor or Administrator**: Reach out directly to the course instructor or the administration office. Explain your situation and ask if late enrollment is possible.\n\n3. **Catch Up on Missed Work**: If you are allowed to enroll, you may need to catch up on any missed lectures, assignments, or readings. Ask for any supplementary materials or advice on how to get up to speed.\n\n4. **Use Online Forums or Classmates**: If the course has an online forum or discussion board, use it to ask questions and get help. Connecting with classmates can also provide support and resources to help 

Test generation with context

In [40]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [41]:
answer = rag(query)

In [42]:
answer

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

In [43]:
rag('how can i make pancakes?')

'NONE'

# RAG with Elastic Search

In [44]:
from elasticsearch import Elasticsearch

In [46]:
es_client=Elasticsearch('http://localhost:9200')

In [47]:
#es_client.info()

ObjectApiResponse({'name': 'b359dd4497f3', 'cluster_name': 'docker-cluster', 'cluster_uuid': '_7h4zc2MQCKCk6sS0pWVbw', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [48]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

#Create an index in elastic search (equivelant to a table in sql database)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [49]:
for doc in documents:
    
    es_client.index(index=index_name, document=doc)

In [51]:
query= 'i just found out about this course, can i still join?'

In [52]:
def elastic_search(query):
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        #a question is 3 times more important than the text or section for the search results
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": { #limit questions about only one part of the documents available
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in search_results['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [53]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at

In [54]:
def elastic_rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [55]:
elastic_rag(query)

'Yes, you can still join the course even after the start date. You are eligible to submit the homework assignments, but keep in mind that there will be deadlines for turning in the final projects. So make sure not to leave everything for the last minute.'

In [56]:
rag(query)

"Yes, you can still join the course. Even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects, so don't leave everything for the last minute."

# Homework

In [58]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
es_client=Elasticsearch('http://localhost:9200')

In [61]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions-homework"

#Create an index in elastic search (equivelant to a table in sql database)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions-homework'})

## Q2: function to add data to elastic is index

In [62]:
for doc in documents:
    
    es_client.index(index=index_name, document=doc)

In [59]:
query="How do I execute a command in a running docker container?"

In [65]:
def elastic_search(query):
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
             #   "filter": { #limit questions about only one part of the documents available
             #       "term": {
              #          "course": "data-engineering-zoomcamp"
              #      }
                #}
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in search_results['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return search_results, result_docs

In [66]:
raw_results, result_docs = elastic_search(query)

## Q3: maximum score for results (aka top ranking result score) is 84.05

In [67]:
raw_results

ObjectApiResponse({'took': 65, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 865, 'relation': 'eq'}, 'max_score': 84.050095, 'hits': [{'_index': 'course-questions-homework', '_id': 'ZA81xpAB3cqpvcvBPyA8', '_score': 84.050095, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions-homework', '_id': 'cg81xpAB3cqpvcvBLR5F', '_score': 75.54128, '_source': {'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it ru

In [68]:
def elastic_search_with_filter(query):
    
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": { #limit questions about only one part of the documents available
                    "term": {
                       "course": "machine-learning-zoomcamp"
                   }
             }
            }
        }
    }

    search_results = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in search_results['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return search_results, result_docs

In [69]:
raw_results, doc_results = elastic_search_with_filter(query)

## Q4: the 3rd question of the search result is "How do I copy files from a different folder into docker container’s working directory?"

In [71]:
raw_results

ObjectApiResponse({'took': 31, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 345, 'relation': 'eq'}, 'max_score': 84.050095, 'hits': [{'_index': 'course-questions-homework', '_id': 'ZA81xpAB3cqpvcvBPyA8', '_score': 84.050095, '_source': {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)', 'section': '5. Deploying Machine Learning Models', 'question': 'How do I debug a docker container?', 'course': 'machine-learning-zoomcamp'}}, {'_index': 'course-questions-homework', '_id': 'gw81xpAB3cqpvcvBQCA5', '_score': 51.04628, '_source': {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's 

In [70]:
doc_results

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': "You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\nHrithik Kumar Advani",
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I copy files from my local machine to docker container?',
 

## Q5: size of the prompt built is 1487

In [90]:
def build_prompt(query, search_results):
    
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()
    
    context_template = """
        Q: {question}
        A: {text}
        """.strip()

    context_list=[]
    
    for doc in search_results:
        context_list.append(context_template.format(**doc))
    
    context = '/n/n'.join(context_list)
    
    prompt = prompt_template.format(question=query, context=context).strip()
        
    return prompt

In [91]:
prompt = build_prompt(query, doc_results)

In [92]:
len(prompt)

1506

## Q6: estimate the number of tokens used by our prompt in chatgpt4 is 326

In [76]:
import tiktoken

In [78]:
encoding=tiktoken.encoding_for_model('gpt-4o')

In [79]:
tokens=encoding.encode(prompt)

In [81]:
len(tokens)

326

In [82]:
encoding.decode(tokens)

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n    Use only the facts from the CONTEXT when answering the QUESTION.\n\n    QUESTION: How do I execute a command in a running docker container?\n\n    CONTEXT:\n    \n\nQ: How do I debug a docker container?\n\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)\n\n\n\nQ: How do I copy files from my local machine to docker container?\n\nA: You can copy files from your local machine into a Docker container using the docker cp command. Here\'s how to do it:\nTo copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basic syntax is as follows:\ndocker c

## Bonus questions: generate answer + calculate costs

In [85]:
llm(prompt)

'To execute a command in a running Docker container, you can follow these steps:\n\n1. First, find the ID of the running container by using the `docker ps` command:\n   ```sh\n   docker ps\n   ```\n2. Then, use the `docker exec` command along with the `-it` option and the container ID to execute a command inside the container. For example, to start a bash shell in the container, you would use:\n   ```sh\n   docker exec -it <container-id> bash\n   ```\n\nReplace `<container-id>` with the actual ID of your running container.'

In [86]:
# per request: 150 tokens sent and 250 tokens received

price_per_token_sent=0.005/1000
price_per_token_received=0.015/1000

request_1000_cost= 1000*(150*price_per_token_sent+250*price_per_token_received)
request_1000_cost

4.5