# (HW1) Json + Elastic Search + OpenAI API

# Import packages and modules

In [None]:
import minsearch
import json
from openai import OpenAI  # OpenAI API client
import pinecone  # Pinecone client library
# print(pinecone.__version__)  # Print Pinecone version (optional)
from pinecone import Pinecone, ServerlessSpec  # For setup and serverless config
import hashlib  # For hashing (e.g., file IDs)
import os  # OS operations and env vars
from datetime import datetime  # Date and time handling
from tqdm import tqdm  # Progress bars (e.g., looping through files)
import requests  # HTTP requests (e.g., for web scraping or API calls)
import re  # Regular expressions (e.g., for pattern matching in strings)
from bs4 import BeautifulSoup  # HTML parsing
from dotenv import load_dotenv  # Load .env file

# Load API Keys

In [None]:
load_dotenv()
# Now retrieve the keys
openai_key = os.getenv("OPENAI_API_KEY")
pinecone_key = os.getenv("PINECONE_API_KEY")

In [None]:
# Initialize the OpenAI client
client = OpenAI(
    api_key=openai_key
)

# Initialize Pinecone v3 client
pc = Pinecone(
    api_key=pinecone_key
)

# Load Json File

In [None]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        
        # Fix: normalize text fields to strings
        for field in ["question", "text", "section"]:
            if isinstance(doc.get(field, ""), list):
                doc[field] = " ".join(doc[field])
            elif doc.get(field) is None:
                doc[field] = ""
            else:
                doc[field] = str(doc[field])

        documents.append(doc)


In [None]:
#documents[0]

# Elastic Search

In [None]:
# docker run -it \
#     --rm \
#     --name elasticsearch \
#     -m 4GB \
#     -p 9200:9200 \
#     -p 9300:9300 \
#     -e "discovery.type=single-node" \
#     -e "xpack.security.enabled=false" \
#     docker.elastic.co/elasticsearch/elasticsearch:8.17.6

In [None]:
from elasticsearch import Elasticsearch

In [None]:
es_client = Elasticsearch('http://localhost:9200')

In [None]:
es_client.info()

In [None]:
!!curl localhost:9200

In [None]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

In [None]:
documents[0]

In [None]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

# Functions

In [None]:
#question = 'How do execute a command on a Kubernetes pod?'

In [None]:
question = 'How do copy a file to a Docker container?'

In [None]:
def elastic_search(question):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": question,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [None]:
elastic_search(question)

In [None]:
def build_prompt(question, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: question: {doc['question']}\nanswer: {doc['text']}\n\n"
    prompt = prompt_template.format(question=question, context=context).strip()
    return prompt

In [None]:
def llm(prompt):
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [None]:
import tiktoken

def rag(question):
    search_results = elastic_search(question)
    prompt = build_prompt(question, search_results)
    
    # Token counting using tiktoken
    encoding = tiktoken.encoding_for_model("gpt-4o")
    num_tokens = len(encoding.encode(prompt))
    print(f"Prompt length (in characters): {len(prompt)}")
    print(f"Prompt length (in tokens): {num_tokens}")
    
    answer = llm(prompt)
    return answer

In [169]:
rag(question)

  response = es_client.search(index=index_name, body=search_query)


2264
Prompt length (in characters): 2264
Prompt length (in tokens): 500


'To copy a file to a Docker container, you can use the `docker cp` command. The basic syntax is as follows:\n\n```bash\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```'