# Construcción de knowledge Base

In [46]:
import json

In [47]:
import requests

url = 'https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py'
response = requests.get(url)

with open('minsearch.py', 'wb') as file:
    file.write(response.content)

In [48]:
# Motor de busqueda creado previamente
import minsearch

In [49]:
with open('documents.json','rt') as f_in:
    docs_raw = json.load(f_in)

In [50]:

documents= []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [51]:
len(documents)
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [52]:
index = minsearch.Index(
    text_fields=["question","text","section"],
    keyword_fields=["course"]
)

In [53]:
q = 'the course has already started, can I still enroll?'

In [54]:
#Analiza los documentos y obtiene la información que pide index
index.fit(documents)

<minsearch.Index at 0x287cdf5d360>

In [55]:
#Prioriza las respuestas de acuerdo a las preguntas dadas y lo que escanea en los documentos
# Los parametros que se pasan para question y section, es la relevancia (o peso) que se le da a encontrar las palabras en cada una de las secciones señaladas 
boost = {'question':3.0,'section':0.5}

results = index.search(
    query = q,
    filter_dict={'course':'data-engineering-zoomcamp'}, #Filtro de la base de documentos
    boost_dict=boost,
    num_results=5
)

In [56]:
results

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 202

# Conectar Chatgpt 4o

In [57]:
from openai import OpenAI
import os

In [58]:
api_key = os.getenv('OPENAI_API_KEY')
client = OpenAI(api_key=api_key)

In [59]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user','content':q}]
)

In [60]:
response.choices[0].message.content

"It depends on the specific course and institution's policies. Here are a few steps you can take to find out:\n\n1. **Check Enrollment Policies:** Review the course's website or catalog for information on late enrollment policies. Some institutions allow late enrollment within a certain timeframe, while others may not.\n\n2. **Contact the Instructor:** Reach out to the course instructor directly via email or during their office hours. Explain your situation and ask if it’s possible to join the class late.\n\n3. **Consult the Registrar:** Contact the registrar’s office or the student services department at your institution. They can provide guidance on the administrative aspects of late enrollment.\n\n4. **Review Course Requirements:** Make sure you understand the course requirements and catch up on any missed materials. You might need to put in extra effort to cover what you missed.\n\n5. **Consider the Workload:** Assess whether you can handle the additional workload to catch up. Some

In [61]:
prompt_template ="""
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ Database. Use only the facts from the CONTEXT when answering the QUESTION.
if the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT: {context}
"""

In [62]:
context = ""

for doc in results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n".strip()

In [63]:
print(context)

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.section: General course-related questions
question: Course - When will the course start?
answer: The purpose of this document is to capture frequently asked technical questions
The exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with th

In [64]:
prompt = prompt_template.format(question=q, context=context).strip()

In [65]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ Database. Use only the facts from the CONTEXT when answering the QUESTION.
if the CONTEXT doesn't contain the answer, output NONE

QUESTION: the course has already started, can I still enroll?

CONTEXT: section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.section: General course-related questions
question: Course - Can I follow the course after it finishes?
answer: Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.
You can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final

In [66]:
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role':'user','content':prompt}]
)

response.choices[0].message.content

'Yes, you can still enroll in the course after it has started. You are eligible to submit the homework, but be aware that there will be deadlines for turning in the final projects.'

# Cleaning and Modularizing Code

In [67]:
def search(query):
    boost = {'question':3.0,'section':0.5}

    results = index.search(
        query = query,
        filter_dict={'course':'data-engineering-zoomcamp'}, #Filtro de la base de documentos
        boost_dict=boost,
        num_results=10
    )

    return results

In [68]:
query = "How do I install Docker?"
search_results = search(query)

In [69]:
def build_prompt(query,search_results):
    prompt_template ="""
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ Database. Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: {context}
    """
    context = ""

    for doc in search_results:
            context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n".strip()

    prompt = prompt_template.format(question=query, context=context).strip()

    return prompt

In [70]:
query = "How do I run Kafka?"
search_results = search(query)
build_prompt(query,search_results)

'You\'re a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ Database. Use only the facts from the CONTEXT when answering the QUESTION.\n\n    QUESTION: How do I run Kafka?\n\n    CONTEXT: section: Module 6: streaming with kafka\nquestion: Java Kafka: How to run producer/consumer/kstreams/etc in terminal\nanswer: In the project directory, run:\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.javasection: Module 6: streaming with kafka\nquestion: Module “kafka” not found when trying to run producer.py\nanswer: Solution from Alexey: create a virtual environment and run requirements.txt and the python files in that environment.\nTo create a virtual env and install packages (run only once)\npython -m venv env\nsource env/bin/activate\npip install -r ../requirements.txt\nTo activate it (you\'ll need to run it every time you need the virtual env):\nsource env/bin/activate\nTo deactivate it:\ndeactivate\nThis works on M

In [71]:
def llm(prompt):  
    response = client.chat.completions.create(
        model='gpt-4o',
        messages=[{'role':'user','content':prompt}]
    )

    return response.choices[0].message.content

In [72]:
def rag(query):
    search_results = search(query)
    prompt = build_prompt(query,search_results)
    answer = llm(prompt)
    return answer

In [73]:
query = "How do I run Kafka?"

answer = rag(query)
print(answer)

To run Kafka, use the following command in your project directory:

```bash
java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java
```

Make sure to replace `<jar_name>` with the actual name of your .jar file.


# Search with Elasticsearch

## Run Docker Image on Terminal


docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3

## Comprobación de puerto

curl http://localhost:9200

In [74]:
from elasticsearch import Elasticsearch

In [84]:
es_client = Elasticsearch('http://localhost:9200')

In [85]:
es_client.info()

ObjectApiResponse({'name': 'd8f35b9253e1', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'LQGQ_gM-RWmCzo7elgGOoA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [86]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [87]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [88]:
from tqdm.auto import tqdm

In [89]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:16<00:00, 58.30it/s]


In [90]:
query = 'I just discovered the course, Can I still join it?'

In [91]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}

In [92]:
response = es_client.search(index=index_name, body=search_query)

In [93]:
response

ObjectApiResponse({'took': 42, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 405, 'relation': 'eq'}, 'max_score': 72.64078, 'hits': [{'_index': 'course-questions', '_id': 'Di7cQJABDeVeK5XbkG8g', '_score': 72.64078, '_source': {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}}, {'_index': 'course-questions', '_id': 'Ey7cQJABDeVeK5XbkG-s', '_score': 53.869614, '_source': {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next co

In [94]:
results_docs =[]

for hit in response['hits']['hits']:
    results_docs.append(hit['_source'])

In [95]:
results_docs

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [96]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    results_docs =[]

    for hit in response['hits']['hits']:
        results_docs.append(hit['_source'])
    
    return results_docs

In [97]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query,search_results)
    answer = llm(prompt)
    return answer

In [98]:
elastic_search(query)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [99]:
rag(query)

'Yes, you can still join the course even if you just discovered it. You are eligible to submit the homework, but please be mindful of the deadlines for turning in the final projects.'

## Detener imagen de Docker en la terminal 

docker stop ($docker ps -q)