In [20]:
from mistralai import Mistral
import yaml
import json
import requests 
from tqdm.auto import notebook_tqdm
import minsearch
from elasticsearch import Elasticsearch

In [5]:
# Read key.yaml into `config`; its 'mistral' entry is used below as the API key.
with open('key.yaml', 'r') as file:
    config = yaml.safe_load(file)

In [6]:
# API key taken from the 'mistral' entry of the loaded config.
api_key = config['mistral']
# Model name passed to every chat completion call in this notebook.
model = "mistral-small-latest"

In [7]:
# Create the Mistral client with the API key loaded above.
client = Mistral(api_key=api_key)

# Send one standalone user message (no retrieved context) and keep the response.
chat_response = client.chat.complete(
    model=model,
    messages=[{"role":"user", "content":"What is the best French cheese?"}]
)


In [8]:
# Display the whole response object; the reply text alone is available at
# chat_response.choices[0].message.content
chat_response

ChatCompletionResponse(id='9119f351194c46b0b9cc1b1e29d1f4cb', object='chat.completion', model='mistral-small-latest', usage=UsageInfo(prompt_tokens=10, completion_tokens=412, total_tokens=422), created=1739369757, choices=[ChatCompletionChoice(index=0, message=AssistantMessage(content='Determining the "best" French cheese can be quite subjective, as it often depends on personal preferences, such as whether you prefer soft, hard, blue, or goat cheeses. However, some of the most renowned and beloved French cheeses include:\n\n1. **Camembert de Normandie**: A soft, bloomy rind cheese with a creamy interior, often considered one of the finest examples of French cheese.\n\n2. **Roquefort**: A blue cheese made from sheep\'s milk, known for its strong, tangy flavor and crumbly texture.\n\n3. **Brie de Meaux**: A soft cheese with a creamy texture and a rich, buttery flavor, often considered one of the finest cheeses in the world.\n\n4. **Comté**: A hard cheese made from cow\'s milk, known for 

In [9]:
# Download the course documents (JSON) that serve as the knowledge base.
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
# A timeout keeps the cell from hanging forever on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail here on an HTTP error instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into a single list of documents, tagging
# each document with the name of the course it belongs to.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [10]:
# Search index over the documents: question/text/section are registered as
# text fields, course as a keyword field (used as a filter in `search`).
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

Keyword fields are used for exact-match filtering, similar to the SQL query `SELECT * WHERE course = 'data-engineering-zoomcamp';`.

In [11]:
# Example user question, sent to the model below without any retrieved context.
q = 'the course has already started, can I still enroll?'

In [12]:
# Fit the index on the flattened document list so it can be searched.
index.fit(documents)

<minsearch.Index at 0x7231f3119bb0>

In [14]:
# Ask the model the question directly, with no context from the documents.
response = client.chat.complete(
    model=model,
    messages=[{"role": "user", "content": q}]
)

# Text of the first returned choice.
response.choices[0].message.content

"Whether you can still enroll in a course that has already started depends on the policies of the educational institution or platform offering the course. Here are some general guidelines:\n\n1. **Check the Course Policy**: Many institutions and platforms have specific policies regarding late enrollment. Some may allow you to join late with certain conditions, while others may have strict deadlines.\n\n2. **Contact the Instructor or Administration**: Reach out to the course instructor, program administrator, or support team to inquire about the possibility of late enrollment. They can provide the most accurate and up-to-date information.\n\n3. **Review Syllabus and Requirements**: If late enrollment is allowed, you might need to catch up on missed content. Review the course syllabus to understand the workload and any prerequisites.\n\n4. **Technical and Logistical Considerations**: Ensure that you have access to all necessary materials, platforms, and resources. You might also need to 

In [15]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Search the minsearch index for documents relevant to ``query``.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must equal. Defaults to
            'data-engineering-zoomcamp', the value that used to be hard-coded.
        num_results: Maximum number of documents requested from the index.
            Defaults to 5, the value that used to be hard-coded.

    Returns:
        The result of ``index.search`` for these arguments. ``build_prompt``
        iterates over it and reads the 'section', 'question' and 'text' keys
        of each item.
    """
    # Matches in the question field are weighted 3.0 and matches in the
    # section field 0.5 (relative to minsearch's default weight, presumably
    # 1.0 -- confirm against the minsearch documentation).
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [16]:
def build_prompt(query, search_results):
    """Build the LLM prompt from the user question and the retrieved documents.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts, each with 'section', 'question'
            and 'text' keys; every dict becomes one entry of the CONTEXT.

    Returns:
        The prompt string, with no leading or trailing whitespace.

    Raises:
        KeyError: If a search result lacks one of the three expected keys.
    """
    # The template is written as adjacent string literals rather than an
    # indented triple-quoted string, so the function's source indentation does
    # not leak into the prompt as leading spaces on every line.
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT:\n"
        "{context}"
    )

    # One entry per retrieved document, separated by a blank line.
    context = "\n\n".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}"
        for doc in search_results
    )

    return prompt_template.format(question=query, context=context).strip()

In [17]:
def llm(prompt):
    """Send ``prompt`` to the chat model as a single user message.

    Uses the notebook-level ``client`` and ``model``.

    Returns:
        The message content of the first choice in the response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.complete(model=model, messages=[user_message])
    first_choice = completion.choices[0]
    return first_choice.message.content

In [18]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` with retrieval-augmented generation over minsearch.

    Retrieves documents with ``search``, turns them into a prompt with
    ``build_prompt`` and returns the text produced by ``llm``.
    """
    context_docs = search(query)
    return llm(build_prompt(query, context_docs))

In [19]:
# Run the minsearch-based pipeline on the example question.
rag(query)

"Based on the CONTEXT provided, here's how you can run Kafka:\n\n1. **Java Kafka**: In the project directory, run:\n   ```\n   java -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n   ```\n\n2. **Python Kafka**: Ensure Docker images are up and running and create a virtual environment:\n   ```\n   python -m venv env\n   source env/bin/activate (or env/Scripts/activate on Windows)\n   pip install -r ../requirements.txt\n   ```\n   Then run your Python files within this virtual environment:\n   ```\n   python <your_script>.py\n   ```\n   If you encounter a permission error with `./build.sh`, run:\n   ```\n   chmod +x build.sh\n   ```\n   If you encounter a `ModuleNotFoundError: No module named 'kafka.vendor.six.moves'`, install `kafka-python-ng`:\n   ```\n   pip install kafka-python-ng\n   ```"

We start from a raw knowledge base in whatever format it happens to be in; in this case, Google Docs files. We parse those files into a better, more structured format, in this case a JSON file. We then use a self-made search engine to build the context for the LLM: we select a set of questions related to the user's query and format them into a prompt for the LLM, so it can elaborate a human-like response to the query. This makes the information easier to manipulate and understand. The prompt, the query, and the number of results retrieved by the search engine are the kinds of things that make the LLM work better.

## Elasticsearch implementation

In [21]:
# Client for the local single-node Elasticsearch instance.
es_client = Elasticsearch(
    hosts=[{'host': 'localhost', 'port': 9200, 'scheme': 'http'}],
    # `request_timeout` replaces the `timeout` keyword, which is deprecated in
    # the 8.x client and emits a DeprecationWarning on this call.
    request_timeout=30,
    max_retries=10,
    retry_on_timeout=True
)

# Test the connection
if es_client.ping():
    print("Conectado a Elasticsearch!")
else:
    print("No se pudo conectar a Elasticsearch.")

Conectado a Elasticsearch!


  es_client = Elasticsearch(


In [22]:
# Index definition: one shard, no replicas, and explicit field mappings.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            # Text fields: the ones searched by the multi_match query in elastic_search.
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            # Keyword field: the one used by the term filter in elastic_search.
            "course": {"type": "keyword"} 
        }
    }
}

# Name of the Elasticsearch index used for creation, indexing and search.
index_name = "course-questions"


In [23]:
# Drop any index left over from a previous run so this cell can be re-executed
# (creating an index that already exists raises an error) and so the indexing
# loop below does not add duplicate documents. `ignore_unavailable=True` makes
# the delete a no-op when the index does not exist yet.
es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)


ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [24]:
# Index the documents one at a time, with a progress bar over the list.
for doc in notebook_tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:04<00:00, 220.63it/s]


In [25]:
# Example question for the Elasticsearch-backed pipeline (rebinds `query`).
query = 'how do i run docker?'

In [26]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index for documents relevant to ``query``.

    Args:
        query: Free-text user question.
        course: Exact value the ``course`` keyword field must have. Defaults
            to "data-engineering-zoomcamp", the value that used to be
            hard-coded.
        num_results: Maximum number of hits to request. Defaults to 5, the
            value that used to be hard-coded.

    Returns:
        A list with the ``_source`` dict of every returned hit, in the order
        Elasticsearch returned them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" gives matches in the question field more weight.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict hits to documents of the requested course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [27]:
# Inspect the raw documents retrieved for the example question.
elastic_search(query)

[{'text': "Answer: To run the provided code, ensure that the 'dlt[duckdb]' package is installed. You can do this by executing the provided installation command: !pip install dlt[duckdb]. If you’re doing it locally, be sure to also have duckdb pip installed (even before the duckdb package is loaded).",
  'section': 'Workshop 1 - dlthub',
  'question': 'How do I install the necessary dependencies to run the code?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'After you create a GitHub account, you should clone the course repo to your local machine using the process outlined in this video: Git for Everybody: How to Clone a Repository from GitHub\nHaving this local repository on your computer will make it easy for you to access the instructors’ code and make pull requests (if you want to add your own notes or make changes to the course content).\nYou will probably also create your own repositories that host your notes, versions of your file, to do this. Here is a great tutorial tha

In [28]:
def rag2(query):
    """Answer ``query`` with retrieval-augmented generation over Elasticsearch.

    Same pipeline as ``rag``, but the documents come from ``elastic_search``
    instead of the minsearch index.
    """
    retrieved_docs = elastic_search(query)
    rag_prompt = build_prompt(query, retrieved_docs)
    return llm(rag_prompt)

In [29]:
# Run the Elasticsearch-based pipeline on the example question.
rag2(query)

'To run Docker, you need to follow these steps:\n\n1. **Install Docker**: First, ensure that Docker is installed on your system. You can download and install Docker from the official Docker website or use a package manager specific to your operating system.\n\n2. **Run Docker Containers**: Once Docker is installed, you can run Docker containers using the `docker run` command. For example:\n   ```sh\n   docker run <image_name>\n   ```\n\n3. **Check Docker Installation**: To verify that Docker is installed correctly, you can run:\n   ```sh\n   docker --version\n   ```\n\n4. **Pull Docker Image**: If you need a specific Docker image, you can pull it from Docker Hub using:\n   ```sh\n   docker pull <image_name>\n   ```\n\n5. **Build Docker Image**: If you have a Dockerfile, you can build a Docker image using:\n   ```sh\n   docker build -t <image_name> .\n   ```\n\n6. **Run Docker Container from Image**: After pulling or building an image, you can run a container from it using:\n   ```sh\n 