In [29]:
from mistralai import Mistral
import yaml
import json
import requests 
from tqdm.auto import notebook_tqdm
from elasticsearch import Elasticsearch
import time
from sentence_transformers import SentenceTransformer #embeddings

Concepts related to elastic search:
* documents: collection of fields with its associate values
* index: concept of organizing the documents in "table of contents" so it can be search more efficiently

In this case, what we are indexing is the embeddings, not the documents

In [30]:
docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

# elastic search wants evrything in a single level, the original json has different levels of hierarchy
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

* First, we have to have an id for each document. We can use the heading of any id based on where it is stored, using a number or generate a unique id using the content of the document. id based on the order can change

In [31]:
import hashlib

def generate_document_id(doc):
    combined = f"{doc["course"]}-{doc["question"]}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [32]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [33]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [34]:
from collections import defaultdict #we want to know if there any duplicates in the questions

default dict sirve para manejar diccionarios que por defecto pueden tener claves inexistentes. un diccionario comun devolveria un error, sin emabrog estos lo que hacen es crear una clave nueva cuyo valor es el elemento que se pasa como default_factory en la definición. 

En este caso se usa como una especie de groupby. En mi default dict voy a crear una clave para cada hash cuyo valor es una lista y voy a rellenar dicha lista con los elementos que tengan ese hash. Si hay un has duplicado, es decir una pregunta repetida, la lista tendra mas de un elemento

In [35]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [36]:
len(hashes), len(documents)

(947, 948)

In [37]:
repeated_hashes = []
for k, values in hashes.items():
    if len(values)>1:
        print(k, len(values))
        repeated_hashes.append(k)

593f7569 2


In [38]:
repeated_hashes

['593f7569']

In [39]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [41]:
documents = [doc for doc in documents if doc['id'] not in repeated_hashes]


In [None]:
with open('docs_with_ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)


In [43]:
!head docs_with_ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


Now we are going to use and llm to generate the posibles queries.

In [44]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]

""".strip() # the prompt helps shape the result so has to be specific on how you want it to return it to you

In [45]:
with open(r'/workspaces/llm_search_engine/key.yaml', 'r') as file:
    config = yaml.safe_load(file)


api_key = config['mistral']
model = "mistral-small-latest"
client = Mistral(api_key=api_key)

In [46]:
# test example

doc = documents[5]

prompt = prompt_template.format(**doc)

In [47]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - how many Zoomcamps in a year?
answer: There are 3 Zoom Camps in a year, as of 2024. However, they are for separate courses:
Data-Engineering (Jan - Apr)
MLOps (May - Aug)
Machine Learning (Sep - Jan)
There's only one Data-Engineering Zoomcamp “live” cohort per year, for the certification. Same as for the other Zoomcamps.
They follow pretty much the same schedule for each cohort per zoomcamp. For Data-Engineering it is (generally) from Jan-Apr of the year. If you’re not interested in the Certificate, you can take any zoom camps at any time, at your own pace, out of sync with any “live” cohort.

Provide the output in parsable JSON 

In [48]:
chat_response = client.chat.complete(
    model=model,
    messages=[{"role":"user", "content":prompt}]
)

In [49]:
json_response = chat_response.choices[0].message.content

In [50]:
json.loads(json_response)

['How many Zoom Camps are there in a year and what are their schedules?',
 'Are there multiple live cohorts for the Data-Engineering Zoomcamp in a year?',
 'Can I take the Zoom Camps at my own pace if I am not interested in the certificate?',
 'What is the schedule for the Data-Engineering Zoomcamp?',
 'Are the schedules for each cohort in the Zoom Camps the same?']

In [None]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    chat_response = client.chat.complete(
        model=model,
        messages=[{"role":"user", "content":prompt}]
        )
    json_response = chat_response.choices[0].message.content
    
    return json_response

In [59]:
results = dict()

In [None]:
for doc in notebook_tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc) # limit time excedeed when calling the api, unable to parse the whole thing just three docs
    results[doc_id] = questions

  0%|          | 0/946 [00:00<?, ?it/s]

SDKError: API error occurred: Status 429
{"message":"Requests rate limit exceeded"}

In [69]:
results

{'c02e79ef': '["When is the course registration deadline?", "How can I subscribe to the course Google Calendar?", "What is the starting time of the \'Office Hours\' live session?", "Where can I join the course Telegram channel?", "How do I register in DataTalks.Club\'s Slack and join the relevant channel?"]',
 '1f6520ca': '[\n"What specific knowledge or previous courses are required to enroll in this data engineering course?",\n"What are the prerequisites for the DataTalksClub data engineering Zoomcamp?",\n"What background or prior learning should I have to effectively participate in this course?",\n"What are the prerequisites that I should have fulfilled before joining the GitHub repository for the DataTalksClub data engineering Zoomcamp?",\n"What are the prerequisites for the DataTalksClub data engineering course hosted on Zoom, as mentioned in the record?"\n]',
 '7842b56a': '["If I join the course after the start date, can I still submit the homeworks?","Is there a deadline for turn

In [68]:
results['7842b56a'] = '["If I join the course after the start date, can I still submit the homeworks?","Is there a deadline for turning in the final projects if I join the course after the start date?","Can I participate in the course without registering before the start date?","What happens if I wait until the last minute to submit my final project if I join after the start date?","Are there any benefits or disadvantages to joining the course after the start date?"]'

In [70]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

In [71]:
parsed_resulst

{'c02e79ef': ['When is the course registration deadline?',
  'How can I subscribe to the course Google Calendar?',
  "What is the starting time of the 'Office Hours' live session?",
  'Where can I join the course Telegram channel?',
  "How do I register in DataTalks.Club's Slack and join the relevant channel?"],
 '1f6520ca': ['What specific knowledge or previous courses are required to enroll in this data engineering course?',
  'What are the prerequisites for the DataTalksClub data engineering Zoomcamp?',
  'What background or prior learning should I have to effectively participate in this course?',
  'What are the prerequisites that I should have fulfilled before joining the GitHub repository for the DataTalksClub data engineering Zoomcamp?',
  'What are the prerequisites for the DataTalksClub data engineering course hosted on Zoom, as mentioned in the record?'],
 '7842b56a': ['If I join the course after the start date, can I still submit the homeworks?',
  'Is there a deadline for t

In [72]:
doc_index = {d['id']: d for d in documents}

In [73]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [74]:
import pandas as pd

In [75]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [76]:
df

Unnamed: 0,question,course,document
0,When is the course registration deadline?,data-engineering-zoomcamp,c02e79ef
1,How can I subscribe to the course Google Calen...,data-engineering-zoomcamp,c02e79ef
2,What is the starting time of the 'Office Hours...,data-engineering-zoomcamp,c02e79ef
3,Where can I join the course Telegram channel?,data-engineering-zoomcamp,c02e79ef
4,How do I register in DataTalks.Club's Slack an...,data-engineering-zoomcamp,c02e79ef
5,What specific knowledge or previous courses ar...,data-engineering-zoomcamp,1f6520ca
6,What are the prerequisites for the DataTalksCl...,data-engineering-zoomcamp,1f6520ca
7,What background or prior learning should I hav...,data-engineering-zoomcamp,1f6520ca
8,What are the prerequisites that I should have ...,data-engineering-zoomcamp,1f6520ca
9,What are the prerequisites for the DataTalksCl...,data-engineering-zoomcamp,1f6520ca


I get the example so i dont have to pay or tweak the mistral api

In [82]:
from io import StringIO

In [86]:
docs_url = 'https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/refs/heads/main/03-vector-search/eval/ground-truth-data.csv'
docs_response = requests.get(docs_url)
data = StringIO(docs_response.text)

In [None]:
df = pd.read_csv(data)

In [89]:
df.to_csv('ground_truth_data.csv', index=False)