In [1]:
import os
os.environ["SSL_CERT_FILE"] = "Fortinet_CA_SSL(15).cer"


In [2]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[19]

{'text': 'Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]\nBut Python 3.10 and 3.11 should work fine.',
 'section': 'General course-related questions',
 'question': 'Environment - Is Python 3.9 still the recommended version to use in 2024?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
# # create an id
# # this isn't the best idea as when update the file that will change the ids 

# n = len(documents)

# for i in range(n):
#     documents[i]["id"] = i

In [5]:
documents[19]

{'text': 'Yes, for simplicity (of troubleshooting against the recorded videos) and stability. [source]\nBut Python 3.10 and 3.11 should work fine.',
 'section': 'General course-related questions',
 'question': 'Environment - Is Python 3.9 still the recommended version to use in 2024?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# The issue of this method if someone changed the text that will change the id too
import hashlib

def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

In [7]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [8]:
from collections import defaultdict

In [9]:
hashes = defaultdict(list)

for doc in documents:
    doc_id = doc['id']
    hashes[doc_id].append(doc)

In [10]:
len(hashes), len(documents)
# they should produce the same length 

(947, 948)

In [11]:
# to get the dublication id 
for k, values in hashes.items():
    if len(values) > 1:
        print(k, len(values))

593f7569 2


In [12]:
hashes['593f7569']

#they start with the same text 

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [13]:
import json

In [14]:
# save the output to json 

with open('documents_with_ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [15]:
!head documents_with_ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


### Prepare user questions

In [16]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks like:

["question1", "question2", ..., "question5"]
""".strip()

In [17]:
from mistralai import Mistral
from mistralai.models import UserMessage

In [18]:
from dotenv import load_dotenv

In [19]:
load_dotenv()

True

In [20]:
api_key = os.getenv("API_KEY")

In [21]:
client = Mistral(api_key = api_key, )

In [22]:
doc = documents[2]
prompt = prompt_template.format(**doc)

In [23]:
print(prompt)

You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: General course-related questions
question: Course - Can I still join the course after the start date?
answer: Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.

Provide the output in parsable JSON without using code blocks like:

["question1", "question2", ..., "question5"]


In [25]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)
    
    response = client.chat.complete(
        model = "open-mistral-7b", 
        messages = [UserMessage(content=prompt)]
    )
    json_response = response.choices[0].message.content
    return json_response

In [26]:
from tqdm.auto import tqdm

In [27]:
results = {}

for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id in results:
        continue 
        
    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [28]:
results

{'c02e79ef': '[\n  "What is the start date and time for the course?",\n  "How can I subscribe to the course\'s public Google Calendar?",\n  "Where can I register for the course before it starts?",\n  "Where can I find updates and announcements related to the course?",\n  "How can I join the course\'s community on Slack and access the relevant channel?"\n]',
 '1f6520ca': '[\n  "What are the prerequisites for enrolling in this course?",\n  "Are there any specific requirements before starting this course?",\n  "What knowledge or skills are necessary to take this course?",\n  "What should I have learned or be familiar with before taking this course?",\n  "What are the prerequisites that I need to meet to join this course?"\n]',
 '7842b56a': '[\n  {\n    "question": "Is it possible to join the course after the start date?",\n    "answer": "Yes, you can still join the course. However, you\'re still eligible to submit homeworks, and there will be deadlines for final projects, so it\'s best no

In [31]:
results['c02e79ef']

'[\n  "What is the start date and time for the course?",\n  "How can I subscribe to the course\'s public Google Calendar?",\n  "Where can I register for the course before it starts?",\n  "Where can I find updates and announcements related to the course?",\n  "How can I join the course\'s community on Slack and access the relevant channel?"\n]'

In [29]:
import pickle

In [32]:
with open('results.bin', 'wb') as f_out:
    pickle.dump(results, f_out)

In [33]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)


In [35]:
results['c02e79ef']

'[\n  "What is the start date and time for the course?",\n  "How can I subscribe to the course\'s public Google Calendar?",\n  "Where can I register for the course before it starts?",\n  "Where can I find updates and announcements related to the course?",\n  "How can I join the course\'s community on Slack and access the relevant channel?"\n]'

In [196]:
results

{'c02e79ef': '[\n  "What is the start date and time for the course?",\n  "How can I subscribe to the course\'s public Google Calendar?",\n  "Where can I register for the course before it starts?",\n  "Where can I find updates and announcements related to the course?",\n  "How can I join the course\'s community on Slack and access the relevant channel?"\n]',
 '1f6520ca': '[\n  "What are the prerequisites for enrolling in this course?",\n  "Are there any specific requirements before starting this course?",\n  "What knowledge or skills are necessary to take this course?",\n  "What should I have learned or be familiar with before taking this course?",\n  "What are the prerequisites that I need to meet to join this course?"\n]',
 '7842b56a': '[\n  {\n    "question": "Is it possible to join the course after the start date?",\n    "answer": "Yes, you can still join the course. However, you\'re still eligible to submit homeworks, and there will be deadlines for final projects, so it\'s best no

In [480]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    parsed_resulst[doc_id] = json.loads(json_questions)

In [481]:
doc_index = {d['id']: d for d in documents}

In [482]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [483]:
import pandas as pd

In [484]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [485]:
df.to_csv('ground-truth-data.csv', index=False)

In [486]:
!head ground-truth-data.csv

question,course,document
What is the start date and time for the course?,data-engineering-zoomcamp,c02e79ef
How can I subscribe to the course's public Google Calendar?,data-engineering-zoomcamp,c02e79ef
Where can I register for the course before it starts?,data-engineering-zoomcamp,c02e79ef
Where can I find updates and announcements related to the course?,data-engineering-zoomcamp,c02e79ef
How can I join the course's community on Slack and access the relevant channel?,data-engineering-zoomcamp,c02e79ef
What are the prerequisites for enrolling in this course?,data-engineering-zoomcamp,1f6520ca
Are there any specific requirements before starting this course?,data-engineering-zoomcamp,1f6520ca
What knowledge or skills are necessary to take this course?,data-engineering-zoomcamp,1f6520ca
What should I have learned or be familiar with before taking this course?,data-engineering-zoomcamp,1f6520ca


# Evaluation of Text Retrieval Techniques for RAG out

In [493]:
with open('documents_with_ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [494]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200') 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

es_client.indices.delete(index=index_name, ignore_unavailable=True)
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [495]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [496]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)

    result_docs=[]
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])

    return result_docs

In [497]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp',
  'id': '7842b56a'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
  'section': 'General course-related questions',
  'question': 'Course - What can I do before the course starts?',
  'course': 'data-engineering-zoomcamp',
  'id': '63394d91'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it fin

In [499]:
import pandas as pd

In [500]:
df_ground_truth=pd.read_csv('ground-truth-data.csv')

In [501]:
df_ground_truth

Unnamed: 0,question,course,document
0,What is the start date and time for the course?,data-engineering-zoomcamp,c02e79ef
1,How can I subscribe to the course's public Goo...,data-engineering-zoomcamp,c02e79ef
2,Where can I register for the course before it ...,data-engineering-zoomcamp,c02e79ef
3,Where can I find updates and announcements rel...,data-engineering-zoomcamp,c02e79ef
4,How can I join the course's community on Slack...,data-engineering-zoomcamp,c02e79ef
...,...,...,...
4728,In what way can I remove infrastructure that w...,mlops-zoomcamp,886d1617
4729,How should I initialize Terraform for destroyi...,mlops-zoomcamp,886d1617
4730,What is the configuration I should use for Ter...,mlops-zoomcamp,886d1617
4731,What file should I use to provide variables fo...,mlops-zoomcamp,886d1617


In [503]:
ground_truth = df_ground_truth.to_dict(orient='records')

In [515]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = elastic_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4733 [00:00<?, ?it/s]

- hit-rate(recall)
- Mean Reciprocal Rank (MRR)

## hit-rate(recall)

In [526]:
def hit_rate(relevance_total):
    cnt = 0
    for line in relevance_total:
        if True in line:
            cnt = cnt+1

    return cnt/len(relevance_total)
        

In [528]:
hit_rate(relevance_total)

0.7796323684766533

## MRR

In [534]:
def mrr(relevance_total):
    total_score = 0.0
    
    for line in relevance_total:
        for rank in range(len(line)):
            if line[rank] == True:
                total_score = total_score + 1 / (rank + 1) ## starts from zerro

    return total_score /len(relevance_total) 


In [535]:
mrr(relevance_total)

0.6662969223184737

### use minsearch

In [536]:
import minsearch

index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course", "id"]
)

index.fit(documents)

<minsearch.minsearch.Index at 0x7651ca2c57c0>

In [537]:
def minsearch_search(query, course):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=5
    )

    return results

In [538]:
relevance_total = []

for q in tqdm(ground_truth):
    doc_id = q['document']
    results = minsearch_search(query=q['question'], course=q['course'])
    relevance = [d['id'] == doc_id for d in results]
    relevance_total.append(relevance)

  0%|          | 0/4733 [00:00<?, ?it/s]

In [539]:
hit_rate(relevance_total), mrr(relevance_total)

(0.818719628142827, 0.7250158461863515)

Compare with ES results:
```
(0.7796323684766533, 0.6662969223184737)
   ```     

create a function that can be used for both elastic search and minsearch

In [541]:
def evaluate(ground_truth, search_function):
    relevance_total = []

    for q in tqdm(ground_truth):
        doc_id = q['document']
        results = search_function(q)
        relevance = [d['id'] == doc_id for d in results]
        relevance_total.append(relevance)

    return{
        'hit_rate': hit_rate(relevance_total), 
        'mrr': mrr(relevance_total),
    
    }

In [542]:
evaluate(ground_truth, lambda q: elastic_search(q['question'], q['course']))

  0%|          | 0/4733 [00:00<?, ?it/s]

{'hit_rate': 0.7796323684766533, 'mrr': 0.6662969223184737}

In [543]:
evaluate(ground_truth, lambda q: minsearch_search(q['question'], q['course']))

  0%|          | 0/4733 [00:00<?, ?it/s]

{'hit_rate': 0.818719628142827, 'mrr': 0.7250158461863515}