# Prepare the OpenAI client object

In [1]:
from openai import OpenAI

In [2]:
from dotenv import load_dotenv
import os

load_dotenv()

True

In [3]:
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))

In [4]:
client

<openai.OpenAI at 0x7f1ad567c730>

# Prepare documents with IDs for GT generation

In [5]:
import json

In [6]:
# Load the raw FAQ export. JSON text is expected to be UTF-8, so the encoding is
# given explicitly instead of relying on the platform default, which on some
# systems is not UTF-8 and would mis-decode (or fail on) non-ASCII characters.
with open("documents.json", "rt", encoding="utf-8") as f_in:
    docs_raw = json.load(f_in)

In [7]:
# Flatten the per-course structure into a single list of FAQ entries. Each
# entry is tagged in place with the course it was listed under.
documents = []

for course_dict in docs_raw:
    course_docs = course_dict["documents"]
    for doc in course_docs:
        doc["course"] = course_dict["course"]
    documents.extend(course_docs)

In [8]:
import hashlib


def generate_document_id(doc):
    """Return a short, deterministic identifier for a FAQ entry.

    The identifier is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>". Entries that agree
    on all three parts therefore receive the same identifier.
    """
    key = "{}-{}-{}".format(doc["course"], doc["question"], doc["text"][:10])
    digest = hashlib.md5(key.encode()).hexdigest()
    return digest[:8]

In [9]:
# Attach the generated identifier to every entry under the "id" key.
for doc in documents:
    doc.update(id=generate_document_id(doc))

In [10]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

In [11]:
from collections import defaultdict

In [12]:
# Group the entries by id so that ids shared by several entries can be spotted.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc["id"]].append(doc)

In [13]:
len(hashes), len(documents)

(947, 948)

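The two counts differ (947 distinct IDs for 948 documents), which means the generated IDs are not all unique; in principle a different ID scheme should be used. In this case the collision comes from a genuinely duplicated FAQ entry (same course, same question, and answers that start with the same 10 characters), so we can leave it as it is.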

In [14]:
# Report every id that is shared by more than one entry, with the group size.
for hash_id, group in hashes.items():
    group_size = len(group)
    if group_size > 1:
        print(hash_id, group_size)

593f7569 2


In [15]:
hashes["593f7569"]

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [16]:
# Persist the entries, now carrying their ids, as JSON indented by 2 spaces.
with open("documents-with-ids.json", "wt") as out_file:
    out_file.write(json.dumps(documents, indent=2))

In [17]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


# Generate ground truth using ChatGPT

In [18]:
# Prompt sent to the chat model for every FAQ record. The {section},
# {question} and {text} placeholders are filled in with str.format, and the
# model is asked to answer with a bare JSON array so that the reply can be fed
# to json.loads.
# NOTE(review): "as fewer words as possible" presumably means "as few words as
# possible"; the wording is left untouched because editing it changes the
# prompt the model receives.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [19]:
def generate_questions(doc):
    """Ask the chat model for student-style questions about one FAQ record.

    The fields of ``doc`` are substituted into ``prompt_template`` (keys the
    template does not mention are ignored by ``str.format``) and the result is
    sent as a single user message to the ``gpt-4o`` model.

    Returns the text content of the first choice exactly as the model produced
    it. The prompt asks for a JSON array, but the reply is neither parsed nor
    validated here.
    """
    messages = [{"role": "user", "content": prompt_template.format(**doc)}]
    completion = client.chat.completions.create(model="gpt-4o", messages=messages)
    return completion.choices[0].message.content

In [21]:
from tqdm.auto import tqdm

In [None]:
# Collect the raw model reply for each distinct document id. Entries whose id
# already has a reply are skipped, so documents sharing an id trigger only one
# API call.
results = {}
for doc in tqdm(documents):
    doc_id = doc["id"]
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [25]:
results["1f6520ca"]

'[\n  "What are the course prerequisites?",\n  "Are there any prerequisites I need to know before starting?",\n  "Where can I find details about course requirements?",\n  "Is there a list of prerequisites available for the course?",\n  "Can you tell me about the prerequisites for enrollment?"\n]'

In [26]:
results["593f7569"]

'[\n  "Is it more efficient to create the server in the Python file or run gunicorn directly?",\n  "What are the benefits of letting the Python script create the server compared to running gunicorn?",\n  "Are there any differences in performance between creating the server in Python file and running gunicorn directly?",\n  "Why might someone choose to let the Python file create the server instead of running gunicorn directly?",\n  "What is a simpler method for starting the server: using the Python script or running gunicorn?"\n]'

In [35]:
len(results)

947

In [36]:
import pickle

# Reload the cached model replies, replacing the in-memory `results`.
# The cache is presumably produced by a one-off dump that is kept disabled:
#     with open('GT_documents.bin', 'wb') as file:
#         pickle.dump(results, file)
# Note: pickle.load can execute arbitrary code, so only load a file you wrote
# yourself.
with open("GT_documents.bin", "rb") as cache_file:
    results = pickle.load(cache_file)

In [43]:
parsed_results = {}

In [57]:
# Decode each raw reply into a Python list. Ids already present in
# `parsed_results` (e.g. from an earlier run of this cell) are skipped.
# The loop variables keep their names on purpose: when json.loads raises on a
# malformed reply, `doc_id` and `json_questions` still refer to the failing
# entry, which is what the manual-correction cells read.
for doc_id, json_questions in results.items():
    if doc_id not in parsed_results:
        parsed_results[doc_id] = json.loads(json_questions)

#### Manually correct malformed JSON returned by the LLM

In [53]:
json_questions

'[\n  "Will I still receive a certificate if I miss a major project?",\n  Is missing the midterm detrimental to getting my certificate?",\n  What happens if I skipped the midterm project but want a certificate?",\n  Can I qualify for a certificate despite not completing the midterm project?",\n  If I didn\'t submit the midterm project, is a certificate still an option?"\n]'

In [55]:
json.loads(
    '[\n  "Will I still receive a certificate if I miss a major project?",\n  "Is missing the midterm detrimental to getting my certificate?",\n  "What happens if I skipped the midterm project but want a certificate?",\n " Can I qualify for a certificate despite not completing the midterm project?",\n  "If I didn\'t submit the midterm project, is a certificate still an option?"\n]'
)

['Will I still receive a certificate if I miss a major project?',
 'Is missing the midterm detrimental to getting my certificate?',
 'What happens if I skipped the midterm project but want a certificate?',
 ' Can I qualify for a certificate despite not completing the midterm project?',
 "If I didn't submit the midterm project, is a certificate still an option?"]

In [56]:
# Replace the malformed reply with a hand-corrected version. `doc_id` is the
# loop variable left over from the parsing loop, so this presumably has to run
# right after that loop stopped on the broken entry.
# The opening quote of the fourth question sits directly before "Can", so the
# parsed question does not start with a stray space.
results[doc_id] = (
    '[\n  "Will I still receive a certificate if I miss a major project?",\n  "Is missing the midterm detrimental to getting my certificate?",\n  "What happens if I skipped the midterm project but want a certificate?",\n  "Can I qualify for a certificate despite not completing the midterm project?",\n  "If I didn\'t submit the midterm project, is a certificate still an option?"\n]'
)

In [58]:
parsed_results

{'c02e79ef': ['What is the starting date and time of the course?',
  "When will the first 'Office Hours' live session be?",
  "How can I subscribe to the course's public Google Calendar?",
  'Where should I register before the course starts?',
  'Which platforms should I join for announcements and discussions?'],
 '1f6520ca': ['What are the course prerequisites?',
  'Are there any prerequisites I need to know before starting?',
  'Where can I find details about course requirements?',
  'Is there a list of prerequisites available for the course?',
  'Can you tell me about the prerequisites for enrollment?'],
 '7842b56a': ['Is it possible to join the course after it has already started?',
  'Can I participate in the course activities if I register late?',
  'Am I still eligible to submit homeworks if I miss the registration deadline?',
  'Are there any deadlines for final projects if I join the course late?',
  'Can I complete the course without registering from the start?'],
 '0bbf41ec'

In [60]:
# Look up entries by id. When several entries share an id, the last one in
# `documents` is the one kept.
doc_index = {doc["id"]: doc for doc in documents}

In [62]:
# One (question, course, document id) row per generated question. The course is
# read from the entry registered under that id.
final_results = []

for doc_id, questions in parsed_results.items():
    course = doc_index[doc_id]["course"]
    final_results.extend((q, course, doc_id) for q in questions)

In [63]:
import pandas as pd

In [64]:
df = pd.DataFrame(final_results, columns=["question", "course", "document"])

In [67]:
df.head(2)

Unnamed: 0,question,course,document
0,What is the starting date and time of the course?,data-engineering-zoomcamp,c02e79ef
1,When will the first 'Office Hours' live sessio...,data-engineering-zoomcamp,c02e79ef


In [65]:
df.to_csv("ground-truth-data.csv", index=False)

In [66]:
!head ground-truth-data.csv

question,course,document
What is the starting date and time of the course?,data-engineering-zoomcamp,c02e79ef
When will the first 'Office Hours' live session be?,data-engineering-zoomcamp,c02e79ef
How can I subscribe to the course's public Google Calendar?,data-engineering-zoomcamp,c02e79ef
Where should I register before the course starts?,data-engineering-zoomcamp,c02e79ef
Which platforms should I join for announcements and discussions?,data-engineering-zoomcamp,c02e79ef
What are the course prerequisites?,data-engineering-zoomcamp,1f6520ca
Are there any prerequisites I need to know before starting?,data-engineering-zoomcamp,1f6520ca
Where can I find details about course requirements?,data-engineering-zoomcamp,1f6520ca
Is there a list of prerequisites available for the course?,data-engineering-zoomcamp,1f6520ca
