In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Bound the wait and fail loudly on HTTP errors, so an error page is never
# handed to the JSON decoder and a stalled connection cannot hang the cell.
docs_response = requests.get(docs_url, timeout=30)
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course grouping into one list of FAQ records, tagging each
# record (in place) with the name of the course it came from.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [3]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [4]:
import hashlib

def generate_document_id(doc):
    """Derive a short, deterministic id for a FAQ record.

    The id is the first 8 hex characters of the MD5 digest of
    "<course>-<question>-<first 10 characters of text>". Records that agree
    on all three of those parts therefore receive the same id, even when the
    rest of their text differs.
    """
    fingerprint = "{}-{}-{}".format(doc['course'], doc['question'], doc['text'][:10])
    digest = hashlib.md5(fingerprint.encode()).hexdigest()
    return digest[:8]

In [5]:
# Attach an 'id' key to every record in place. Re-running this cell yields the
# same ids, because the id is computed from course/question/text only.
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [6]:
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp',
 'id': '1f6520ca'}

In [7]:
from collections import defaultdict

In [8]:
# Bucket the records by id; any bucket holding more than one record is an id collision.
hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

In [9]:
len(hashes), len(documents)

(947, 948)

In [10]:
# Print each id shared by more than one record, with the number of records sharing it.
for k, values in hashes.items():
    if len(values) < 2:
        continue
    print(k, len(values))

593f7569 2


In [11]:
hashes['593f7569']

[{'text': "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'},
 {'text': "They both do the same, it's just less typing from the script.",
  'section': '6. Decision Trees and Ensemble Learning',
  'question': 'Does it matter if we let the Python file create the server or if we run gunicorn directly?',
  'course': 'machine-learning-zoomcamp',
  'id': '593f7569'}]

In [12]:
# Prompt used to turn one FAQ record into five student-style questions.
# It is filled with str.format(**doc), which reads the {section}, {question}
# and {text} keys; other keys on the record (course, id) are ignored.
# NOTE(review): "as fewer words" looks like a typo for "as few words"; it is
# left untouched here because editing the prompt changes what the model sees.
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [13]:
from groq import Groq

In [14]:
# No credentials are passed explicitly — presumably the Groq client picks the
# API key up from the environment; confirm before running on a fresh machine.
client = Groq() 

In [23]:
# Choose the sample record explicitly. Previously this cell used whatever
# `doc` was left behind by an earlier loop, which under a top-to-bottom run is
# the last record in `documents`.
doc = documents[-1]
prompt = prompt_template.format(**doc)

In [24]:
# Send the filled-in prompt as a single user message.
response = client.chat.completions.create(
        model='qwen/qwen3-32b',
        messages=[{"role": "user", "content": prompt}]
    )

# Keep the raw reply text. As the displayed output shows, it starts with a
# <think> reasoning section before the JSON array of questions.
json_response = response.choices[0].message.content
json_response

'<think>\nOkay, let\'s see. I need to create five questions that a student might ask based on the provided FAQ record. The record is about destroying infrastructure in AWS using Terraform via GitHub Actions.\n\nFirst, the original question is about destroying infrastructure, but the solution involves using Terraform commands locally. So maybe a student could ask how to do that. The answer mentions terraform init and terraform destroy with specific flags. \n\nI should rephrase the original question into different wording. Maybe start with "How do I remove..." instead of "How to destroy...". Then check if the answer is covered. Also, need to ensure the questions use as few words from the record as possible.\n\nAnother angle: why use the --reconfigure flag in the terraform init command? That\'s part of the solution. So a question about the purpose of the --reconfigure option when initializing Terraform.\n\nAlso, the -backend-config option is used. A student might ask why they\'re specifyi

In [30]:
import re, json

In [34]:
# Greedy match from the first '[' to the last ']' in the reply (DOTALL lets it
# span newlines). `match` is None when the reply contains no bracketed span,
# and any '[' inside the reasoning text would widen the match beyond the array.
match = re.search(r'\[.*\]', json_response, re.DOTALL)

In [33]:
json.loads(match.group())

['How do I remove AWS infrastructure provisions from the GitHub Actions deployment?',
 'What Terraform command sequence should I use to eliminate resources set up via the CD-Deploy Action?',
 'Could you explain why the --reconfigure flag is included during Terraform initialization before deletion?',
 'Which specific variable configuration file should be referenced when executing terraform destroy for this module?',
 'Is it necessary to run the infrastructure teardown process locally instead of through GitHub Actions?']

In [36]:
match.group()

'["How do I remove AWS infrastructure provisions from the GitHub Actions deployment?", "What Terraform command sequence should I use to eliminate resources set up via the CD-Deploy Action?", "Could you explain why the --reconfigure flag is included during Terraform initialization before deletion?", "Which specific variable configuration file should be referenced when executing terraform destroy for this module?", "Is it necessary to run the infrastructure teardown process locally instead of through GitHub Actions?"]'

In [41]:
def extract_question_list(reply_text):
    """Return the last JSON array of strings found in an LLM reply.

    Args:
        reply_text: Raw text of the model reply; may contain a
            <think>...</think> reasoning section and prose around the array.

    Returns:
        The decoded list of strings.

    Raises:
        ValueError: If the reply has no '[' at all, or no '[' starts a JSON
            array consisting only of strings.
    """
    # Drop the reasoning section first, so bracketed text inside it can never
    # be mistaken for the answer.
    answer_text = re.sub(r'<think>.*?</think>', '', reply_text, flags=re.DOTALL)

    starts = [m.start() for m in re.finditer(r'\[', answer_text)]
    if not starts:
        raise ValueError("No JSON list found in response")

    # Let the JSON decoder find where each candidate array ends. A regex cannot
    # tell a closing ']' from a ']' that appears inside a question string.
    decoder = json.JSONDecoder()
    for start in reversed(starts):
        try:
            candidate, _ = decoder.raw_decode(answer_text, start)
        except json.JSONDecodeError:
            continue
        # Skip things like a stray "[1]" footnote; only a list of strings is an answer.
        if isinstance(candidate, list) and all(isinstance(item, str) for item in candidate):
            return candidate

    raise ValueError("Could not parse a valid JSON array from response")


def generate_questions(doc, model='qwen/qwen3-32b'):
    """Ask the LLM for student-style questions about one FAQ record.

    Args:
        doc: FAQ record; must provide the keys used by `prompt_template`
            (section, question, text).
        model: Name of the chat model to query.

    Returns:
        The list of question strings parsed from the model reply.

    Raises:
        ValueError: If no JSON array of strings can be found in the reply.
    """
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    # Defensive: if the API ever returns no content, fall through to the
    # ValueError below rather than failing with a TypeError inside `re`.
    reply_text = response.choices[0].message.content or ''
    return extract_question_list(reply_text)

In [42]:
# Rebind `doc` to a fixed sample record for a single trial call of generate_questions.
doc = documents[2]
doc

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp',
 'id': '7842b56a'}

In [43]:
generate_questions(doc)

['Is there a deadline to join the course after it has started?',
 'Can homework be submitted without registering for the course?',
 "What's the deadline for submitting the final project if I join late?",
 'Do I need to register to complete and submit homework assignments?',
 'What happens if I register for the course after the initial start date?']

In [20]:
from tqdm.auto import tqdm

In [44]:
# Maps document id -> list of generated questions. Re-running this cell
# discards everything the generation loop has collected so far.
results = {}

In [45]:
# Resumable pass over all records: ids already in `results` are skipped, so the
# cell can be re-run after an interruption without repeating finished calls.
for doc in tqdm(documents):
    doc_id = doc['id']
    if doc_id not in results:
        results[doc_id] = generate_questions(doc)

  0%|          | 0/948 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [57]:
results['7842b56a']

['Is it possible to enroll in the course after the official start date?',
 "Can I submit homework assignments if I didn't register on time?",
 'Are there strict deadlines for the final project even if I join late?',
 'If I start the course late, am I still required to complete all homework assignments?',
 "Will my final project deadline be the same as everyone else's if I join after the course starts?"]

In [51]:
# Lookup table from document id to its record. When two records share an id,
# the one that appears later in `documents` is the one kept.
doc_index = {d['id']: d for d in documents}

In [53]:
# Flatten {doc_id: [questions]} into (question, course, doc_id) rows.
final_results = []

for doc_id, questions in results.items():
    course = doc_index[doc_id]['course']
    final_results.extend((q, course, doc_id) for q in questions)

In [55]:
import pandas as pd

In [56]:
# One row per generated question; the 'document' column holds the id of the
# FAQ record the question was generated from.
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])
df.head()

Unnamed: 0,question,course,document
0,When is the exact date and time for the course...,data-engineering-zoomcamp,c02e79ef
1,Where can I find the official calendar for cou...,data-engineering-zoomcamp,c02e79ef
2,What steps are required to register for the up...,data-engineering-zoomcamp,c02e79ef
3,How do I join the live session for the initial...,data-engineering-zoomcamp,c02e79ef
4,Is there a specific communication platform for...,data-engineering-zoomcamp,c02e79ef


In [None]:
# Uncomment to write the generated ground-truth questions to disk.
#df.to_csv('ground-truth-data.csv', index=False)