In [8]:
import io
import requests
import docx

In [23]:
def clean_line(line):
    """
    Clean a line of text by strpping leading/training whitespace and specific unwanted characters.
    """
    line = line.strip()
    line = line.strip('\u001b') # Remove BOM if present
    return line

In [24]:
def read_faq(file_id):
    """
    Reads a Google Docs exported as DOCX and extracts FAQ sections and questions.

    Args:
    file_id (str): The ID of the Google Docs document to be exported as DOCX.

    Returns:
    List[Dict]: A list of dictionaries, each containing a section, question, and answer
    """

    # Build the URL for downloading the DOCX file
    url = f'https://docs.google.com/document/d/{file_id}/export?format=docx'

    # Download the DOCX file
    response = requests.get(url)
    response.raise_for_status() # Raise an exception for HTTP errors

    # Open the DOCX file
    with io.BytesIO(response.content) as f_in:
        doc = docx.Document(f_in)

    # Initialize variables for storing the extracted data
    questions = []
    section_heading_style = 'heading 1'
    question_heading_style = 'heading 2'

    answer_text_so_far = ''
    question_title = ''
    section_title = ''

    # Iterate through each paragraph in the document
    for p in doc.paragraphs:
        p_style = p.style.name.lower()
        p_text = p.text

        if len(p_text) == 0:
            continue

        if p_style == section_heading_style:
            section_title = p_text    

        if p_style == question_heading_style:
            # if there is an ongoing answer, save it before starting a new question
            answer_text_so_far = answer_text_so_far.strip()
            if answer_text_so_far != '' and section_title != '' and question_title != '':
                questions.append({
                    'section': section_title,
                    'question': question_title,
                    'text': answer_text_so_far
                })
                answer_text_so_far = ''

            question_title = p_text
            continue
                
        # Accumulate answer text
        answer_text_so_far += '\n' + p_text
        
    # Save the last accumulated answer
    answer_text_so_far = answer_text_so_far.strip()
    if answer_text_so_far != '' and section_title != '' and question_title != '':
        questions.append({
            'section': section_title,
            'question': question_title,
            'text': answer_text_so_far
        })

    return questions

In [25]:
faq_documents = {
    'data-engineering-zoomcamp': '19bnYs80DwuUimHM65UV3sylsCn2j1vziPOwzBwQrebw',
    'machine-learning-zoomcamp': '1LpPanc33QJJ6BSsyxVg-pWNMplal84TdZtq10naIhD8',
    'mlops-zoomcamp': '12TlBfhIiKtyBv8RnsoJR6F72bkPDGEvPOItJIxaEzE0',
}

In [26]:
documents = []
for course, file_id in faq_documents.items():
    print(course)
    course_documets = read_faq(file_id)
    documents.append({'course': course, 'documents': course_documets})

data-engineering-zoomcamp
machine-learning-zoomcamp
mlops-zoomcamp


In [27]:
import json

In [28]:
with open('documents.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [29]:
!head documents.json

[
  {
    "course": "data-engineering-zoomcamp",
    "documents": [
      {
        "section": "General course-related questions",
        "question": "Course - When will the course start\uff1f",
        "text": "The purpose of this document is to capture frequently asked technical questions\nGeneral course-related questions\nThe next cohort starts in Jan 2025. More info at DTC Article.\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel."
      },
      {
