In [2]:
import io
import zipfile
import requests
import frontmatter

In [3]:
def read_repo_data(repo_owner, repo_name):
    base_url = "https://codeload.github.com"
    repo_url = f"{base_url}/{repo_owner}/{repo_name}/zip/refs/heads/main"
    
    resp = requests.get(repo_url)
    if resp.status_code != 200:
        raise Exception(f"Failed to download repository: {resp.status_code}")

    repository_data = []
    with zipfile.ZipFile(io.BytesIO(initial_bytes=resp.content)) as zf:
        for file_info in zf.infolist():
            filename = file_info.filename.lower()
            if not (filename.endswith(".md") or filename.endswith(".mdx")):
                continue
            try:
                with zf.open(file_info) as f_in:
                    content = f_in.read()
                    post = frontmatter.loads(content)
                    data = post.to_dict()
                    data['filename'] = filename
                    repository_data.append(data)
            except Exception as e:
                print(f"Error processing {filename}: {e}")
                continue
        return repository_data

In [4]:
dtc_faq = read_repo_data('DataTalksClub', 'faq')

In [5]:
dtc_faq[2]

{'id': '9e508f2212',
 'question': 'Course: When does the course start?',
 'sort_order': 1,
 'content': "The next cohort starts January 13th, 2025. More info at [DTC](https://datatalks.club/blog/guide-to-free-online-courses-at-datatalks-club.html).\n\n- Register before the course starts using this [link](https://airtable.com/shr6oVXeQvSI5HuWD).\n- Join the [course Telegram channel with announcements](https://t.me/dezoomcamp).\n- Don’t forget to register in DataTalks.Club's Slack and join the channel.",
 'filename': 'faq-main/_questions/data-engineering-zoomcamp/general/001_9e508f2212_course-when-does-the-course-start.md'}

In [6]:
evidently_docs = read_repo_data('evidentlyai', 'docs')

In [7]:
evidently_docs[4]

{'title': 'Product updates',
 'description': 'Latest releases.',
 'content': '<Update label="2025-07-18" description="Evidently v0.7.11">\n  ## **Evidently 0.7.11**\n\n  Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.11).\n\nExample notebooks:\n- Synthetic data generation: [code example](https://github.com/evidentlyai/evidently/blob/main/examples/cookbook/datagen.ipynb)\n\n</Update>\n\n<Update label="2025-07-09" description="Evidently v0.7.10">\n  ## **Evidently 0.7.10**\n    Full release notes on [Github](https://github.com/evidentlyai/evidently/releases/tag/v0.7.10).\n  \nNEW: automated prompt optimization. Read the release blog on [prompt optimization for LLM judges](https://www.evidentlyai.com/blog/llm-judge-prompt-optimization).\n\nExample notebooks:\n- Code review binary LLM judge prompt optimization: [code example](https://github.com/evidentlyai/evidently/blob/main/examples/cookbook/prompt_optimization_code_review_example.ipynb)\n- Topi

In [8]:
evidently_docs[45]

{'title': 'LLM regression testing',
 'description': 'How to run regression testing for LLM outputs.',
 'filename': 'docs-main/examples/llm_regression_testing.mdx'}

In [9]:
def sliding_window(seq, size, step):
    if size <= 0 or step <= 0:
        raise ValueError("size and step must be positive")
    if size <= step:
        raise ValueError("size must be greater than step")
    n = len(seq)
    chunks = []
    for i in range(0, n, step):
        chunk = seq[i: i+size]
        chunks.append({'start': i, 'chunk': chunk})
        if i + size >= n:
            break
    return chunks
        
        

In [10]:
evidently_chunks = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    chunks = sliding_window(doc_content, 2000, 1000)
    for chunk in chunks:
        chunk.update(doc_copy) #adding the rest info about the doc back
    evidently_chunks.extend(chunks)  #bulk method for appending all chunnks

In [12]:
evidently_chunks[0:5]

[{'start': 0,
  'chunk': '<Note>\n  If you\'re not looking to build API reference documentation, you can delete\n  this section by removing the api-reference folder.\n</Note>\n\n## Welcome\n\nThere are two ways to build API documentation: [OpenAPI](https://mintlify.com/docs/api-playground/openapi/setup) and [MDX components](https://mintlify.com/docs/api-playground/mdx/configuration). For the starter kit, we are using the following OpenAPI specification.\n\n<Card\n  title="Plant Store Endpoints"\n  icon="leaf"\n  href="https://github.com/mintlify/starter/blob/main/api-reference/openapi.json"\n>\n  View the OpenAPI specification file\n</Card>\n\n## Authentication\n\nAll API endpoints are authenticated using Bearer tokens and picked up from the specification file.\n\n```json\n"security": [\n  {\n    "bearerAuth": []\n  }\n]\n```',
  'title': 'Introduction',
  'description': 'Example section for showcasing API endpoints',
  'filename': 'docs-main/api-reference/introduction.mdx'},
 {'start'

In [45]:
import re
text = evidently_docs[45]['content']
paragraphs = re.split(r"\n\s*\n", text.strip())

In [47]:
len(paragraphs)

153

In [48]:
import re

def split_markdown_by_level(text, level=2):
    """
    Split markdown text by a specific header level.
    
    :param text: Markdown text as a string
    :param level: Header level to split on
    :return: List of sections as strings
    """
    # This regex matches markdown headers
    # For level 2, it matches lines starting with "## "
    header_pattern = r'^(#{' + str(level) + r'} )(.+)$'
    pattern = re.compile(header_pattern, re.MULTILINE)

    # Split and keep the headers
    parts = pattern.split(text)
    
    sections = []
    for i in range(1, len(parts), 3):
        # We step by 3 because regex.split() with
        # capturing groups returns:
        # [before_match, group1, group2, after_match, ...]
        # here group1 is "## ", group2 is the header text
        header = parts[i] + parts[i+1]  # "## " + "Title"
        header = header.strip()

        # Get the content after this header
        content = ""
        if i+2 < len(parts):
            content = parts[i+2].strip()

        if content:
            section = f'{header}\n\n{content}'
        else:
            section = header
        sections.append(section)
    
    return sections


In [49]:
evidently_chunks_headers = []

for doc in evidently_docs:
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')
    sections = split_markdown_by_level(doc_content, level=2)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks_headers.append(section_doc)


In [51]:
evidently_chunks_headers[46]

{'title': 'Add dashboard panels (UI)',
 'description': 'How to design your Dashboard with custom Panels.',
 'filename': 'docs-main/docs/platform/dashboard_add_panels_ui.mdx',
 'section': '## Adding Panels\n\nYou can add any number of Panels to your Dashboard, including text panels, counters, pie charts, line plots, and bar plots (grouped and stacked). When you create a Panel, you pull the corresponding value from multiple Reports and show it over time or using the specified aggregation (sum, average, last).\n\n<Info>\n  Check the preview and description of each Panel here: [How to add panels via Python API](dashboard_add_panels).\n</Info>\n\n**How to add a Panel:**\n\n- Enter "Edit" mode on the Dashboard (top right corner).\n- Click on the "Add Panel" button next to it.\n- Follow the prompts to configure the panel.\n- Use the preview to review your setup.\n- Click "Save" and select the Tab where you want to add the Panel.\n\nHere is an example of the panel configuration view:\n\n![](/i

In [52]:
from openai import OpenAI

openai_client = OpenAI()


def llm(prompt, model='gpt-4o-mini'):
    messages = [
        {"role": "user", "content": prompt}
    ]

    response = openai_client.responses.create(
        model='gpt-4o-mini',
        input=messages
    )

    return response.output_text

In [53]:
prompt_template = """
Split the provided document into logical sections
that make sense for a Q&A system.

Each section should be self-contained and cover
a specific topic or concept.

<DOCUMENT>
{document}
</DOCUMENT>

Use this format:

## Section Name

Section content with all relevant details

---

## Another Section Name

Another section content

---
""".strip()


In [54]:
def intelligent_chunking(text):
    prompt = prompt_template.format(document=text)
    response = llm(prompt)
    sections = response.split('---')
    sections = [s.strip() for s in sections if s.strip()]
    return sections


In [None]:
from tqdm.auto import tqdm

evidently_chunks = []

for doc in tqdm(evidently_docs[0:5]):
    doc_copy = doc.copy()
    doc_content = doc_copy.pop('content')

    sections = intelligent_chunking(doc_content)
    for section in sections:
        section_doc = doc_copy.copy()
        section_doc['section'] = section
        evidently_chunks.append(section_doc)

In [58]:
print(evidently_chunks[0])

{'title': 'Create Plant', 'openapi': 'POST /plants', 'filename': 'docs-main/api-reference/endpoint/create.mdx', 'section': "It seems you've provided a placeholder document without any actual content. If you could share the specific content of the document, I'd be happy to help you organize it into logical sections suitable for a Q&A system. Please provide the text or information you'd like me to work with!"}
