## Documentation Preprocessing

### Objective
This component prepares scraped documentation for analysis by transforming Markdown into a structured text format that enhances readability for language models. It focuses on preserving semantic structure while removing elements that could interfere with quality assessment.

### Technical Approach
#### Core Technologies
- **python-frontmatter** (imported as `frontmatter`): For parsing and stripping YAML frontmatter metadata in Markdown files
- **Python-Markdown**: For converting Markdown to HTML
- **BeautifulSoup**: For structured HTML parsing and text extraction
- **Regex**: For pattern-based cleaning and transformation

#### Key Features
1. **Content Cleaning**:
   - Removes YAML frontmatter while preserving content
   - Replaces code blocks with placeholders to maintain context
   - Converts images to descriptive placeholders
   - Strips formatting while preserving semantic meaning

2. **Structure Preservation**:
   - Maintains heading hierarchy with explicit level markers
   - Processes lists with indentation to show nesting
   - Preserves section breaks with visual separators
   - Maintains paragraph structure with proper spacing

3. **Format Optimization**:
   - Converts bold/italic to plain text to reduce noise
   - Removes URLs while keeping link text
   - Eliminates HTML tags for cleaner text analysis
   - Normalizes whitespace and line breaks

### Why This Approach?
- **Analysis Readiness**: Creates a format optimized for LLM processing while preserving document structure
- **Noise Reduction**: Removes non-essential elements that could bias quality assessments
- **Context Preservation**: Maintains technical context through placeholders (code blocks, images)
- **Consistency**: Ensures uniform input format for reliable analysis results


In [6]:
import os
import re
import frontmatter
import markdown
from bs4 import BeautifulSoup
from pathlib import Path

def clean_markdown(content):
    """Strip non-essential Markdown syntax while keeping document structure.

    Steps, in order: drop YAML frontmatter, replace fenced code blocks with
    ``[CODE_BLOCK]``, replace images with ``[IMAGE: alt]``, unwrap ``**bold**``
    and ``*italic*`` markers, reduce links to their link text, strip HTML
    tags, and mark runs of three or more newlines with ``[SECTION_BREAK]``.

    Args:
        content: Raw Markdown text, optionally starting with YAML frontmatter.

    Returns:
        The cleaned Markdown with leading/trailing whitespace stripped.
    """
    # Remove YAML frontmatter. This is best-effort: if parsing fails the text
    # is processed as-is. ``Exception`` (rather than a bare ``except``) keeps
    # KeyboardInterrupt/SystemExit from being swallowed.
    try:
        post = frontmatter.loads(content)
    except Exception:
        pass
    else:
        content = post.content

    # Replace fenced code blocks with a placeholder (DOTALL: blocks span lines).
    content = re.sub(r'```.*?```', '[CODE_BLOCK]', content, flags=re.DOTALL)

    # Replace images with their alt text. This must run before the link rule,
    # which would otherwise consume the "[alt](url)" part of "![alt](url)".
    # Negated character classes keep the match inside one bracket pair.
    content = re.sub(r'!\[([^\]]*)\]\([^)]*\)', r'[IMAGE: \1]', content)

    # Convert bold/italic to plain text.
    content = re.sub(r'\*\*(.*?)\*\*', r'\1', content)
    # An italic delimiter must hug its text: the opening "*" is not followed
    # by whitespace and the closing "*" is not preceded by it. This leaves
    # "* " list bullets and spaced "*" operators untouched.
    content = re.sub(
        r'(?<!\*)\*(?![\s*])(.+?)(?<![\s*])\*(?!\*)', r'\1', content
    )

    # Keep link text but drop the URL. "[^\]]*" stops at the first "]", so a
    # preceding bracketed span such as "[IMAGE: alt]" is not pulled into the
    # match.
    content = re.sub(r'\[([^\]]*)\]\([^)]*\)', r'\1', content)

    # Remove HTML tags
    content = re.sub(r'<[^>]+>', '', content)

    # Mark large vertical gaps as explicit section breaks
    content = re.sub(r'\n{3,}', '\n\n[SECTION_BREAK]\n\n', content)

    return content.strip()

def _render_list_items(list_tag, depth=0):
    """Replace each direct <li> of list_tag with an indented "- text" string.

    Nested <ul>/<ol> elements that are direct children of an item are rendered
    recursively, one indentation level (two spaces) deeper, and appended to
    the parent item's text.
    """
    indent = '  ' * depth
    for li in list_tag.find_all('li', recursive=False):
        # Detach nested lists first so get_text() below returns only this
        # item's own text rather than the flattened text of its sub-items.
        nested_lists = [
            nested.extract()
            for nested in li.find_all(['ul', 'ol'], recursive=False)
        ]
        rendered = f"\n{indent}- {li.get_text().strip()}"
        for nested in nested_lists:
            _render_list_items(nested, depth + 1)
            rendered += nested.get_text()
        li.replace_with(rendered)


def convert_to_structured_text(md_content):
    """Convert cleaned Markdown into plain text with explicit structure markers.

    The Markdown is rendered to HTML, then headings, lists and paragraphs are
    rewritten in the parsed tree before the text is extracted:

    * headings become ``HEADING_<level>: <title>`` followed by a ``=`` rule;
    * each top-level list is wrapped in ``LIST_START_<n>`` / ``LIST_END_<n>``
      and its items become ``- text`` lines, nested items indented by two
      spaces per nesting level;
    * ``[SECTION_BREAK]`` placeholders become a line of 50 dashes.

    Args:
        md_content: Markdown text (typically the output of clean_markdown).

    Returns:
        The structured plain text, stripped of leading/trailing whitespace.
    """
    # Convert to HTML
    html = markdown.markdown(md_content)

    # Parse with BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')

    # Process headings with hierarchy markers
    heading_map = {'h1': 1, 'h2': 2, 'h3': 3, 'h4': 4, 'h5': 5, 'h6': 6}
    for tag, level in heading_map.items():
        for heading in soup.find_all(tag):
            heading.insert_before(f"\n\nHEADING_{level}: ")
            heading.replace_with(f"{heading.text}\n{'=' * (len(heading.text) + 10)}")

    # Process lists with indentation. Only top-level lists get START/END
    # markers and a sequence number; nested lists are rendered as part of
    # their parent item. Processing every <ul>/<ol> independently would
    # replace the outer <li> (and the nested list inside it) before the
    # nested items were ever rendered, flattening them into the parent text.
    top_level_lists = [
        tag for tag in soup.find_all(['ul', 'ol'])
        if tag.find_parent(['ul', 'ol']) is None
    ]
    for list_count, list_tag in enumerate(top_level_lists, start=1):
        list_tag.insert_before(f"\n\nLIST_START_{list_count}")
        _render_list_items(list_tag)
        list_tag.append(f"\nLIST_END_{list_count}")
        list_tag.unwrap()

    # Process paragraphs
    for p in soup.find_all('p'):
        p.insert_before("\n\n")
        p.append("\n")

    # Get text and clean up: collapse excess blank lines, then expand the
    # section-break placeholders into a visual separator.
    text = soup.get_text()
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'\[SECTION_BREAK\]', '\n' + '-'*50 + '\n', text)
    return text.strip()

def preprocess_file(input_file, output_dir):
    """Clean one Markdown file, convert it to structured text, and save it.

    The result is written to ``output_dir`` under the input's base name
    prefixed with ``preprocessed_``. The output directory is created if it
    does not exist, and the saved path is printed.

    Args:
        input_file: Path of the UTF-8 Markdown file to read.
        output_dir: Directory that receives the preprocessed file.

    Returns:
        The path of the written file, as built by ``os.path.join``.
    """
    # Ensure the destination directory exists before reading or writing.
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    raw_markdown = Path(input_file).read_text(encoding='utf-8')
    structured_text = convert_to_structured_text(clean_markdown(raw_markdown))

    # Output keeps the input's base name, with a "preprocessed_" prefix.
    base_name = os.path.basename(input_file)
    output_file = os.path.join(output_dir, f"preprocessed_{base_name}")
    Path(output_file).write_text(structured_text, encoding='utf-8')

    print(f"Preprocessed file saved to: {output_file}")
    return output_file


# Markdown file to preprocess (a relative path, passed to preprocess_file as-is).
INPUT_FILE = "web_sdk_overview.md"  
# Directory that receives the preprocessed output; preprocess_file creates it if missing.
OUTPUT_DIR = "preprocessed_data"  
if __name__ == "__main__":
    # preprocess_file returns the path it wrote and also prints that path
    # itself, so the saved location shows up twice in the output.
    preprocessed_file = preprocess_file(INPUT_FILE, OUTPUT_DIR)
    print(f"Preprocessing complete for: {INPUT_FILE}")
    print(f"Output saved to: {preprocessed_file}")

Preprocessed file saved to: preprocessed_data\preprocessed_web_sdk_overview.md
Preprocessing complete for: web_sdk_overview.md
Output saved to: preprocessed_data\preprocessed_web_sdk_overview.md
