If scraped articles lack headers. chunking might be a problem.
We try to promote text spans to headers by some static rules.  

In [2]:
import re
from typing import List, Tuple

class HeaderDetector:
    """
    Detects headings in poorly-structured markdown and converts them to proper headers.
    """
    
    def __init__(self):
        # Common heading keywords that appear at start of sections
        self.heading_keywords = {
            'introduction', 'overview', 'background', 'summary', 'conclusion',
            'abstract', 'methods', 'results', 'discussion', 'references',
            'symptoms', 'treatment', 'diagnosis', 'causes', 'prevention',
            'what is', 'how to', 'why', 'when', 'where'
        }
    
    def is_likely_heading(self, line: str, next_line: str = "") -> Tuple[bool, int]:
        """
        Determines if a line is likely a heading and returns (is_heading, level).
        
        Args:
            line: Current line to check
            next_line: Following line for context
            
        Returns:
            (is_heading: bool, level: int) where level is 1-3
        """
        stripped = line.strip()
        
        # Skip empty lines
        if not stripped:
            return False, 0
        
        # Already a markdown header
        if re.match(r'^#{1,6}\s', stripped):
            return False, 0
        
        # Pattern 1: Bold text on its own line (common in HTML → MD conversion)
        # **Text** or __Text__
        bold_match = re.match(r'^\*\*(.+?)\*\*$|^__(.+?)__$', stripped)
        if bold_match:
            text = bold_match.group(1) or bold_match.group(2)
            if len(text.split()) <= 10:  # Headings are usually short
                return True, 2
        
        # Pattern 2: ALL CAPS (but not single words or very long)
        if stripped.isupper() and 2 <= len(stripped.split()) <= 12:
            # Avoid false positives like "COVID-19" or "USA"
            if not re.match(r'^[A-Z0-9\-]+$', stripped.replace(' ', '')):
                return True, 2
        
        # Pattern 3: Short line (< 60 chars) followed by longer content
        if len(stripped) < 60 and len(next_line.strip()) > 60:
            # Check if it ends with punctuation (likely not a heading)
            if not stripped.endswith(('.', ',', ';', ':', '!', '?')):
                # Contains heading keywords
                if any(keyword in stripped.lower() for keyword in self.heading_keywords):
                    return True, 2
                # Starts with number pattern like "1.", "1.1", "Section 1"
                if re.match(r'^\d+\.?\d*\.?\s+[A-Z]', stripped):
                    return True, 2
        
        # Pattern 4: Numbered sections (1., 1.1, etc.)
        if re.match(r'^\d+\.\s+[A-Z]', stripped) and len(stripped) < 80:
            return True, 2
        
        # Pattern 5: Question format headings
        if stripped.endswith('?') and len(stripped.split()) <= 12:
            if stripped[0].isupper():
                return True, 3
        
        # Pattern 6: Starts with common heading words and is title case
        words = stripped.split()
        if len(words) >= 2 and len(words) <= 10:
            first_word = words[0].lower().rstrip(':')
            if first_word in self.heading_keywords:
                # Check if title case (most words capitalized)
                title_case_count = sum(1 for w in words if w[0].isupper())
                if title_case_count >= len(words) * 0.6:
                    return True, 2
        
        return False, 0
    
    def process_markdown(self, content: str) -> str:
        """
        Process markdown content and add proper headers where detected.
        
        Args:
            content: Raw markdown content
            
        Returns:
            Processed markdown with proper headers
        """
        lines = content.split('\n')
        processed_lines = []
        i = 0
        
        while i < len(lines):
            line = lines[i]
            next_line = lines[i + 1] if i + 1 < len(lines) else ""
            
            is_heading, level = self.is_likely_heading(line, next_line)
            
            if is_heading:
                # Remove bold formatting if present
                cleaned = re.sub(r'^\*\*(.+?)\*\*$|^__(.+?)__$', r'\1\2', line.strip())
                # Convert to proper markdown header
                header = '#' * level + ' ' + cleaned
                processed_lines.append(header)
            else:
                processed_lines.append(line)
            
            i += 1
        
        return '\n'.join(processed_lines)
    
    def process_file(self, input_path: str, output_path: str = None):
        """
        Process a markdown file and optionally save to a new file.
        
        Args:
            input_path: Path to input markdown file
            output_path: Path to output file (if None, overwrites input)
        """
        with open(input_path, 'r', encoding='utf-8') as f:
            content = f.read()
        
        processed = self.process_markdown(content)
        
        output_path = output_path or input_path
        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(processed)
        
        return processed


# Example usage
if __name__ == "__main__":
    detector = HeaderDetector()
    
    # Test with sample content
    sample_md = """---
title: "Sample Document"
---

**Introduction**

This is some introductory text that goes on for a while and explains
the basic concepts of the document.

METHODS AND PROCEDURES

Here we describe the methods used in the study. This section contains
detailed information about how we conducted the research.

What are the methods used?

Here we describe the methods used in the study. This section contains
detailed information about how we conducted the research.

**What are the symptoms?**

The main symptoms include fever, cough, and fatigue. These can vary
in severity depending on the individual case.

1. Treatment Options

There are several treatment approaches available:
- Medication
- Physical therapy
- Surgery

Conclusion

This document provides a comprehensive overview of the topic.
"""
    
    processed = detector.process_markdown(sample_md)
    print(processed)

---
title: "Sample Document"
---

## Introduction

This is some introductory text that goes on for a while and explains
the basic concepts of the document.

## METHODS AND PROCEDURES

Here we describe the methods used in the study. This section contains
detailed information about how we conducted the research.

### What are the methods used?

Here we describe the methods used in the study. This section contains
detailed information about how we conducted the research.

## What are the symptoms?

The main symptoms include fever, cough, and fatigue. These can vary
in severity depending on the individual case.

## 1. Treatment Options

There are several treatment approaches available:
- Medication
- Physical therapy
- Surgery

Conclusion

This document provides a comprehensive overview of the topic.

