# PDF to Markdown Converter
## Hybrid Pipeline with Quality Detection & Claude Vision Fallback

This notebook converts PDFs to clean, well-structured Markdown using a cost-efficient hybrid approach:

1. **MarkItDown** - Fast, free initial extraction
2. **Heuristic Quality Checks** - Instant validation of extraction quality
3. **Claude Quality Assessment** - AI-powered quality scoring
4. **Claude Vision Fallback** - Direct PDF analysis for complex documents

---

## Cell 1: Install Dependencies

In [29]:
# Install with all optional dependencies
!pip install -q 'markitdown[all]' anthropic


print("✓ Dependencies installed successfully")

✓ Dependencies installed successfully


In [30]:
# Verify PDF dependencies
import subprocess
result = subprocess.run(["pip", "show", "pdfminer.six"], capture_output=True, text=True)
if "Name: pdfminer.six" in result.stdout:
    print("✓ pdfminer.six installed (required for MarkItDown PDF support)")
else:
    print("✗ pdfminer.six NOT found - installing directly...")
    !pip install -q pdfminer.six
    print("✓ Installed pdfminer.six")

✓ pdfminer.six installed (required for MarkItDown PDF support)


## Cell 2: Configuration & Initialization

In [37]:
import os
import base64
import json
import re
from pathlib import Path
from google.colab import userdata
from anthropic import Anthropic
from markitdown import MarkItDown

# =============================================================================
# CONFIGURATION
# =============================================================================

# API Setup
ANTHROPIC_API_KEY = userdata.get('Claude_Colab')
client = Anthropic(api_key=ANTHROPIC_API_KEY)

# Model Configuration
MODEL = "claude-haiku-4-5-20251001"
MAX_TOKENS = 8192

# Quality Thresholds
QUALITY_THRESHOLDS = {
    "min_words_per_page": 50,      # Minimum words expected per page
    "max_whitespace_ratio": 0.40,   # Maximum whitespace allowed
    "min_quality_score": 7,         # Minimum Claude quality score (1-10)
    "min_headers_for_long_doc": 1,  # Minimum headers for docs > 500 words
}

# File Paths
INPUT_PDF = "/content/input.pdf""
OUTPUT_MD = "/content/output.md"

# Initialize MarkItDown
markitdown = MarkItDown()

print("✓ Configuration loaded")
print(f"  Model: {MODEL}")
print(f"  Input: {INPUT_PDF}")
print(f"  Output: {OUTPUT_MD}")

✓ Configuration loaded
  Model: claude-haiku-4-5-20251001
  Input: /content/pdfs/Ryder-Investor-Overview-June-2024.pdf
  Output: /content/output.md


## Cell 3: Quality Assessment Functions

In [38]:
def heuristic_check(markdown: str, page_count: int = 1) -> dict:
    """
    Perform fast, free heuristic checks on extracted markdown.

    Returns:
        dict: {"passed": bool, "issues": list, "metrics": dict}
    """
    issues = []

    # Metric calculations
    total_chars = len(markdown)
    whitespace_chars = sum(1 for c in markdown if c.isspace())
    whitespace_ratio = whitespace_chars / total_chars if total_chars > 0 else 1

    words = markdown.split()
    word_count = len(words)
    words_per_page = word_count / max(page_count, 1)

    header_count = len(re.findall(r'^#{1,6}\s', markdown, re.MULTILINE))

    # Check for table integrity (pipes with inconsistent columns)
    table_lines = [l for l in markdown.split('\n') if '|' in l]
    table_issues = False
    if table_lines:
        col_counts = [l.count('|') for l in table_lines]
        if col_counts and max(col_counts) - min(col_counts) > 2:
            table_issues = True

    # Check for repeated character artifacts (OCR noise)
    artifact_patterns = [
        r'\.{5,}',      # Multiple dots
        r'\|{3,}',      # Multiple pipes
        r'_{5,}',       # Multiple underscores
        r'\s{10,}',     # Excessive whitespace
    ]
    artifact_count = sum(len(re.findall(p, markdown)) for p in artifact_patterns)

    # Evaluate against thresholds
    if words_per_page < QUALITY_THRESHOLDS["min_words_per_page"]:
        issues.append(f"Low word count: {words_per_page:.0f} words/page (min: {QUALITY_THRESHOLDS['min_words_per_page']})")

    if whitespace_ratio > QUALITY_THRESHOLDS["max_whitespace_ratio"]:
        issues.append(f"High whitespace ratio: {whitespace_ratio:.1%} (max: {QUALITY_THRESHOLDS['max_whitespace_ratio']:.0%})")

    if word_count > 500 and header_count < QUALITY_THRESHOLDS["min_headers_for_long_doc"]:
        issues.append(f"No headers found in long document ({word_count} words)")

    if table_issues:
        issues.append("Inconsistent table structure detected")

    if artifact_count > 10:
        issues.append(f"Excessive extraction artifacts detected ({artifact_count} patterns)")

    metrics = {
        "word_count": word_count,
        "words_per_page": words_per_page,
        "whitespace_ratio": whitespace_ratio,
        "header_count": header_count,
        "artifact_count": artifact_count,
        "table_line_count": len(table_lines),
    }

    return {
        "passed": len(issues) == 0,
        "issues": issues,
        "metrics": metrics
    }


def claude_quality_check(markdown: str) -> dict:
    """
    Use Claude to assess the quality of extracted markdown.

    Returns:
        dict: {"score": int, "issues": list}
    """
    # Truncate if too long to save tokens
    sample = markdown[:8000] if len(markdown) > 8000 else markdown

    prompt = f"""Evaluate this markdown extraction from a PDF. Rate the quality from 1-10 based on:
- Completeness (does it appear to capture all content?)
- Formatting (are headers, lists, tables properly structured?)
- Readability (is it clean and well-organized?)
- Artifacts (are there extraction errors, garbled text, or noise?)

Respond with ONLY a JSON object, no other text:
{{"score": N, "issues": ["issue1", "issue2"]}}

If there are no issues, return an empty array for issues.

---
MARKDOWN TO EVALUATE:
{sample}
---"""

    try:
        response = client.messages.create(
            model=MODEL,
            max_tokens=500,
            messages=[{"role": "user", "content": prompt}]
        )

        result_text = response.content[0].text.strip()

        # Clean up response if needed
        if result_text.startswith('```'):
            result_text = re.sub(r'^```json?\n?', '', result_text)
            result_text = re.sub(r'\n?```$', '', result_text)

        result = json.loads(result_text)
        return {
            "score": result.get("score", 0),
            "issues": result.get("issues", []),
            "tokens_used": response.usage.input_tokens + response.usage.output_tokens
        }

    except json.JSONDecodeError as e:
        print(f"  ⚠ Failed to parse quality response: {e}")
        return {"score": 0, "issues": ["Failed to parse quality assessment"], "tokens_used": 0}
    except Exception as e:
        print(f"  ⚠ Quality check error: {e}")
        return {"score": 0, "issues": [str(e)], "tokens_used": 0}


print("✓ Quality assessment functions loaded")

✓ Quality assessment functions loaded


## Cell 4: Conversion Functions

In [39]:
def markitdown_extract(pdf_path: str) -> dict:
    """
    Extract markdown from PDF using MarkItDown.

    Returns:
        dict: {"success": bool, "markdown": str, "error": str|None}
    """
    try:
        result = markitdown.convert(pdf_path)
        return {
            "success": True,
            "markdown": result.text_content,
            "error": None
        }
    except Exception as e:
        return {
            "success": False,
            "markdown": "",
            "error": str(e)
        }


def claude_vision_extract(pdf_path: str) -> dict:
    """
    Extract markdown from PDF using Claude's native PDF/vision capability.

    Returns:
        dict: {"success": bool, "markdown": str, "tokens_used": int, "error": str|None}
    """
    try:
        # Read and encode PDF
        with open(pdf_path, "rb") as f:
            pdf_data = base64.standard_b64encode(f.read()).decode("utf-8")

        prompt = """Convert this PDF to clean, well-structured Markdown. Follow these guidelines:

1. **Headers**: Use appropriate header levels (# ## ###) to reflect document hierarchy
2. **Tables**: Convert all tables to proper markdown table format
3. **Lists**: Use bullet points or numbered lists where appropriate
4. **Content**: Preserve ALL text content accurately - do not summarize or omit
5. **Charts/Graphics**: Add descriptive placeholders like [Chart: Revenue by Quarter] or [Image: Company Logo]
6. **Formatting**: Remove extraction artifacts, fix spacing issues, ensure clean output

Output ONLY the markdown content, no explanations or preamble."""

        response = client.messages.create(
            model=MODEL,
            max_tokens=MAX_TOKENS,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {
                            "type": "document",
                            "source": {
                                "type": "base64",
                                "media_type": "application/pdf",
                                "data": pdf_data
                            }
                        },
                        {
                            "type": "text",
                            "text": prompt
                        }
                    ]
                }
            ]
        )

        return {
            "success": True,
            "markdown": response.content[0].text,
            "tokens_used": response.usage.input_tokens + response.usage.output_tokens,
            "error": None
        }

    except Exception as e:
        return {
            "success": False,
            "markdown": "",
            "tokens_used": 0,
            "error": str(e)
        }


def claude_cleanup(markdown: str) -> dict:
    """
    Clean up and improve MarkItDown output using Claude.

    Returns:
        dict: {"success": bool, "markdown": str, "tokens_used": int, "error": str|None}
    """
    try:
        prompt = f"""Clean up and improve this markdown extracted from a PDF. Your task:

1. Fix any formatting inconsistencies (headers, lists, tables)
2. Remove extraction artifacts and noise
3. Ensure proper markdown table formatting
4. Fix spacing and line break issues
5. Preserve ALL original content - do not summarize or remove information
6. Use consistent header hierarchy

Output ONLY the cleaned markdown, no explanations.

---
MARKDOWN TO CLEAN:
{markdown}
---"""

        response = client.messages.create(
            model=MODEL,
            max_tokens=MAX_TOKENS,
            messages=[{"role": "user", "content": prompt}]
        )

        return {
            "success": True,
            "markdown": response.content[0].text,
            "tokens_used": response.usage.input_tokens + response.usage.output_tokens,
            "error": None
        }

    except Exception as e:
        return {
            "success": False,
            "markdown": markdown,  # Return original on failure
            "tokens_used": 0,
            "error": str(e)
        }


print("✓ Conversion functions loaded")

✓ Conversion functions loaded


## Cell 5: Main Processing Pipeline

In [40]:
def process_pdf(pdf_path: str, verbose: bool = True) -> dict:
    """
    Main processing pipeline with automatic quality detection and fallback.

    Args:
        pdf_path: Path to the PDF file
        verbose: Print progress and diagnostics

    Returns:
        dict: {
            "success": bool,
            "markdown": str,
            "path_taken": str,
            "total_tokens": int,
            "diagnostics": dict
        }
    """
    diagnostics = {
        "markitdown_extraction": None,
        "heuristic_check": None,
        "claude_quality_check": None,
        "final_conversion": None
    }
    total_tokens = 0

    if verbose:
        print(f"\n{'='*60}")
        print(f"Processing: {pdf_path}")
        print(f"{'='*60}\n")

    # =========================================================================
    # Step 1: MarkItDown Extraction
    # =========================================================================
    if verbose:
        print("[1/4] MarkItDown Extraction...")

    mit_result = markitdown_extract(pdf_path)
    diagnostics["markitdown_extraction"] = {
        "success": mit_result["success"],
        "char_count": len(mit_result["markdown"]) if mit_result["success"] else 0,
        "error": mit_result["error"]
    }

    if not mit_result["success"]:
        if verbose:
            print(f"  ✗ MarkItDown failed: {mit_result['error']}")
            print("  → Falling back to Claude Vision...\n")

        # Direct fallback to Claude Vision
        vision_result = claude_vision_extract(pdf_path)
        total_tokens += vision_result.get("tokens_used", 0)
        diagnostics["final_conversion"] = {
            "method": "claude_vision_fallback",
            "reason": "markitdown_failed",
            "tokens": vision_result.get("tokens_used", 0)
        }

        return {
            "success": vision_result["success"],
            "markdown": vision_result["markdown"],
            "path_taken": "Claude Vision (MarkItDown failed)",
            "total_tokens": total_tokens,
            "diagnostics": diagnostics
        }

    if verbose:
        print(f"  ✓ Extracted {len(mit_result['markdown']):,} characters\n")

    # =========================================================================
    # Step 2: Heuristic Quality Check
    # =========================================================================
    if verbose:
        print("[2/4] Heuristic Quality Check...")

    # Estimate page count (rough: ~3000 chars per page)
    estimated_pages = max(1, len(mit_result["markdown"]) // 3000)
    heuristic_result = heuristic_check(mit_result["markdown"], estimated_pages)
    diagnostics["heuristic_check"] = heuristic_result

    if verbose:
        print(f"  Metrics:")
        for key, value in heuristic_result["metrics"].items():
            if isinstance(value, float):
                print(f"    - {key}: {value:.2f}")
            else:
                print(f"    - {key}: {value}")

    if not heuristic_result["passed"]:
        if verbose:
            print(f"  ✗ Heuristic check failed:")
            for issue in heuristic_result["issues"]:
                print(f"    - {issue}")
            print("  → Falling back to Claude Vision...\n")

        vision_result = claude_vision_extract(pdf_path)
        total_tokens += vision_result.get("tokens_used", 0)
        diagnostics["final_conversion"] = {
            "method": "claude_vision_fallback",
            "reason": "heuristic_check_failed",
            "tokens": vision_result.get("tokens_used", 0)
        }

        if verbose:
            print(f"  {'✓' if vision_result['success'] else '✗'} Claude Vision: {vision_result.get('tokens_used', 0):,} tokens\n")

        return {
            "success": vision_result["success"],
            "markdown": vision_result["markdown"],
            "path_taken": "Claude Vision (Heuristic failed)",
            "total_tokens": total_tokens,
            "diagnostics": diagnostics
        }

    if verbose:
        print(f"  ✓ Heuristic check passed\n")

    # =========================================================================
    # Step 3: Claude Quality Assessment
    # =========================================================================
    if verbose:
        print("[3/4] Claude Quality Assessment...")

    quality_result = claude_quality_check(mit_result["markdown"])
    total_tokens += quality_result.get("tokens_used", 0)
    diagnostics["claude_quality_check"] = quality_result

    if verbose:
        print(f"  Score: {quality_result['score']}/10")
        if quality_result["issues"]:
            print(f"  Issues:")
            for issue in quality_result["issues"]:
                print(f"    - {issue}")
        print(f"  Tokens used: {quality_result.get('tokens_used', 0):,}\n")

    if quality_result["score"] < QUALITY_THRESHOLDS["min_quality_score"]:
        if verbose:
            print(f"  ✗ Quality score below threshold ({QUALITY_THRESHOLDS['min_quality_score']})")
            print("  → Falling back to Claude Vision...\n")

        vision_result = claude_vision_extract(pdf_path)
        total_tokens += vision_result.get("tokens_used", 0)
        diagnostics["final_conversion"] = {
            "method": "claude_vision_fallback",
            "reason": f"quality_score_{quality_result['score']}_below_{QUALITY_THRESHOLDS['min_quality_score']}",
            "tokens": vision_result.get("tokens_used", 0)
        }

        if verbose:
            print(f"  {'✓' if vision_result['success'] else '✗'} Claude Vision: {vision_result.get('tokens_used', 0):,} tokens\n")

        return {
            "success": vision_result["success"],
            "markdown": vision_result["markdown"],
            "path_taken": f"Claude Vision (Quality score: {quality_result['score']}/10)",
            "total_tokens": total_tokens,
            "diagnostics": diagnostics
        }

    if verbose:
        print(f"  ✓ Quality check passed\n")

    # =========================================================================
    # Step 4: Claude Cleanup
    # =========================================================================
    if verbose:
        print("[4/4] Claude Cleanup...")

    cleanup_result = claude_cleanup(mit_result["markdown"])
    total_tokens += cleanup_result.get("tokens_used", 0)
    diagnostics["final_conversion"] = {
        "method": "markitdown_plus_cleanup",
        "reason": "quality_check_passed",
        "tokens": cleanup_result.get("tokens_used", 0)
    }

    if verbose:
        print(f"  {'✓' if cleanup_result['success'] else '✗'} Cleanup complete: {cleanup_result.get('tokens_used', 0):,} tokens\n")

    return {
        "success": cleanup_result["success"],
        "markdown": cleanup_result["markdown"],
        "path_taken": "MarkItDown + Claude Cleanup",
        "total_tokens": total_tokens,
        "diagnostics": diagnostics
    }


print("✓ Main pipeline loaded")

✓ Main pipeline loaded


## Cell 6: Execute Conversion

In [41]:
# Check if input file exists
if not Path(INPUT_PDF).exists():
    print(f"❌ Error: Input file not found: {INPUT_PDF}")
    print("\nPlease upload your PDF to /content/ or update INPUT_PDF path in Cell 2.")
else:
    # Process the PDF
    result = process_pdf(INPUT_PDF, verbose=True)

    # Summary
    print(f"{'='*60}")
    print("SUMMARY")
    print(f"{'='*60}")
    print(f"Status: {'✓ Success' if result['success'] else '✗ Failed'}")
    print(f"Path taken: {result['path_taken']}")
    print(f"Total tokens used: {result['total_tokens']:,}")
    print(f"Output length: {len(result['markdown']):,} characters")

    if result["success"]:
        # Save output
        with open(OUTPUT_MD, "w", encoding="utf-8") as f:
            f.write(result["markdown"])
        print(f"\n✓ Saved to: {OUTPUT_MD}")

        # Preview
        print(f"\n{'='*60}")
        print("PREVIEW (first 2000 characters)")
        print(f"{'='*60}\n")
        print(result["markdown"][:2000])
        if len(result["markdown"]) > 2000:
            print(f"\n... [{len(result['markdown']) - 2000:,} more characters]")
    else:
        print(f"\n❌ Conversion failed. Check diagnostics above.")


Processing: /content/pdfs/Ryder-Investor-Overview-June-2024.pdf

[1/4] MarkItDown Extraction...
  ✓ Extracted 25,632 characters

[2/4] Heuristic Quality Check...
  Metrics:
    - word_count: 3688
    - words_per_page: 461.00
    - whitespace_ratio: 0.17
    - header_count: 0
    - artifact_count: 4
    - table_line_count: 0
  ✗ Heuristic check failed:
    - No headers found in long document (3688 words)
  → Falling back to Claude Vision...

  ✓ Claude Vision: 49,046 tokens

SUMMARY
Status: ✓ Success
Path taken: Claude Vision (Heuristic failed)
Total tokens used: 49,046
Output length: 27,500 characters

✓ Saved to: /content/output.md

PREVIEW (first 2000 characters)

# Investor Overview
## June 2024

![Ryder Logo](ryder-logo.png)

![Fleet and Operations Image](ryder-operations.png)

---

## Safe Harbor and Non-GAAP Financial Measures

### Note Regarding Forward-Looking Statements

Certain statements and information included in this news release are "forward-looking statements" under th

## Cell 7: Batch Processing (Optional)

Use this cell to process multiple PDFs in a directory.

In [42]:
def batch_process(input_dir: str, output_dir: str, verbose: bool = False) -> list:
    """
    Process all PDFs in a directory.

    Args:
        input_dir: Directory containing PDF files
        output_dir: Directory to save markdown files
        verbose: Print detailed progress for each file

    Returns:
        list: Results for each processed file
    """
    input_path = Path(input_dir)
    output_path = Path(output_dir)
    output_path.mkdir(parents=True, exist_ok=True)

    pdf_files = list(input_path.glob("*.pdf")) + list(input_path.glob("*.PDF"))

    if not pdf_files:
        print(f"No PDF files found in {input_dir}")
        return []

    print(f"Found {len(pdf_files)} PDF files to process\n")

    results = []
    total_tokens = 0

    for i, pdf_file in enumerate(pdf_files, 1):
        print(f"[{i}/{len(pdf_files)}] {pdf_file.name}...", end=" ")

        result = process_pdf(str(pdf_file), verbose=verbose)
        result["filename"] = pdf_file.name
        results.append(result)
        total_tokens += result["total_tokens"]

        if result["success"]:
            output_file = output_path / f"{pdf_file.stem}.md"
            with open(output_file, "w", encoding="utf-8") as f:
                f.write(result["markdown"])
            print(f"✓ ({result['total_tokens']:,} tokens)")
        else:
            print(f"✗ Failed")

    # Summary
    successful = sum(1 for r in results if r["success"])
    print(f"\n{'='*60}")
    print(f"BATCH SUMMARY")
    print(f"{'='*60}")
    print(f"Processed: {len(pdf_files)} files")
    print(f"Successful: {successful}")
    print(f"Failed: {len(pdf_files) - successful}")
    print(f"Total tokens: {total_tokens:,}")
    print(f"Output directory: {output_dir}")

    return results


# Example usage (uncomment to run):
# batch_results = batch_process("/content/pdfs", "/content/markdown_output")

## Cell 8: Diagnostics & Debugging (Optional)

Use this cell to inspect detailed diagnostics from the last run.

In [43]:
# Display full diagnostics from last run
if 'result' in dir() and result is not None:
    print("Full Diagnostics:")
    print(json.dumps(result["diagnostics"], indent=2))
else:
    print("No results available. Run Cell 6 first.")


Full Diagnostics:
{
  "markitdown_extraction": {
    "success": true,
    "char_count": 25632,
    "error": null
  },
  "heuristic_check": {
    "passed": false,
    "issues": [
      "No headers found in long document (3688 words)"
    ],
    "metrics": {
      "word_count": 3688,
      "words_per_page": 461.0,
      "whitespace_ratio": 0.16604244694132334,
      "header_count": 0,
      "artifact_count": 4,
      "table_line_count": 0
    }
  },
  "claude_quality_check": null,
  "final_conversion": {
    "method": "claude_vision_fallback",
    "reason": "heuristic_check_failed",
    "tokens": 49046
  }
}
