In [1]:
conda install -c conda-forge python-dotenv

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.3.1
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [2]:
conda install -c conda-forge pymupdf

Channels:
 - conda-forge
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done


    current version: 25.3.1
    latest version: 25.7.0

Please update conda by running

    $ conda update -n base -c conda-forge conda



# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [4]:
# Cell 2: Import libraries and setup
import json
import boto3
import fitz  # PyMuPDF
import os
from urllib.parse import unquote_plus
from dotenv import load_dotenv
import pandas as pd
from botocore.config import Config
from IPython.display import display, Markdown, HTML

# Load environment variables from .env file
load_dotenv()

# Initialize AWS clients.
# The botocore client configuration gets its own name: the configuration cell
# binds the module-level name `config` to the application settings object, so
# reusing `config` here would overwrite those settings whenever this cell is
# re-run in a live kernel.
bedrock_client_config = Config(
    connect_timeout=300,
    read_timeout=300,
)
session = boto3.Session(region_name=os.getenv('AWS_REGION', 'us-east-1'))
s3 = session.client('s3')
# Honour BEDROCK_REGION (same variable and default the settings class reads)
# instead of pinning the runtime client to a hard-coded region.
bedrock = session.client(
    'bedrock-runtime',
    config=bedrock_client_config,
    region_name=os.getenv('BEDROCK_REGION', 'us-east-1'),
)

print("✅ Libraries imported and AWS clients initialized")



✅ Libraries imported and AWS clients initialized


In [5]:
# Cell 3: Configuration
class Config:
    """Settings for the PDF-to-Markdown pipeline.

    The bucket, folder and region values are looked up in the process
    environment once, when the class body executes; each has a literal default.
    """

    # S3 Configuration
    INPUT_BUCKET = os.environ.get('INPUT_BUCKET', 'pdftomarkdown')
    OUTPUT_BUCKET = os.environ.get('OUTPUT_BUCKET', 'pdftomarkdown-output')
    IMAGES_FOLDER = os.environ.get('IMAGES_FOLDER', 'extracted_images')
    BEDROCK_REGION = os.environ.get('BEDROCK_REGION', 'us-east-1')

    # Processing settings
    MAX_PAGES_PER_CHUNK = 5
    MAX_INPUT_TOKENS = 150000

    def display_config(self):
        """Print the bucket, folder and region settings, one per line."""
        report_lines = (
            "📋 Current Configuration:",
            f"   Input Bucket: {self.INPUT_BUCKET}",
            f"   Output Bucket: {self.OUTPUT_BUCKET}",
            f"   Images Folder: {self.IMAGES_FOLDER}",
            f"   Bedrock Region: {self.BEDROCK_REGION}",
        )
        for entry in report_lines:
            print(entry)

# Module-level settings object; the processing functions defined later in the
# notebook read values such as config.OUTPUT_BUCKET, config.IMAGES_FOLDER,
# config.MAX_PAGES_PER_CHUNK and config.MAX_INPUT_TOKENS from it.
config = Config()
# Echo the active settings in the cell output.
config.display_config()

📋 Current Configuration:
   Input Bucket: pdftomarkdown
   Output Bucket: pdftomarkdown-output
   Images Folder: extracted_images
   Bedrock Region: us-east-1


In [6]:
# Cell 4: Main processing functions
def download_pdf_from_s3(bucket, key):
    """Fetch s3://bucket/key through the module-level `s3` client.

    Returns the object's body as read from the response stream. Progress is
    printed; on any failure the error is printed and the exception re-raised.
    """
    print(f"📥 Downloading PDF: s3://{bucket}/{key}")
    try:
        payload = s3.get_object(Bucket=bucket, Key=key)['Body'].read()
        byte_count = len(payload)
        print(f"✅ Downloaded {byte_count} bytes")
        return payload
    except Exception as err:
        print(f"❌ Error downloading PDF: {err}")
        raise

def extract_text_and_images(pdf_content, base_filename, output_bucket, images_folder):
    """Extract per-page text and embedded images from an in-memory PDF.

    Every image is rendered to PNG and uploaded through the module-level `s3`
    client under ``{images_folder}/{base_filename}_page_{P}_img_{I}.png``
    (P and I are 1-based).

    Args:
        pdf_content: Raw PDF bytes, passed to ``fitz.open`` as a stream.
        base_filename: Prefix used for the uploaded image file names.
        output_bucket: Bucket that receives the PNG files.
        images_folder: Key prefix (folder) for the PNG files.

    Returns:
        ``(page_contents, total_images_extracted)``. ``page_contents`` holds one
        dict per page with the keys ``page_number`` (1-based), ``text``,
        ``images`` (dicts with ``filename``, ``url``, ``index``) and
        ``text_length``.

    A failure on a single image is printed and skipped; it does not abort the
    page or the document.
    """
    # Function-scope import: the notebook's import cell only pulls in unquote_plus.
    from urllib.parse import quote

    print("🔍 Opening PDF and extracting content...")

    # Open PDF with PyMuPDF
    doc = fitz.open(stream=pdf_content, filetype="pdf")
    page_contents = []
    total_images_extracted = 0

    try:
        print(f"📄 Processing {len(doc)} pages...")

        # Extract text and images from each page
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Extract text
            text = page.get_text()

            # Extract and upload images
            image_list = page.get_images()
            page_images = []

            if image_list:
                print(f"   Page {page_num + 1}: Found {len(image_list)} images")

            for img_index, img in enumerate(image_list):
                pix = None
                try:
                    xref = img[0]
                    pix = fitz.Pixmap(doc, xref)

                    # More than three colour components means CMYK: convert to
                    # RGB so the image can be written as PNG instead of being
                    # silently dropped.
                    if pix.n - pix.alpha >= 4:
                        pix = fitz.Pixmap(fitz.csRGB, pix)

                    img_data = pix.tobytes("png")
                    img_filename = f"{base_filename}_page_{page_num + 1}_img_{img_index + 1}.png"
                    img_key = f"{images_folder}/{img_filename}"

                    # Upload image to S3
                    s3.put_object(
                        Bucket=output_bucket,
                        Key=img_key,
                        Body=img_data,
                        ContentType='image/png'
                    )

                    # Percent-encode the key: file names derived from the PDF
                    # name can contain spaces, which break Markdown image links.
                    img_url = f"https://{output_bucket}.s3.amazonaws.com/{quote(img_key)}"
                    page_images.append({
                        'filename': img_filename,
                        'url': img_url,
                        'index': img_index + 1
                    })
                    total_images_extracted += 1

                    print(f"      ✅ Uploaded: {img_filename}")

                except Exception as e:
                    # Report the same 1-based index that appears in file names.
                    print(f"      ❌ Error processing image {img_index + 1}: {e}")
                finally:
                    # Drop the pixmap reference promptly, on success and failure.
                    pix = None

            page_contents.append({
                'page_number': page_num + 1,
                'text': text,
                'images': page_images,
                'text_length': len(text)
            })
    finally:
        # Close the document even when a page raises.
        doc.close()

    print(f"✅ Extraction complete: {total_images_extracted} images, {len(page_contents)} pages")

    return page_contents, total_images_extracted

In [7]:
# Cell 5: Claude conversion functions
def convert_with_claude_bedrock(pdf_content_pages):
    """Convert extracted PDF pages to Markdown with Claude Sonnet 4 on Bedrock.

    Args:
        pdf_content_pages: List of page dicts with the keys ``page_number``,
            ``text`` and ``images`` (each image a dict with ``index``,
            ``filename`` and ``url``).

    Returns:
        The Markdown text. Prompts longer than ``config.MAX_INPUT_TOKENS`` are
        delegated to ``process_large_document_in_chunks``; if the Bedrock call
        (or the chunked path) raises, the result of
        ``fallback_markdown_conversion`` is returned instead.
    """

    print("🤖 Converting to Markdown using Claude Sonnet 4.0...")

    # Combine all pages into one document
    full_content = "\n\n--- PAGE BREAK ---\n\n".join([
        f"PAGE {page['page_number']}:\n{page['text']}"
        for page in pdf_content_pages
    ])

    # Create image reference list
    image_refs = []
    for page in pdf_content_pages:
        for img in page['images']:
            image_refs.append(f"- Page {page['page_number']}, Image {img['index']}: {img['filename']} -> {img['url']}")

    image_reference_text = "\n".join(image_refs) if image_refs else "No images found in document"

    # Instruction 4 shows real Markdown image syntax, ![](url); the earlier
    # "!(image_url)" form is not valid Markdown.
    prompt = f"""You are a precise document conversion tool. Your task is to convert the given PDF content to Markdown format with a strict 1:1 conversion. Do not interpret, summarize, or alter the content in any way.

EXTRACTED PDF CONTENT:
{full_content}

AVAILABLE IMAGES (reference these where they logically appear in the content):
{image_reference_text}

CONVERSION INSTRUCTIONS:
1. Convert to well-structured Markdown maintaining exact content
2. Preserve all information, measurements, and technical details exactly as written
3. Use proper Markdown syntax for tables, headers, lists, and formatting
4. Insert image references where they logically belong using: step numbers ![](image_url)
5. Maintain the document's hierarchical structure and formatting
6. Keep all version numbers, dates, page numbers, and step numbers exact
7. Preserve any special formatting like bold text, bullet points, numbered lists
8. Do not add any content that wasn't in the original
9. Do not remove any content from the original
10. Strictly maintain the table structure

Convert the above PDF content to Markdown format:"""

    try:
        # NOTE(review): the limit is named in tokens but is compared against a
        # character count, so chunking starts well before that many tokens are
        # reached. Left unchanged on purpose: chunking early also keeps each
        # response inside the output-token budget.
        if len(prompt) > config.MAX_INPUT_TOKENS:
            print("📄 Content too large, processing in chunks...")
            return process_large_document_in_chunks(pdf_content_pages)

        print(f"📤 Sending to Claude (content length: {len(prompt)} chars)...")

        response = bedrock.invoke_model(
            modelId='us.anthropic.claude-sonnet-4-20250514-v1:0',
            body=json.dumps({
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 65536,
                "messages": [
                    {
                        "role": "user",
                        "content": prompt
                    }
                ]
            })
        )

        result = json.loads(response['body'].read())
        markdown_content = result['content'][0]['text']

        # A response cut off at the output limit is still returned, but the
        # reader is told that the Markdown is probably incomplete.
        if result.get('stop_reason') == 'max_tokens':
            print("⚠️ Claude stopped at the max_tokens limit; the Markdown output may be truncated")

        print(f"✅ Claude conversion completed. Output length: {len(markdown_content)} characters")
        return markdown_content

    except Exception as e:
        # Deliberate best effort: any failure degrades to the plain converter.
        print(f"❌ Error with Bedrock Claude: {e}")
        print("🔄 Falling back to basic conversion...")
        return fallback_markdown_conversion(pdf_content_pages)

def process_large_document_in_chunks(pdf_content_pages):
    """Convert a long document by sending it to Claude a few pages at a time.

    Pages are grouped into runs of ``config.MAX_PAGES_PER_CHUNK``; each run is
    converted with its own Bedrock request. A run whose request fails is
    converted with ``fallback_markdown_conversion`` instead. The per-run
    results are joined with blank lines and returned as one string.
    """
    pages_per_chunk = config.MAX_PAGES_PER_CHUNK
    converted = []
    page_total = len(pdf_content_pages)
    total_chunks = (page_total + pages_per_chunk - 1) // pages_per_chunk

    print(f"📑 Processing {total_chunks} chunks of {pages_per_chunk} pages each...")

    for chunk_num, start in enumerate(range(0, page_total, pages_per_chunk), start=1):
        batch = pdf_content_pages[start:start + pages_per_chunk]
        first_page = batch[0]['page_number']
        last_page = batch[-1]['page_number']

        print(f"   Processing chunk {chunk_num}/{total_chunks}: Pages {first_page}-{last_page}")

        # Page texts for this run, separated by an explicit page-break marker
        page_blocks = [f"PAGE {entry['page_number']}:\n{entry['text']}" for entry in batch]
        batch_text = "\n\n--- PAGE BREAK ---\n\n".join(page_blocks)

        # One reference line per image that belongs to this run
        image_lines = [
            f"- Page {entry['page_number']}, Image {picture['index']}: {picture['filename']} -> {picture['url']}"
            for entry in batch
            for picture in entry['images']
        ]
        if image_lines:
            batch_images = "\n".join(image_lines)
        else:
            batch_images = "No images in this chunk"

        prompt = (
            "Convert this PDF content chunk to Markdown format. Maintain exact content and structure.\n"
            "\n"
            f"CONTENT CHUNK (Pages {first_page}-{last_page}):\n"
            f"{batch_text}\n"
            "\n"
            "IMAGES FOR THIS CHUNK:\n"
            f"{batch_images}\n"
            "\n"
            "Convert to Markdown maintaining all original formatting and content:"
        )

        try:
            payload = {
                "anthropic_version": "bedrock-2023-05-31",
                "max_tokens": 50000,
                "messages": [{"role": "user", "content": prompt}],
            }
            response = bedrock.invoke_model(
                modelId='us.anthropic.claude-sonnet-4-20250514-v1:0',
                body=json.dumps(payload),
            )
            answer = json.loads(response['body'].read())
            converted.append(answer['content'][0]['text'])

            print(f"      ✅ Chunk {chunk_num} completed")

        except Exception as err:
            print(f"      ❌ Error processing chunk {chunk_num}: {err}")
            # This run alone degrades to the plain converter
            converted.append(fallback_markdown_conversion(batch))

    print("✅ All chunks processed, combining results...")
    return "\n\n".join(converted)

def fallback_markdown_conversion(pdf_content_pages):
    """Build plain Markdown from page dicts without calling the model.

    Output starts with a fixed document title; each page contributes a
    ``## Page N`` heading, then every non-blank text line (stripped) as its own
    paragraph, then one image link per entry in the page's ``images`` list.
    """

    print("🔄 Using fallback conversion method...")

    pieces = ["# PDF Document\n\n"]

    for page in pdf_content_pages:
        pieces.append(f"## Page {page['page_number']}\n\n")

        # Basic text conversion: drop blank lines, one paragraph per line
        stripped_lines = (raw.strip() for raw in page['text'].split('\n'))
        pieces.extend(f"{entry}\n\n" for entry in stripped_lines if entry)

        # Add images
        pieces.extend(
            f"![Image {picture['index']}]({picture['url']})\n\n"
            for picture in page['images']
        )

    return "".join(pieces)

In [8]:
# Cell 6: Main processing function
def process_pdf(input_bucket, pdf_key, output_bucket=None, images_folder=None):
    """Run the whole pipeline for one PDF stored in S3.

    Steps: download the PDF, extract text and images, convert to Markdown, and
    store the Markdown in the output bucket under the PDF's key with its final
    extension replaced by ``.md``.

    Args:
        input_bucket: Bucket that holds the source PDF.
        pdf_key: Object key of the source PDF.
        output_bucket: Destination bucket; defaults to ``config.OUTPUT_BUCKET``.
        images_folder: Key prefix for extracted images; defaults to
            ``config.IMAGES_FOLDER``.

    Returns:
        ``(summary, markdown_content)`` where ``summary`` is a dict with the
        keys ``source_file``, ``markdown_file``, ``images_extracted``,
        ``pages_processed`` and ``markdown_size``.

    Raises:
        Any exception from the steps above, re-raised after being printed.
    """

    if output_bucket is None:
        output_bucket = config.OUTPUT_BUCKET
    if images_folder is None:
        images_folder = config.IMAGES_FOLDER

    print(f"🚀 Starting PDF processing...")
    print(f"   Source: s3://{input_bucket}/{pdf_key}")
    print(f"   Output: s3://{output_bucket}")
    print("-" * 60)

    try:
        # Step 1: Download PDF
        pdf_content = download_pdf_from_s3(input_bucket, pdf_key)

        # Step 2: Extract content
        base_filename = os.path.splitext(os.path.basename(pdf_key))[0]
        page_contents, total_images = extract_text_and_images(
            pdf_content, base_filename, output_bucket, images_folder
        )

        # Step 3: Convert with Claude
        markdown_content = convert_with_claude_bedrock(page_contents)

        # Step 4: Save markdown to S3.
        # Swap only the final extension. A plain str.replace('.pdf', '.md')
        # rewrites every '.pdf' substring in the key and leaves upper-case
        # '.PDF' keys unchanged, which would store the Markdown under the
        # PDF's own key.
        markdown_key = os.path.splitext(pdf_key)[0] + '.md'
        s3.put_object(
            Bucket=output_bucket,
            Key=markdown_key,
            Body=markdown_content.encode('utf-8'),
            ContentType='text/markdown'
        )

        print("-" * 60)
        print("🎉 Processing completed successfully!")

        # Create summary
        summary = {
            'source_file': f's3://{input_bucket}/{pdf_key}',
            'markdown_file': f's3://{output_bucket}/{markdown_key}',
            'images_extracted': total_images,
            'pages_processed': len(page_contents),
            'markdown_size': len(markdown_content)
        }

        # Display results
        display_results(summary, page_contents[:3])  # Show first 3 pages

        return summary, markdown_content

    except Exception as e:
        print(f"❌ Error processing PDF: {e}")
        raise

def display_results(summary, sample_pages):
    """Print the run summary and show a small table describing sample pages.

    ``summary`` supplies the source/output locations and the counts;
    ``sample_pages`` is a list of page dicts. For each page the table shows the
    page number, text length, image count and at most the first 100
    characters of text. No table is shown when ``sample_pages`` is empty.
    """

    def _excerpt(text):
        # Long texts are cut to 100 characters and marked with an ellipsis
        if len(text) > 100:
            return text[:100] + "..."
        return text

    print("\n📊 Processing Summary:")
    labelled_fields = (
        ("Source", 'source_file'),
        ("Output", 'markdown_file'),
        ("Pages", 'pages_processed'),
        ("Images", 'images_extracted'),
    )
    for label, field in labelled_fields:
        print(f"   ✅ {label}: {summary[field]}")
    print(f"   ✅ Markdown size: {summary['markdown_size']:,} characters")

    # One table row per sample page
    rows = [
        {
            'Page': entry['page_number'],
            'Text Length': entry['text_length'],
            'Images': len(entry['images']),
            'Sample Text': _excerpt(entry['text']),
        }
        for entry in sample_pages
    ]

    if not rows:
        return

    print(f"\n📋 Sample Pages (showing first {len(rows)} pages):")
    display(pd.DataFrame(rows))

In [9]:
# Cell 7: Utility functions
def list_pdfs_in_bucket(bucket_name, prefix=""):
    """List the keys of all PDF objects under a prefix in an S3 bucket.

    Args:
        bucket_name: Bucket to list.
        prefix: Optional key prefix; the default lists the whole bucket.

    Returns:
        The keys whose lower-cased name ends in ``.pdf``. An empty list is
        returned when nothing is found or when listing fails (the error is
        printed, not raised).

    The first ten PDFs are printed with their sizes.
    """

    print(f"📁 Listing PDFs in s3://{bucket_name}/{prefix}")

    try:
        # A single list_objects_v2 response can be truncated, so follow the
        # continuation token until the listing is complete.
        objects = []
        request = {'Bucket': bucket_name, 'Prefix': prefix}
        while True:
            response = s3.list_objects_v2(**request)
            objects.extend(response.get('Contents', []))
            if not response.get('IsTruncated'):
                break
            request['ContinuationToken'] = response['NextContinuationToken']

        if not objects:
            print("   No files found")
            return []

        # Keep the whole object record so each printed size belongs to the PDF
        # it is shown next to; indexing the unfiltered listing by the filtered
        # position reports the wrong object's size when non-PDF keys exist.
        pdf_objects = [obj for obj in objects if obj['Key'].lower().endswith('.pdf')]
        pdf_files = [obj['Key'] for obj in pdf_objects]

        print(f"   Found {len(pdf_files)} PDF files:")
        for i, obj in enumerate(pdf_objects[:10], 1):  # Show first 10
            size_mb = obj['Size'] / (1024*1024)
            print(f"   {i:2d}. {obj['Key']} ({size_mb:.1f} MB)")

        if len(pdf_files) > 10:
            print(f"   ... and {len(pdf_files) - 10} more files")

        return pdf_files

    except Exception as e:
        print(f"   ❌ Error listing files: {e}")
        return []

def preview_markdown(markdown_content, max_lines=50):
    """Show the start of a Markdown string, first as numbered raw lines and
    then rendered.

    At most ``max_lines`` lines are printed, followed by a count of the lines
    left out. The rendered preview covers the first 1000 characters and is
    skipped entirely for an empty string.
    """

    all_lines = markdown_content.split('\n')
    visible = all_lines[:max_lines]
    rule = "-" * 60

    print(f"📖 Markdown Preview (first {len(visible)} lines):")
    print(rule)

    for number, text in enumerate(visible, start=1):
        print(f"{number:3d}: {text}")

    omitted = len(all_lines) - max_lines
    if omitted > 0:
        print(f"... and {omitted} more lines")

    print(rule)

    # Nothing to render for an empty document
    if not markdown_content:
        return

    # Also display as rendered markdown (first 1000 chars)
    excerpt = markdown_content[:1000]
    if len(markdown_content) > 1000:
        excerpt = excerpt + "\n\n... (truncated)"

    print("\n🔍 Rendered Preview:")
    display(Markdown(excerpt))

def batch_process_pdfs(bucket_name, pdf_list, max_files=None):
    """Run ``process_pdf`` over several keys and collect one record per file.

    When ``max_files`` is truthy only the first ``max_files`` keys are
    processed. A file that raises is recorded with status ``'failed'`` and the
    error text; the batch continues with the next file.

    Returns a list of dicts: ``{'file', 'status': 'success', 'summary'}`` or
    ``{'file', 'status': 'failed', 'error'}``.
    """

    queue = pdf_list[:max_files] if max_files else pdf_list
    total = len(queue)

    print(f"🔄 Starting batch processing of {total} PDFs...")

    results = []

    for position, pdf_key in enumerate(queue, start=1):
        print(f"\n📄 Processing file {position}/{total}: {pdf_key}")
        print("=" * 80)

        try:
            summary, _markdown = process_pdf(bucket_name, pdf_key)
        except Exception as err:
            print(f"❌ Failed to process {pdf_key}: {err}")
            outcome = {'file': pdf_key, 'status': 'failed', 'error': str(err)}
        else:
            outcome = {'file': pdf_key, 'status': 'success', 'summary': summary}
        results.append(outcome)

    # Summary
    successful = len([entry for entry in results if entry['status'] == 'success'])
    failed = len(results) - successful

    print(f"\n🏁 Batch processing completed!")
    print(f"   ✅ Successful: {successful}")
    print(f"   ❌ Failed: {failed}")

    return results

In [10]:
# Cell 8: Example usage and testing

# Example 1: Process a single PDF
def example_single_pdf():
    """Example: process one hard-coded PDF and preview the result.

    Returns:
        ``(summary, markdown)`` from ``process_pdf`` on success, or
        ``(None, None)`` when processing fails (the error is printed).
    """

    # Configure your file details
    input_bucket = "pdftomarkdown"
    pdf_file = "Inputfiles/Customer 1 - iPhone 8 Plus Battery Replacement.pdf"

    try:
        summary, markdown = process_pdf(input_bucket, pdf_file)

        # Preview the results
        preview_markdown(markdown)

        return summary, markdown

    except Exception as e:
        print(f"Example failed: {e}")
        # Keep the two-value shape so `summary, markdown = example_single_pdf()`
        # does not fail with an unpacking TypeError that hides the real error.
        return None, None

# Example 2: List and select PDFs
def example_list_and_select():
    """Example: list the PDFs in the bucket and process the first one.

    Returns:
        ``(summary, markdown)`` from ``process_pdf``, or ``(None, None)`` when
        the bucket listing contains no PDFs. Errors raised by ``process_pdf``
        propagate to the caller.
    """

    bucket_name = "pdftomarkdown"

    # List available PDFs
    pdf_files = list_pdfs_in_bucket(bucket_name)

    if not pdf_files:
        print("No PDF files found to process")
        # Keep the two-value shape so `summary, markdown = example_list_and_select()`
        # still unpacks when there is nothing to process.
        return None, None

    # Process the first PDF as example
    selected_pdf = pdf_files[0]
    print(f"\n🎯 Processing selected file: {selected_pdf}")

    summary, markdown = process_pdf(bucket_name, selected_pdf)
    preview_markdown(markdown, max_lines=30)

    return summary, markdown

# Example 3: Batch processing
def example_batch_processing():
    """Example: batch-process the first three PDFs found in the bucket.

    Displays a table with one row per processed file (file, status, page and
    image counts, or 'N/A' for failed files) and returns the list produced by
    ``batch_process_pdfs``. Returns None when the bucket has no PDFs.
    """

    bucket_name = "pdftomarkdown"

    # List PDFs
    pdf_files = list_pdfs_in_bucket(bucket_name)

    if not pdf_files:
        print("No PDF files found for batch processing")
        return

    # Process first 3 files as example
    results = batch_process_pdfs(bucket_name, pdf_files, max_files=3)

    # Display results summary
    rows = []
    for record in results:
        succeeded = record['status'] == 'success'
        rows.append({
            'File': record['file'],
            'Status': record['status'],
            'Pages': record['summary']['pages_processed'] if succeeded else 'N/A',
            'Images': record['summary']['images_extracted'] if succeeded else 'N/A',
        })
    success_df = pd.DataFrame(rows)

    display(success_df)
    return results

# Uncomment the example you want to run:
# summary, markdown = example_single_pdf()
# summary, markdown = example_list_and_select()
# results = example_batch_processing()


# S3 Buckets
# Module-level copies of the bucket/folder names. The values equal the literal
# defaults in the Config class; the functions in this notebook read
# `config.OUTPUT_BUCKET` / `config.IMAGES_FOLDER`, not these names.
# NOTE(review): these look like the contents of the .env file pasted into the
# cell — confirm whether they are still needed.
INPUT_BUCKET="pdftomarkdown"
OUTPUT_BUCKET="pdftomarkdown-output"
IMAGES_FOLDER="extracted_images"

# Bedrock Configuration
# Not read by any function visible in this notebook.
BEDROCK_REGION='us-east-1'


## Usage Examples:
# Cell 9: Quick start example
# Update these values with your actual bucket and file.
# INPUT_BUCKET is assigned a second time here, with the same value as above.
INPUT_BUCKET = "pdftomarkdown"
PDF_FILE = "Inputfiles/Customer 1 - iPhone 8 Plus Battery Replacement.pdf"

# Process the PDF: runs the whole pipeline (S3 download, image upload, Bedrock
# call, Markdown upload) as soon as this cell executes.
summary, markdown_content = process_pdf(INPUT_BUCKET, PDF_FILE)

# Preview the results: first 20 raw lines plus the rendered excerpt.
preview_markdown(markdown_content, max_lines=20)

🚀 Starting PDF processing...
   Source: s3://pdftomarkdown/Inputfiles/Customer 1 - iPhone 8 Plus Battery Replacement.pdf
   Output: s3://pdftomarkdown-output
------------------------------------------------------------
📥 Downloading PDF: s3://pdftomarkdown/Inputfiles/Customer 1 - iPhone 8 Plus Battery Replacement.pdf
✅ Downloaded 2258521 bytes
🔍 Opening PDF and extracting content...
📄 Processing 16 pages...
   Page 1: Found 2 images
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_1_img_1.png
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_1_img_2.png
   Page 2: Found 4 images
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_2_img_1.png
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_2_img_2.png
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_2_img_3.png
      ✅ Uploaded: Customer 1 - iPhone 8 Plus Battery Replacement_page_2_img_4.png
   Page 3: Found 2 images
      ✅ Uploaded: C

Unnamed: 0,Page,Text Length,Images,Sample Text
0,1,1163,2,iPhone 8 Plus Battery Replacement \nVersion 1 ...
1,2,722,4,iPhone 8 Plus Battery Replacement \nVersion 1 ...
2,3,1008,2,iPhone 8 Plus Battery Replacement \nVersion 1 ...


📖 Markdown Preview (first 20 lines):
------------------------------------------------------------
  1: # iPhone 8 Plus Battery Replacement
  2: **Version 1** | **January 17, 2018** | **Page 1 of 16**
  3: 
  4: | Major Steps | Key Details | Safety/Exceptions/Reasons/Images |
  5: |-------------|-------------|----------------------------------|
  6: | **1. Pentalobe Screws** | **1B. Power off your iPhone before beginning disassembly.** <br><br> **1C. Remove the two 3.5 mm pentalobe screws from the bottom edge of the iPhone.** | **1A. Before you begin, discharge your iPhone battery below 25%. A charged lithium-ion battery can catch fire and/or explode if accidentally punctured.** <br><br> **1C.** ![](https://pdftomarkdown-output.s3.amazonaws.com/extracted_images/Customer%201%20-%20iPhone%208%20Plus%20Battery%20Replacement_page_1_img_1.png) <br><br> **1D. Opening the iPhone's display will compromise its waterproof seals. Have replacement seals ready before you proceed past this step, or t

# iPhone 8 Plus Battery Replacement
**Version 1** | **January 17, 2018** | **Page 1 of 16**

| Major Steps | Key Details | Safety/Exceptions/Reasons/Images |
|-------------|-------------|----------------------------------|
| **1. Pentalobe Screws** | **1B. Power off your iPhone before beginning disassembly.** <br><br> **1C. Remove the two 3.5 mm pentalobe screws from the bottom edge of the iPhone.** | **1A. Before you begin, discharge your iPhone battery below 25%. A charged lithium-ion battery can catch fire and/or explode if accidentally punctured.** <br><br> **1C.** ![](https://pdftomarkdown-output.s3.amazonaws.com/extracted_images/Customer%201%20-%20iPhone%208%20Plus%20Battery%20Replacement_page_1_img_1.png) <br><br> **1D. Opening the iPhone's display will compromise its waterproof seals. Have replacement seals ready before you proceed past this step, or take care to avoid liquid exposure if you reassemble your iPhone without replacing the seals.** |
| **2. Opening Procedure** | **

... (truncated)