# Pandoc Sidecar Demo

This notebook demonstrates how to run the pandoc CLI from the `pandoc/extra` Docker image to render LaTeX files.

## Prerequisites

Before running this demo, ensure you have the following installed:
- Docker (for containerized pandoc)
- Minikube (for Kubernetes deployment)
- Python 3.12+ (the `subprocess` module used here is part of the standard library)

In [2]:
import subprocess
import os
import sys

## 1. Pull the pandoc/extra image

In [None]:
def pull_image(image, platform) -> str:
    """Run ``docker pull`` for ``image`` and return the command's stdout.

    Args:
        image: Image reference handed to ``docker pull``.
        platform: Value for ``--platform``; the flag is omitted when this is
            falsy (``None`` or an empty string).

    Returns:
        The captured standard output of ``docker pull``.

    Raises:
        subprocess.CalledProcessError: ``docker pull`` exited with a non-zero status.
        FileNotFoundError: the ``docker`` executable could not be found.
    """
    platform_args = ['--platform', platform] if platform else []
    cmd = ['docker', 'pull', *platform_args, image]

    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return completed.stdout

# Pull the pandoc/extra:3.7 image.
# Note: --platform linux/amd64 is required because the dev machine is ARM-based.
try:
    result = pull_image(
        image='pandoc/extra:3.7',
        platform='linux/amd64',
    )
except subprocess.CalledProcessError as e:
    # Always report a failed pull: stderr can be empty, in which case the
    # exit status is the only information available.
    detail = (e.stderr or "").strip() or f"docker pull exited with status {e.returncode}"
    print("Error:", detail)
except FileNotFoundError:
    print("Docker not found. Please install Docker first.")
else:
    if result:
        print("Output:", result.strip())

Output: 3.7: Pulling from pandoc/extra
Digest: sha256:a703d335fa237f8fc3303329d87e2555dca5187930da38bfa9010fa4e690933a
Status: Image is up to date for pandoc/extra:3.7
docker.io/pandoc/extra:3.7


## 2. Verify Docker image and test basic functionality

In [11]:
# Verify the pandoc/extra image and test basic functionality
def image_version(image) -> str:
    """Run ``image`` with ``--version`` in a throwaway container and return its stdout.

    Args:
        image: Image reference passed to ``docker run --rm``.

    Returns:
        The captured standard output of the container.

    Raises:
        subprocess.CalledProcessError: ``docker run`` exited with a non-zero status.
    """
    cmd = ['docker', 'run', '--rm', image, '--version']
    completed = subprocess.run(cmd, capture_output=True, text=True, check=True)
    return completed.stdout

try:
    # Check if image exists locally.
    # `docker images <ref>` exits 0 even when nothing matches and then prints
    # only the table header, so success of the command alone proves nothing:
    # look for at least one data row below the header.
    result = subprocess.run(['docker', 'images', 'pandoc/extra:3.7', '--format', 'table'],
                            capture_output=True, text=True, check=True)
    image_rows = [
        line for line in result.stdout.splitlines()
        if line.strip() and not line.startswith('REPOSITORY')
    ]

    if not image_rows:
        print("✗ pandoc/extra:3.7 image not found locally. Run the pull step first.")
    else:
        print("✓ pandoc/extra:3.7 image found locally")
        print("Image details:")
        print(result.stdout)

        # Test pandoc version in container
        print("\nTesting pandoc version in container...")
        version_output = image_version('pandoc/extra:3.7')
        print("✓ Pandoc container working")
        print("Version info:")
        print(version_output.split('\n')[0])  # First line contains version

except subprocess.CalledProcessError as e:
    print(f"✗ Error testing Docker image: {e}")
    if e.stderr:
        print("Error details:", e.stderr.strip())
except FileNotFoundError:
    # Raised by subprocess when the `docker` executable itself is missing.
    print("Docker not found. Please install Docker first.")

✓ pandoc/extra:3.7 image found locally
Image details:
REPOSITORY     TAG       IMAGE ID       CREATED      SIZE
pandoc/extra   3.7       a703d335fa23   8 days ago   1.63GB


Testing pandoc version in container...
✓ Pandoc container working
Version info:
pandoc 3.7.0.2


## 3. Setup input and output directories

In [None]:
# Working directories for the demo, relative to the current working directory.
input_dir, output_dir = "input", "output"

# exist_ok=True makes this cell safe to re-run when the directories already exist.
os.makedirs(output_dir, exist_ok=True)
os.makedirs(input_dir, exist_ok=True)

print(
    f"✓ Input directory: {input_dir}",
    f"✓ Output directory: {output_dir}",
    sep="\n",
)

# Scan for input files
def get_input_files(directory, extensions=None):
    """Return the sorted names of regular files in ``directory`` matching ``extensions``.

    Args:
        directory: Directory to scan (not recursive).
        extensions: Iterable of filename suffixes such as ``'.tex'``. Matching
            is case-insensitive. Defaults to ``.tex``, ``.md``, ``.rst``,
            ``.docx`` and ``.html``.

    Returns:
        A sorted list of file names (not full paths). Empty when ``directory``
        does not exist or is not a directory.
    """
    if extensions is None:
        extensions = ['.tex', '.md', '.rst', '.docx', '.html']
    # str.endswith accepts a tuple; lower-casing both sides keeps the match
    # case-insensitive even if the caller passes e.g. '.TEX'.
    suffixes = tuple(ext.lower() for ext in extensions)

    # isdir (rather than exists) so that passing a regular file yields an
    # empty result instead of an error from os.listdir.
    if not os.path.isdir(directory):
        return []

    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith(suffixes)
        # Skip sub-directories that merely happen to be named like a document.
        and os.path.isfile(os.path.join(directory, name))
    )

# Discover the processable files and report each one with its size.
input_files = get_input_files(input_dir)
print(f"\nFound {len(input_files)} files to process:")

if input_files:
    for file in input_files:
        file_path = os.path.join(input_dir, file)
        # Report 0 bytes if the file is no longer present when it is inspected.
        if os.path.exists(file_path):
            file_size = os.path.getsize(file_path)
        else:
            file_size = 0
        print(f"{file} ({file_size:,} bytes)")
else:
    print("No files found in input directory. Add some .tex, .md, or other supported files to get started.")

✓ Input directory: input
✓ Output directory: output

Found 1 files to process:
  📄 basic_example.tex (235 bytes)


## 4. Process files with the pandoc CLI

In [None]:
# Batch process all input files with pandoc CLI
def _to_container_path(host_path, mount_root, mount_point='/data'):
    """Translate a host path located under ``mount_root`` to its path inside the container."""
    relative = os.path.relpath(host_path, mount_root)
    if relative == os.pardir or relative.startswith(os.pardir + os.sep):
        # Anything outside the mounted directory is invisible to the container.
        raise ValueError(f'{host_path} is outside the mounted directory {mount_root}')
    # Always join with '/' so host-specific separators never reach the container.
    return f"{mount_point}/{relative.replace(os.sep, '/')}"


def pandoc_convert_file(input_file, output_format, input_dir, output_dir, image,
                        platform='linux/amd64'):
    """Convert a single file by running pandoc inside a Docker container.

    The current working directory is bind-mounted at ``/data`` in the
    container, so ``input_dir`` and ``output_dir`` must be located inside it.

    Args:
        input_file: File name (relative to ``input_dir``) to convert.
        output_format: One of ``'pdf'``, ``'html'``, ``'md'`` or ``'docx'``;
            also used as the output file extension.
        input_dir: Host directory containing ``input_file``.
        output_dir: Host directory for the converted file; created if missing.
        image: Docker image providing pandoc.
        platform: Value for ``docker run --platform``; the flag is omitted
            when this is falsy.

    Returns:
        The host path of the converted file (``output_dir`` joined with the
        input's base name and the new extension).

    Raises:
        ValueError: ``output_format`` is unsupported, or a path lies outside
            the mounted working directory.
        subprocess.CalledProcessError: the container exited with a non-zero
            status; pandoc's message is available on ``.stderr``.
        RuntimeError: the container succeeded but the output file is missing.
    """
    # Extra pandoc options per output format.
    format_args = {
        'pdf': ['--pdf-engine=pdflatex'],
        'html': ['--standalone', '--mathjax'],
        'md': ['--to=markdown'],
        'docx': [],
    }
    if output_format not in format_args:
        raise ValueError(f'unsupported output format: {output_format!r}')

    input_path = os.path.join(input_dir, input_file)
    base_name = os.path.splitext(input_file)[0]
    output_path = os.path.join(output_dir, f"{base_name}.{output_format}")

    mount_root = os.path.abspath('.')
    container_input = _to_container_path(input_path, mount_root)
    container_output = _to_container_path(output_path, mount_root)

    # pandoc does not create missing parent directories for its output file.
    os.makedirs(output_dir, exist_ok=True)

    cmd = ['docker', 'run']
    if platform:
        cmd += ['--platform', platform]
    cmd += [
        '--rm',
        '-v', f'{mount_root}:/data',
        image,
        container_input,
        '-o', container_output,
        *format_args[output_format],
    ]

    # check=True turns a pandoc failure into CalledProcessError for the caller.
    subprocess.run(cmd, capture_output=True, text=True, check=True)

    if not os.path.exists(output_path):
        raise RuntimeError(f'{output_format.upper()} output file not created')

    return output_path


# Process all input files.
# Note: in production only the main file will need to be processed;
# each file may have a collection of images, supporting docs, etc.
converted_files = []
failed_files = []

if input_files:
    for input_file in input_files:
        print(f"\nProcessing: {input_file}")
        try:
            # Use the same input_dir that input_files was scanned from.
            file_result = pandoc_convert_file(
                input_file,
                output_format='pdf',
                input_dir=input_dir,
                output_dir=output_dir,
                image='pandoc/extra:3.7'
            )
        except subprocess.CalledProcessError as e:
            # One bad document should not abort the batch; surface pandoc's
            # own message, which is otherwise hidden by capture_output.
            failed_files.append(input_file)
            print(f"✗ Conversion failed: {input_file}")
            if e.stderr:
                print("Error details:", e.stderr.strip())
            continue
        except FileNotFoundError:
            # The docker executable is missing, so no later file can succeed.
            print("Docker not found. Please install Docker first.")
            break
        converted_files.append(file_result)

    # Summary: report every converted file, not only the last one.
    for converted_path in converted_files:
        print(f"\nConverted: {converted_path}")
    if failed_files:
        print(f"\nFailed: {', '.join(failed_files)}")
else:
    print("No input files to process.")

Processing files with local pandoc (if available):

Processing: basic_example.tex

📊 Summary: Converted ./output/basic_example.pdf


## 5. Cleanup output files

In [None]:
# Directory names used by the cleanup step (same values as the setup cell above).
input_dir, output_dir = "input", "output"

# Delete generated files
def remove_files(directory, extensions=None) -> list[str]:
    """Remove the regular files in a directory and return their names.

    Args:
        directory: Directory to clean (not recursive; sub-directories are
            left in place).
        extensions: Optional iterable of filename suffixes such as ``'.pdf'``.
            When given, only files whose name ends with one of them
            (case-insensitive) are removed. ``None`` removes every file.

    Returns:
        The sorted names (not full paths) of the files that were removed.
        Empty when ``directory`` does not exist or is not a directory.
    """
    files = []
    if not os.path.isdir(directory):
        return files

    suffixes = None if extensions is None else tuple(ext.lower() for ext in extensions)

    for name in sorted(os.listdir(directory)):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        if suffixes is not None and not name.lower().endswith(suffixes):
            continue
        # Delete first, then record, so the list only holds files that are gone.
        os.remove(path)
        files.append(name)
    return files