# Convert MBL pdf to HTML
The MBL policy is a complext set of tables sued to stucture the 
informatoin. This extensive use of tables did not convert well 
using standard tools such as mathpix or marker so we we use Claude sonnet 
to convert the 

In [1]:
import os
from io import BytesIO
from pypdf import PdfReader, PdfWriter
import base64
from anthropic import Anthropic
from loguru import logger
from IPython.display import HTML
from dotenv import load_dotenv
from bs4 import BeautifulSoup

load_dotenv()

True

In [2]:
def pdf_to_html(pdf_base64_string: str) -> str:
        
    prompt = """Convert the PDF to a valid HTML including the <html>, <head> and <body> tags. Do not summarize or change the text, 
                just convert the PDF to HTML keeping the formatting as close as 
                possible to the original PDF. Remove page headers, footers, and page numbers."""
    messages = [
        {
            "role": 'user',
            "content": [
                {"type": "document", "source": {"type": "base64", "media_type": "application/pdf", "data": pdf_base64_string}},
                {"type": "text", "text": prompt}
            ]
        }
    ]

    # While PDF support is in beta, you must pass in the correct beta header
    client = Anthropic(default_headers={"anthropic-beta": "pdfs-2024-09-25"})
    
    # For now, only claude-3-5-sonnet-20241022 supports PDFs
    result = client.messages.create(
            model="claude-3-5-sonnet-20241022",
            max_tokens=8192,
            messages=messages
        )

    cost_cents = (result.usage.input_tokens * 3/1_000_000 + result.usage.output_tokens * 15/1_000_000) *100
    logger.info(f"cost ¢{cost_cents:.2f}, input_tokens={result.usage.input_tokens}, output_tokens={result.usage.output_tokens}")

    return result.content[0].text


def convert_page(page:str, n:int, cache_dir:str) -> str:
    
    file_name = f"{cache_dir}/page_{n}.html"
    if os.path.exists(file_name):
        logger.info(f"Using cached result in {file_name}")
        with open(file_name, "r") as f:
            return f.read()
        
    logger.info(f"Converting page {n}")
    # Create a new PDF writer with a single page
    writer = PdfWriter()
    writer.add_page(page)

    # Write to a BytesIO object
    pdf_bytes = BytesIO()
    writer.write(pdf_bytes)

    # PDF encoding as base64
    base64_string = base64.standard_b64encode(pdf_bytes.getvalue()).decode("utf-8")
    result = pdf_to_html(base64_string)
    
    # Cache the result to an HTML file
    with open(file_name, "w") as f:
        logger.info(f"Writing {file_name}")
        f.write(result)

    return result


# Read the PDF and write page 6 to memory
def convert_pdf(pdf_path:str, skip_pages: list = []) -> str:
    reader = PdfReader(pdf_path)
    style= """<style>
        table {
            border-collapse: collapse;
            width: 100%;
            margin-bottom: 20px;
        }
        th, td {
            border: 1px solid black;
            padding: 8px;
            vertical-align: top;
        }
    </style>"""
    doc_soup = BeautifulSoup(f"<html><head>{style}</head><body></body></html>", 'html.parser')
    
    base_name, _ = os.path.splitext(os.path.basename(pdf_path))
    directory = os.path.dirname(pdf_path)
    html_path = f"{directory}/{base_name}.html"
    cache_dir = f"{directory}/.html_cache"
    if not os.path.exists(cache_dir):
        os.makedirs(cache_dir)

    for i, page in enumerate(reader.pages):
        html = convert_page(page, i, cache_dir)
        if i in skip_pages:
            logger.info(f"Skipping page {i}")
        else:
            if page_soup := BeautifulSoup(html, 'html.parser').body:
                doc_soup.body.extend(page_soup.contents)
            else:
                logger.warning(f"Page {i} has no body")
    
    with open(html_path, "w") as f:
        f.write(str(doc_soup))
    
    return str(doc_soup)


In [4]:
policy_file = "./Manual/macquarie_residential_home_loans_credit_guidelines.pdf" 
html = convert_pdf(policy_file, skip_pages=[0, 1,2])
# HTML(html)

[32m2025-02-10 10:54:07.188[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_page[0m:[36m40[0m - [1mConverting page 0[0m


[32m2025-02-10 10:54:24.013[0m | [1mINFO    [0m | [36m__main__[0m:[36mpdf_to_html[0m:[36m27[0m - [1mcost ¢1.82, input_tokens=1883, output_tokens=839[0m
[32m2025-02-10 10:54:24.014[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_page[0m:[36m55[0m - [1mWriting ./Manual/.html_cache/page_0.html[0m
[32m2025-02-10 10:54:24.019[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_pdf[0m:[36m88[0m - [1mSkipping page 0[0m
[32m2025-02-10 10:54:24.022[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_page[0m:[36m40[0m - [1mConverting page 1[0m
[32m2025-02-10 10:54:41.179[0m | [1mINFO    [0m | [36m__main__[0m:[36mpdf_to_html[0m:[36m27[0m - [1mcost ¢1.72, input_tokens=2039, output_tokens=742[0m
[32m2025-02-10 10:54:41.180[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_page[0m:[36m55[0m - [1mWriting ./Manual/.html_cache/page_1.html[0m
[32m2025-02-10 10:54:41.182[0m | [1mINFO    [0m | [36m__main__[0m:[36mconvert_pdf[0m: