# PDF to Markdown

In [28]:
from pyhere import here
import sys
import os
from openai import OpenAI
from IPython.display import display_markdown

# Put ../../src (resolved against the current working directory) on the import path.
# NOTE(review): presumably this is where the local `pdftools` package imported
# in a later cell lives — confirm; the path breaks if the kernel's cwd changes.
sys.path.append(os.path.abspath('../../src'))

# Read the API key from the environment instead of hardcoding it in the notebook;
# os.getenv returns None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")
# Module-level client that ocr_single_image_to_markdown uses for its API calls.
client = OpenAI(api_key=api_key)

## Extract: PDF -> Base64 Image

In [19]:
from pdftools.pdf2image import pdf_to_base64_images

In [35]:
# Convert the Cochran (1977) chapter-1 PDF into base64-encoded images.
# NOTE(review): later cells index and slice this result and treat element 0 as a
# string, so it is presumably a list with one base64 string per page — confirm
# against pdftools.pdf2image.
cochran_img_base64 = pdf_to_base64_images(here("pdf/textbook/Cochran_1977_SamplingTechniques_Ch1.pdf"))

## LLM: OCR to Markdown

In [96]:
def ocr_single_image_to_markdown(
    base64_image,
    model="gpt-4o",
    md_format="Github-flavored markdown",
    heading_lv_max="H2",
):
    """Ask a chat-completions model to transcribe one image into Markdown.

    Args:
        base64_image: Base64 payload of the image; it is embedded in a
            ``data:image/png;base64,...`` URL.
        model: Model name passed to the chat-completions endpoint.
        md_format: Markdown dialect named in the system prompt.
        heading_lv_max: Highest heading level the prompt allows.

    Returns:
        The ``content`` of the first choice's message in the API response.

    Uses the module-level ``client``; the request is sent with
    ``temperature=0.0`` and image detail ``"high"``.
    """
    # The prompt text is kept exactly as authored, including its indentation.
    system_prompt = f"""
    You are an advanced OCR-based data extraction tool designed to convert text, tables, and structured content from images into {md_format}. Ensure the output retains the original layout and information integrity as closely as possible. Include headers, bullet points, or tables where appropriate, and optimize for readability in Markdown syntax.
    - The maximal heading level is {heading_lv_max}. 
    - Return markdown text output without enclosing in code block. If the image is blank or no appropriate content can be extracted, return empty text string ("").  
    """

    # User turn: a short instruction plus the image as an inline data URL.
    image_data_url = f"data:image/png;base64,{base64_image}"
    user_parts = [
        {"type": "text", "text": "Convert data from this image to markdown text"},
        {"type": "image_url", "image_url": {"url": image_data_url, "detail": "high"}},
    ]
    conversation = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_parts},
    ]

    completion = client.chat.completions.create(
        model=model,
        messages=conversation,
        temperature=0.0,
    )
    first_choice = completion.choices[0]
    return first_choice.message.content


## Extract One Image

In [72]:
# OCR only the first page. The helper expects ONE base64 string, whereas
# cochran_img_base64 is a per-page collection (it is sliced and indexed in
# later cells), so passing it whole would embed a list repr in the data URL.
cochran_img_md_0 = ocr_single_image_to_markdown(cochran_img_base64[0])

In [44]:
# Render the OCR result as Markdown. raw=True marks the string as Markdown
# source; wrapping it in print() only emitted plain text and handed None
# (print's return value) to display_markdown.
display_markdown(cochran_img_md_0, raw=True)

## Sampling Techniques

*third edition*

**WILLIAM G. COCHRAN**  
*Professor of Statistics, Emeritus  
Harvard University*

**JOHN WILEY & SONS**  
New York • Chichester • Brisbane • Toronto • Singapore


## Extract Multiple Images

In [95]:
def ocr_image_to_markdown(base64_images: list[str] | str, **kwarg):
    
    # Single Image
    is_single_image = all([len(x) == 1 for x in base64_images])
    if is_single_image:
        md_text = ocr_single_image_to_markdown(base64_images, **kwarg)
        return md_text
    # Multiple Images
    try:
        
        md_text_ls = [ocr_single_image_to_markdown(base64_image, **kwarg) for base64_image in base64_images] 
        md_text_ls_rm_blank = list(filter(None, md_text_ls)) # Remove blank string ("")
        md_text = "\n\n---\n\n".join(md_text_ls_rm_blank)
        return md_text
    
    except Exception as e:
        print(f"Error extract Image: {e}")
        return []
    

In [97]:
# OCR the first five entries of cochran_img_base64 in one call; model="gpt-4o-mini"
# is forwarded through **kwarg and overrides the helper's default "gpt-4o".
cochran_img_md_05 = ocr_image_to_markdown(cochran_img_base64[0:5], model = "gpt-4o-mini")

In [98]:
# Render the combined OCR result as Markdown. raw=True marks the string as
# Markdown source; wrapping it in print() only emitted plain text and handed
# None (print's return value) to display_markdown.
display_markdown(cochran_img_md_05, raw=True)

# Sampling Techniques

## third edition

**WILLIAM G. COCHRAN**  
Professor of Statistics, Emeritus  
Harvard University

---

**JOHN WILEY & SONS**  
New York • Chichester • Brisbane • Toronto • Singapore

---

Copyright © 1977, by John Wiley & Sons, Inc.  
All rights reserved. Published simultaneously in Canada.  

Reproduction or translation of any part of this work beyond that permitted by Sections 107 or 108 of the 1976 United States Copyright Act without the permission of the copyright owner is unlawful. Requests for permission or further information should be addressed to the Permissions Department, John Wiley & Sons, Inc.  

## Library of Congress Cataloging in Publication Data:  
Cochran, William Gemmell, 1909-  
Sampling techniques.  

(Wiley series in probability and mathematical statistics)  
Includes bibliographical references and index.  
1. Sampling (Statistics) 1. Title.  

QA276.6.C6 1977 001.4222 77-728  
ISBN 0-471-16240-X  

Printed in the United States of America  

### HowTo

In [None]:
# Iterating over a str yields 1-character strings, so this is True when the
# argument is a single base64 string.
# NOTE(review): it is also True for an empty sequence and for a list of
# 1-character strings; isinstance(x, str) tests the type directly.
all([len(x) == 1 for x in cochran_img_base64[0]])

True

In [92]:
# Remove blank strings: filter(None, ...) keeps only truthy items, and "" is falsy.
list(filter(None, ["A", "", ""])) 

['A']