# PDF to Markdown with Memory

In [1]:
from pyhere import here
import sys
import os
from pathlib import Path
from openai import OpenAI
from IPython.display import display_markdown

# Put the directory two levels above the working directory on the import path
# so the `src.*` imports used by later cells resolve.
# NOTE(review): assumes that directory is the repository root and that the
# kernel's working directory is this notebook's folder — confirm.
sys.path.append(os.path.abspath('../..'))

# Read the API key from the environment instead of hard-coding it;
# `os.getenv` yields None when OPENAI_API_KEY is not set.
api_key = os.getenv("OPENAI_API_KEY")
client = OpenAI(api_key=api_key)

## Final Wrapper

In [71]:
from src.openai_tools.ocr_md_batch import ocr_pdf_batch_to_markdown
from src.fs import write_text_file

In [70]:
%%time
# Convert the TI-RADS paper to Markdown with the packaged wrapper.
# NOTE(review): batch_size = 3 presumably means three pages per model
# conversation — confirm against src.openai_tools.ocr_md_batch.
tirads_md = ocr_pdf_batch_to_markdown(here("pdf/paper/TI-RADS_A User’s Guide.pdf"),
                                       model = "gpt-4o", batch_size = 3)


CPU times: user 522 ms, sys: 27.4 ms, total: 549 ms
Wall time: 2min 9s


In [72]:
# Save the converted Markdown; `here` resolves the project-relative output path.
write_text_file(tirads_md, here("output/markdown/TI-RADS_A-User-Guide_batched.md"))

Text successfully written to /Users/kittipos/Documents/LLM/llm-notes/output/markdown/TI-RADS_A-User-Guide_batched.md.


## PDF -> Base64 Image

In [2]:
from src.pdftools import pdf_to_base64_images

# Encode two PDFs as base64 images: the full chapter 1 file and a second file
# whose name suggests a single-page (pg14) extract.
# NOTE(review): presumably one base64 string per page — confirm in src.pdftools.
cochran_img_base64 = pdf_to_base64_images(here("pdf/textbook/Cochran_1977_SamplingTechniques_Ch1.pdf"))
cochran_pg14_img_base64 = pdf_to_base64_images(here("pdf/textbook/Cochran_1977_SamplingTechniques_Ch1_pg14.pdf"))

In [3]:
# Peek at the first three encoded pages.
# NOTE(review): this echoes the complete base64 payloads into the cell output;
# consider previewing only a short prefix of each string.
cochran_img_base64[0:3]

['iVBORw0KGgoAAAANSUhEUgAAAYAAAAJPCAIAAAD685QUAAB1o0lEQVR4nO3dd3wU1f4//pnZXrLZzaZn00kvpJCQEDAQQm+hqyhNRIoK9nbVqyhgV7CCAgKiFOkt9FACgZCekN57drO97878/ji/u598Q7mo6OTq+/mHD9zMzpwt89pzzpxzBqcoCgMAADoQdBcAAPDPBQEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwEEAKANBBAAgDYQQAAA2kAAAQBoAwFED4qirFarVquluyAA0IlJdwH+cSiKksvlly9fzs/PHzJkyPTp0+kuEQC0gQD665AkefLkyUuXLlVUVNTW1qrVapFIRHeh/g6ampqKioqmTJlCEFCj/x8DAfTXoSiqurq6pKTk3LlzJpPJ1dX1jpsVFxfX19fHxcX5+fkxGIy/uJD/K9Rq9ZUrVwoLCwsKCnp6enQ63YQJE9hs9l9z6JKSEovFEhsb6+bm9hcc8W9s4AaQ3W6vqalpa2uzWCwcDic4ONjf35/uQv0hBEHMnj179OjRx48ff/XVV++4zc2bN9evX19QUBAVFfX5558HBQX9xYW8m8rKyiNHjqSmpgqFQhaLZbVaeTye3W63WCyObWw2m8Vi4fP5/Z

## OCR a Batch of Images to a List of Markdown Strings

In [16]:
from src.openai_tools.mem_vision import get_completions_vision_mem_df

def ocr_batch_image_to_markdown(base64_images,
                                model = "gpt-4o",
                                md_format = "Github-flavored markdown",
                                heading_lv_max = "H2"
                                ):
    """OCR a batch of page images into Markdown using a vision chat model.

    Builds a system prompt describing the desired Markdown output and hands the
    images to `get_completions_vision_mem_df`, which is asked to convert the
    first page and then continue with "Next page" for the following ones.

    Args:
        base64_images: Base64-encoded page images, forwarded unchanged to
            `get_completions_vision_mem_df`.
        model: Name of the model to use.
        md_format: Markdown dialect named in the system prompt.
        heading_lv_max: Highest heading level the model is told to emit.

    Returns:
        list: The values of the `assistant_text` column of the returned frame.
        NOTE(review): presumably one Markdown string per input image — confirm
        against `get_completions_vision_mem_df`.
    """
    # The backslash in `\\times` is escaped on purpose: in a non-raw string
    # `\t` is a TAB, which would corrupt the LaTeX example shown to the model.
    system_prompt = f"""
    You are an advanced OCR-based data extraction tool designed to convert text, tables, and structured content from images of document pages into {md_format}. 
    - Each image will represent a single page of a document.
    - Ensure the output retains the original layout and information integrity as closely as possible. Include headers, bullet points, or tables where appropriate, and optimize for readability in Markdown syntax.
    
    Here are the Markdown specification:
    **Heading level:** The highest level of heading is {heading_lv_max}. 
    **LaTeX Math expression**
    - Inline: surround the inline expression with dollar symbols, for example: $1+1 = 2$
    - Blocks: delimit the block expression with two dollar symbols, for example:
      $$
      E = m \\times c^2 
      $$
    
    Return markdown text output without enclosing in code block. If any page is blank or no appropriate content can be extracted, return empty text string ("").  
    """
    
    response_df = get_completions_vision_mem_df(image_prompt="Convert data from this page to markdown text",
                                                image_prompt_next="Next page",
                                                base64_images=base64_images,
                                                system_prompt=system_prompt,
                                                model=model)
    
    # Only the assistant's replies are of interest to callers.
    response_ls = response_df["assistant_text"].to_list()
    
    return response_ls

In [18]:
# OCR the first three pages in a single call, using the default model and Markdown settings.
cochran_img_md_0_3 = ocr_batch_image_to_markdown(base64_images = cochran_img_base64[0:3])

In [19]:
# Render the first three OCR'd pages as Markdown, separated by horizontal rules.
# The text itself is passed (not the result of print(), which is None), and
# raw=True tells display_markdown that the string already is Markdown source.
pages_md = cochran_img_md_0_3[:3]
display_markdown("\n\n---\n\n".join(pages_md), raw=True)

## Sampling Techniques

*third edition*

**WILLIAM G. COCHRAN**

*Professor of Statistics, Emeritus  
Harvard University*

**JOHN WILEY & SONS**  
New York • Chichester • Brisbane • Toronto • Singapore


---


Copyright © 1977, by John Wiley & Sons, Inc.

All rights reserved. Published simultaneously in Canada.

Reproduction or translation of any part of this work beyond that permitted by Sections 107 or 108 of the 1976 United States Copyright Act without the permission of the copyright owner is unlawful. Requests for permission or further information should be addressed to the Permissions Department, John Wiley & Sons, Inc.

**Library of Congress Cataloging in Publication Data:**  
Cochran, William Gemmell, 1909—
Sampling techniques.

(Wiley series in probability and mathematical statistics)  
Includes bibliographical references and index.  
1. Sampling (Statistics) I. Title.

QA276.6.C6 1977 001.4'222 77-728  
ISBN 0-471-16240-X

Printed in the United States of America

40 39 38 37 3

## Extract Multiple Image Batches

In [63]:
def ocr_image_batch_to_markdown(base64_images: list[str] | str, batch_size = 3, **kwarg):
    """Convert one or multiple base64-encoded images to Markdown text with batched memory."""
    
    # Single Page
    is_single_image = all([len(x) == 1 for x in base64_images])
    if is_single_image:
        md_text = ocr_batch_image_to_markdown(base64_images, **kwarg)
        return md_text
    
    # Multiple Pages
    base64_images_batched = _slice_list(base64_images, batch_size = batch_size)
    
    out_ls_nested = [] # Will be Nested list
    
    for base64_images_ls in base64_images_batched:
        md_text_ls = ocr_batch_image_to_markdown(base64_images_ls, **kwarg)
        md_text_ls_rm_blank = list(filter(None, md_text_ls)) # Remove blank string ("")
        out_ls_nested.append(md_text_ls_rm_blank) 
        
    out_ls = [item for sublist in out_ls_nested for item in sublist] # Un-nest List
    md_text = "\n\n---\n\n".join(out_ls)
        
    return md_text
    

In [65]:
# OCR the first nine pages, relying on the default batch_size.
cochran_img_md_0_9 = ocr_image_batch_to_markdown(cochran_img_base64[0:9])

In [66]:
# Inspect the result; a bare expression shows the string repr, not rendered Markdown.
cochran_img_md_0_9

"## Sampling Techniques\n\n*third edition*\n\n**WILLIAM G. COCHRAN**\n\n*Professor of Statistics, Emeritus  \nHarvard University*\n\n**JOHN WILEY & SONS**  \nNew York · Chichester · Brisbane · Toronto · Singapore\n\n---\n\nCopyright © 1977, by John Wiley & Sons, Inc.\n\nAll rights reserved. Published simultaneously in Canada.\n\nReproduction or translation of any part of this work beyond that permitted by Sections 107 or 108 of the 1976 United States Copyright Act without the permission of the copyright owner is unlawful. Requests for permission or further information should be addressed to the Permissions Department, John Wiley & Sons, Inc.\n\n**Library of Congress Cataloging in Publication Data:**\n\nCochran, William Gemmell, 1909-  \nSampling techniques.\n\n(Wiley series in probability and mathematical statistics)  \nIncludes bibliographical references and index.\n\n1. Sampling (Statistics) I. Title.  \nQA276.6.C6 1977 001.4'222 77-728  \nISBN 0-471-16240-X\n\nPrinted in the United 

In [67]:
from pathlib import Path

# Persist the OCR text of the nine-page slice as a UTF-8 Markdown file.
file_path = Path(here("output/markdown/Cochran_1977_SamplingTechniques_Ch1_Batch_0-9.md"))

with file_path.open(mode="w", encoding="utf-8") as md_file:
    md_file.write(cochran_img_md_0_9)

#### How-to: Slice a List into Sublists of a Given Batch Size

In [35]:
def _slice_list(ls, batch_size):
    """Split a list into sublists with a maximum length of `batch_size`."""
    return [ls[i:(i + batch_size)] for i in range(0, len(ls), batch_size)]

# Demo: chunk the same seven letters with two different batch sizes.
letters = ["a", "b", "c", "d", "e", "f", "g"]
for size in (2, 3):
    print(_slice_list(letters, size))

[['a', 'b'], ['c', 'd'], ['e', 'f'], ['g']]
[['a', 'b', 'c'], ['d', 'e', 'f'], ['g']]
