# PDF Processing with LayoutLMv3

This notebook demonstrates how to process PDF files using LayoutLMv3 for layout understanding and image extraction.
Requires CUDA Toolkit 12.1 or later.

## Create the environment

GPU environment (Python 3.10):

```bash
conda create --name LMMRAGwithGPU python=3.10
```

In [None]:
# Install the dependencies into the running kernel's environment.
# (Comments are kept on their own lines so they are not passed to pip as arguments.)
# 1) Everything listed in reqgpu.txt.
%pip install -r reqgpu.txt
# 2) detectron2 0.6 from an additional package index.
#    NOTE(review): the "+pt2.3.1cu121" tag looks like a PyTorch 2.3.1 / CUDA 12.1 build - confirm it matches the installed torch.
%pip install --extra-index-url https://miropsota.github.io/torch_packages_builder detectron2==0.6+pt2.3.1cu121
# 3) Pin Pillow to 8.4.0.
#    NOTE(review): presumably installed last so the pin overrides any Pillow version pulled in by the steps above - confirm.
%pip install pillow==8.4.0

In [None]:
# Cell 1: Import necessary libraries
import os  
import cv2  
import yaml  
import time  
import pytz  
import datetime  
import json  
from PIL import Image  
from modules.extract_pdf import load_pdf_fitz  
from modules.layoutlmv3.model_init import Layoutlmv3_Predictor

In [None]:
# Cell 2: Define layout model initialization function
def layout_model_init(weight):
    """Construct the layout predictor used by this notebook.

    Args:
        weight: Forwarded unchanged as the only argument of
            ``Layoutlmv3_Predictor``.

    Returns:
        The newly created ``Layoutlmv3_Predictor`` instance.
    """
    return Layoutlmv3_Predictor(weight)

In [None]:
# Cell 3: Define function to save images
def save_image(image, output_dir, basename, page_idx, img_idx, img_format='png'):
    """Write ``image`` into ``output_dir`` under a page/image-numbered name.

    The file is named ``{basename}_page_{P}_img_{I}.{img_format}`` where
    ``P`` and ``I`` are the 1-based forms of ``page_idx`` and ``img_idx``.
    ``output_dir`` is created if it does not exist yet.

    Args:
        image: Object exposing ``save(path)`` (a PIL image in this notebook).
        output_dir: Directory that receives the file.
        basename: Prefix of the generated file name.
        page_idx: Zero-based page index.
        img_idx: Zero-based index of the image within the page.
        img_format: File extension to use, without the leading dot.

    Returns:
        The full path the image was saved to.
    """
    os.makedirs(output_dir, exist_ok=True)
    file_name = f'{basename}_page_{page_idx + 1}_img_{img_idx + 1}.{img_format}'
    destination = os.path.join(output_dir, file_name)
    image.save(destination)
    return destination

In [None]:
# Cell 4: Configure input/output locations and log the start time
output_dir = 'output'  # folder that receives everything this notebook writes; update as needed
pdf_path = 'demofile.pdf'  # a single PDF file, or a folder of PDFs; update as needed

# Timezone-aware start timestamp; `now` is printed again by the model-initialization cell.
tz = pytz.timezone('Asia/Shanghai')
now = datetime.datetime.now(tz=tz)
print(f'{now:%Y-%m-%d %H:%M:%S}')
print('Started!')

In [None]:
# Cell 5: Load model configurations and initialize the model
# Reads the `pdf_dpi` and `layout_weight` entries under `model_args` in the YAML config.
# NOTE(review): yaml.FullLoader can build more than plain data types; if this config
# could ever come from an untrusted source, switch to yaml.safe_load.
with open('configs/model_configs.yaml') as f:
    model_configs = yaml.load(f, Loader=yaml.FullLoader)

dpi = model_configs['model_args']['pdf_dpi']
layout_model = layout_model_init(model_configs['model_args']['layout_weight'])
# Take a fresh timestamp: reusing `now` from the setup cell would print the
# start time again instead of the moment initialization actually finished.
print(datetime.datetime.now(tz).strftime('%Y-%m-%d %H:%M:%S'))
print('Model init done!')

In [None]:
# Cell 6: Start processing the PDF
# Collects the input PDF(s) and renders them to page images.
# The following cells operate on `single_pdf` / `img_list`, so after this cell
# those two names refer to the last PDF that loaded successfully.
start = time.time()
if os.path.isdir(pdf_path):
    # Keep only *.pdf entries (a folder may hold other files) and sort them,
    # because os.listdir returns names in arbitrary order.
    all_pdfs = sorted(
        os.path.join(pdf_path, name)
        for name in os.listdir(pdf_path)
        if name.lower().endswith('.pdf')
    )
else:
    all_pdfs = [pdf_path]
print("Total files:", len(all_pdfs))

single_pdf, img_list = None, None
for candidate_pdf in all_pdfs:
    print(f'Processing PDF: {candidate_pdf}')
    try:
        pages = load_pdf_fitz(candidate_pdf, dpi=dpi)
    except Exception as e:
        # Best effort: report the failure and move on to the next file.
        print(f"Error loading PDF {candidate_pdf}: {e}")
        continue
    if pages is None:
        print(f"Error loading PDF {candidate_pdf}: loader returned no pages")
        continue
    print(f'Loaded {len(pages)} pages from {candidate_pdf}')
    # A later failure must not replace an already loaded document with None.
    single_pdf, img_list = candidate_pdf, pages

if img_list is None:
    # Fail here with a clear message instead of an opaque TypeError further down.
    raise RuntimeError(f'No PDF could be loaded from {pdf_path!r}')

In [None]:
# Cell 7: Create directories for saving images
# splitext removes whatever extension the file has (or none at all) instead of
# blindly cutting the last four characters of the file name.
basename = os.path.splitext(os.path.basename(single_pdf))[0]

# Rendered full pages go to one folder, cropped figures to another.
whole_page_dir = os.path.join(output_dir, f'{basename}_wholepageimage')
os.makedirs(whole_page_dir, exist_ok=True)
crop_img_dir = os.path.join(output_dir, f'{basename}_cropimage')
os.makedirs(crop_img_dir, exist_ok=True)

# One entry per cropped image is appended by the page-processing cell.
captions = []

In [None]:
# Cell 8: Process each page in the PDF
# For every page: run layout detection, save the full page as PNG, crop each
# detection whose category is "image", save the crop, and record a caption stub.
IMAGE_CATEGORY_ID = 3  # category id this notebook treats as "image"

# Start from an empty list so re-running this cell does not append duplicates.
captions = []

for page_idx, image in enumerate(img_list):
    img_H, img_W = image.shape[0], image.shape[1]
    layout_res = layout_model(image, ignore_catids=[])

    # Save the whole page image.
    # NOTE(review): the channel swap below yields correct colours only if
    # load_pdf_fitz returns BGR arrays (PIL expects RGB) - confirm.
    whole_page_image_path = os.path.join(whole_page_dir, f'{basename}_page_{page_idx + 1}.png')
    Image.fromarray(cv2.cvtColor(image, cv2.COLOR_RGB2BGR)).save(whole_page_image_path)

    img_idx = 0
    for item in layout_res['layout_dets']:
        if item['category_id'] != IMAGE_CATEGORY_ID:
            continue

        # presumably `poly` is [x0, y0, x1, y1, x2, y2, x3, y3] with (x0, y0) the
        # top-left and (x2, y2) the bottom-right corner - verify against the model output
        poly = item['poly']
        # Clamp to the page: a negative coordinate would otherwise wrap around
        # in NumPy slicing and silently crop the wrong region.
        xmin = min(max(int(poly[0]), 0), img_W)
        ymin = min(max(int(poly[1]), 0), img_H)
        xmax = min(max(int(poly[4]), 0), img_W)
        ymax = min(max(int(poly[5]), 0), img_H)
        if xmax <= xmin or ymax <= ymin:
            # Degenerate box: an empty crop cannot be converted or saved.
            continue

        crop_img = image[ymin:ymax, xmin:xmax]
        crop_img_pil = Image.fromarray(cv2.cvtColor(crop_img, cv2.COLOR_RGB2BGR))
        image_path = save_image(crop_img_pil, crop_img_dir, basename, page_idx, img_idx)
        img_idx += 1

        # Take the file name from the path that was actually written so the
        # caption entry cannot drift from save_image's naming scheme.
        captions.append({
            'image': os.path.basename(image_path),
            'page': page_idx + 1,
            'caption': ''
        })

In [None]:
# Cell 9: Write the collected caption entries next to the image folders as JSON
info_output_path = os.path.join(output_dir, basename + '.json')
with open(info_output_path, mode='w', encoding='utf-8') as f:
    # ensure_ascii=False keeps non-ASCII characters readable in the file.
    f.write(json.dumps(captions, ensure_ascii=False, indent=4))
print(f'Saved captions to {info_output_path}')