In [1]:

from collections import namedtuple
from PIL import Image, ImageDraw, ImageFont
from google.cloud import vision
from enum import Enum
%matplotlib inline
import matplotlib.pyplot as plt
import os
import json
from google.protobuf.json_format import MessageToJson
from google.protobuf import json_format
from pdf2image import convert_from_path
import PyPDF2


import sys
sys.path.append("..")
from models.bounding_box import FeatureType, Point, BoundingBox
from utils.cv_preprocess import draw_boxes, group_bounding_boxes, merge_box_groups

from utils.file_utils import create_dir, prepare_image_local, prepare_image_web, load_counter, save_counter, save_json

from utils.nlp_preprocess import load_spacy_models, is_english, is_chinese, split_dish_info, is_word_relevant

# Point the Google client libraries at the service-account key file.
# setdefault keeps a credentials path that is already configured in the
# environment instead of silently overwriting it with this repo-relative path.
os.environ.setdefault('GOOGLE_APPLICATION_CREDENTIALS', '../menu-ocr-390814-027d51b70720.json')


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# def extract_text_from_pdf(pdf_path, output_folder):
#     try:
#         # Open the PDF file
#         with open(pdf_path, 'rb') as file:
#             # Create a PDF file reader object
#             pdf_reader = PyPDF2.PdfReader(file)
            
#             # Loop through each page in the PDF
#             for page_number in range(len(pdf_reader.pages)):
#                 # Get the page
#                 page = pdf_reader.pages[page_number]
                
#                 # Extract text from the page
#                 text = page.extract_text()
                
#                 # Create a text file and save the extracted text
#                 output_path = f"{output_folder}/page_{page_number + 1}.txt"
#                 with open(output_path, 'w', encoding='utf-8') as output_file:
#                     output_file.write(text)
                
#                 print(f"Text extracted from page {page_number + 1} and saved to {output_path}")
                
#     except Exception as e:
#         print(f"An error occurred: {str(e)}")

# # Example usage
# extract_text_from_pdf('../downloads/Ma-La-Menu_07-2023.pdf', "./pdf")


In [3]:
# Source PDF; its file name without the extension prefixes the output file names.
filein = '../downloads/Ma-La-Menu_07-2023.pdf'
base_name = os.path.basename(filein)
file_name_without_extension = os.path.splitext(base_name)[0]

# Output directories.
raw_ocr_directory = '../output/raw_ocr/'
preprocessed_ocr_directory = '../output/prep_ocr/'
menu_image_directory = '../output/menu_image/'

# Raw OCR annotation JSON for this PDF.
raw_ocr_filename = f"{file_name_without_extension}_raw_annotation.json"
raw_ocr_path = os.path.join(raw_ocr_directory, raw_ocr_filename)

# Preprocessed OCR JSON for this PDF.
preprocessed_ocr_filename = f"{file_name_without_extension}_prep_ocr.json"
preprocessed_ocr_path = os.path.join(preprocessed_ocr_directory, preprocessed_ocr_filename)

# create_dir comes from utils.file_utils; presumably it creates the directory
# when it is missing -- TODO confirm against the helper.
create_dir(menu_image_directory)


In [4]:
def get_total_pages(pdf_path):
    """Return the number of pages in the PDF stored at ``pdf_path``."""
    with open(pdf_path, 'rb') as pdf_stream:
        # Count while the file is still open; the reader is built on this handle.
        page_count = len(PyPDF2.PdfReader(pdf_stream).pages)
    return page_count

def pdf_page_to_image(pdf_path, page_number, export_filename):
    """Render a single PDF page to a PNG file.

    Args:
        pdf_path: Path of the PDF to read.
        page_number: Page to render; passed as both ``first_page`` and
            ``last_page`` to ``convert_from_path``.
        export_filename: Destination path of the PNG.

    Returns:
        ``export_filename``, once the image has been saved.

    Raises:
        Exception: If the conversion yields no image for the requested page.
    """
    rendered_pages = convert_from_path(pdf_path, first_page=page_number, last_page=page_number)
    if not rendered_pages:
        raise Exception("Page not converted to image")

    rendered_pages[0].save(export_filename, 'PNG')
    return export_filename

def detect_text(image_path):
    """Run Google Cloud Vision document OCR on an image and save the raw result.

    The image bytes are sent to ``document_text_detection`` with Chinese and
    English language hints. The response's ``full_text_annotation`` is
    serialised to JSON and written to the module-level ``raw_ocr_path``; the
    file is opened in write mode, so each call overwrites the previous result.

    Args:
        image_path: Path of the image file to send for OCR.

    Raises:
        RuntimeError: If the Vision API response carries an error message.
    """
    # The notebook imports the client library as `vision`; the previous
    # `vision_v1` and `io` names were never imported and raised NameError.
    client = vision.ImageAnnotatorClient()

    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)

    response = client.document_text_detection(
        image=image,
        image_context={"language_hints": ["zh", "en"]}
    )
    # Surface API-level failures instead of persisting an empty annotation.
    if response.error.message:
        raise RuntimeError(
            f"Vision API error for {image_path}: {response.error.message}"
        )

    # MessageToJson needs the underlying protobuf message, hence `._pb`.
    document_json = MessageToJson(response.full_text_annotation._pb)

    os.makedirs(raw_ocr_directory, exist_ok=True)

    with open(raw_ocr_path, 'w', encoding='utf-8') as json_file:
        json_file.write(document_json)

In [5]:
def extract_all_pages(pdf_path):
    """Render every page of a PDF to its own PNG in ``menu_image_directory``.

    Pages are numbered from 1. Each image is named
    ``<pdf name without extension>_page_<n>.png``. A failure on one page is
    printed and the loop moves on to the next page.

    Args:
        pdf_path: Path of the PDF to process.
    """
    page_count = get_total_pages(pdf_path)
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    for page_number in range(1, page_count + 1):
        try:
            # Per-page output file inside the shared image directory.
            export_filename = f"{pdf_stem}_page_{page_number}.png"
            target_path = os.path.join(menu_image_directory, export_filename)

            pdf_page_to_image(pdf_path, page_number, target_path)

            print(f"Page {page_number} has been processed and saved as {export_filename}")
        except Exception as e:
            # Best effort: report the failing page and continue with the rest.
            print(f"An error occurred on page {page_number}: {str(e)}")

In [6]:
# Reuse the configured input PDF rather than repeating the path literal,
# so the page images always come from the same file as the OCR output names.
extract_all_pages(filein)

Page 1 has been processed and saved as Ma-La-Menu_07-2023_page_1.png
Page 2 has been processed and saved as Ma-La-Menu_07-2023_page_2.png
Page 3 has been processed and saved as Ma-La-Menu_07-2023_page_3.png
Page 4 has been processed and saved as Ma-La-Menu_07-2023_page_4.png
Page 5 has been processed and saved as Ma-La-Menu_07-2023_page_5.png
Page 6 has been processed and saved as Ma-La-Menu_07-2023_page_6.png
Page 7 has been processed and saved as Ma-La-Menu_07-2023_page_7.png
Page 8 has been processed and saved as Ma-La-Menu_07-2023_page_8.png
Page 9 has been processed and saved as Ma-La-Menu_07-2023_page_9.png
Page 10 has been processed and saved as Ma-La-Menu_07-2023_page_10.png
Page 11 has been processed and saved as Ma-La-Menu_07-2023_page_11.png
Page 12 has been processed and saved as Ma-La-Menu_07-2023_page_12.png
