In [14]:

from collections import namedtuple
from PIL import Image, ImageDraw, ImageFont
from google.cloud import vision
from enum import Enum
%matplotlib inline
import matplotlib.pyplot as plt
import os
import json
from google.protobuf.json_format import MessageToJson
from google.protobuf import json_format

import sys
sys.path.append("..")
from models.bounding_box import FeatureType, Point, BoundingBox
from utils.cv_preprocess import draw_boxes, group_bounding_boxes, merge_box_groups

from utils.file_utils import prepare_image_local, prepare_image_web, load_counter, save_counter, save_json

from utils.nlp_preprocess import *

# Point the Google client libraries at the service-account key file used by
# this notebook. setdefault() keeps any GOOGLE_APPLICATION_CREDENTIALS value
# that is already exported in the environment, so an externally configured
# credential is no longer overwritten; the project-local key file is only a
# fallback. The path is relative to the notebook's working directory.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    '../menu-ocr-390814-99b5549068a5.json',
)


In [15]:
# Input: folder that is scanned for the menu images to run through OCR
image_directory = "../cleaned_corpus/for_report/"

# Outputs: folder for the raw OCR annotation JSON files, and a second folder
# presumably meant for preprocessed OCR results (not used in the cells shown
# here; confirm against later cells).
raw_ocr_directory, preprocessed_ocr_directory = (
    "../downloaded_menu/raw_ocr/",
    "../downloaded_menu/prep_ocr/",
)

In [16]:
# Batch OCR: run Google Cloud Vision document text detection on every image in
# `image_directory` and store each result as
# "<image name>_raw_annotation.json" inside `raw_ocr_directory`.
# Images whose annotation file already exists are skipped, so the cell can be
# re-run without repeating requests for finished images.

# Initialize Google Cloud Vision client
client = vision.ImageAnnotatorClient()

# Lower-case extensions of the image files that are sent to OCR
image_extensions = ('.png', '.jpg', '.webp')

# Traverse all images in the directory. sorted() makes the processing order
# deterministic, because os.listdir() returns entries in arbitrary order.
for file_name in sorted(os.listdir(image_directory)):
    # Compare case-insensitively so files such as "menu.JPG" are not skipped
    if not file_name.lower().endswith(image_extensions):
        continue

    file_path = os.path.join(image_directory, file_name)
    file_name_without_extension = os.path.splitext(file_name)[0]

    # Construct raw OCR file path
    raw_ocr_filename = file_name_without_extension + "_raw_annotation.json"
    raw_ocr_path = os.path.join(raw_ocr_directory, raw_ocr_filename)

    # Check if OCR file already exists
    if os.path.exists(raw_ocr_path):
        print(f'{raw_ocr_path} exists, move to next one')
        continue

    # Prepare image for OCR
    image = prepare_image_local(file_path)

    # Perform OCR
    response = client.document_text_detection(
        image=image,
        image_context={"language_hints": ["zh", "en"]}
    )

    # The Vision API reports per-image failures inside the response instead of
    # raising. Without this check an empty annotation would be written to disk
    # and the "already exists" guard above would then skip the image on every
    # later run. Failed images are reported and left for a retry; note that
    # the counter below is not incremented for them.
    if response.error.message:
        print(f'OCR failed for {file_path}: {response.error.message}')
        continue

    # Convert OCR response to JSON
    document_json = MessageToJson(response.full_text_annotation._pb)

    # Create the output directory if needed; exist_ok avoids the
    # check-then-create race of a separate os.path.exists() test.
    os.makedirs(raw_ocr_directory, exist_ok=True)

    # Save OCR data to file
    with open(raw_ocr_path, 'w', encoding='utf-8') as json_file:
        json_file.write(document_json)

    # Load, increment, and save the counter
    # (presumably a persisted tally of OCR requests; confirm in utils.file_utils)
    current_count = load_counter()
    current_count += 1
    save_counter(current_count)

In [17]:
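# NOTE: Legacy single-image variant of the batch OCR loop above (one hard-coded
# input file instead of a directory scan), kept commented out for reference only.
# filein = '../downloaded_menu/cleaned_img/13._Phoenix_Palace_0_page_12.png'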
# base_name = os.path.basename(filein)
# file_name_without_extension = os.path.splitext(base_name)[0]


# raw_ocr_directory = '../downloaded_menu/raw_ocr/'
# raw_ocr_filename = file_name_without_extension + "_raw_annotation.json"
# raw_ocr_path = os.path.join(raw_ocr_directory, raw_ocr_filename)

# preprocessed_ocr_directory = '../downloaded_menu/prep_ocr/'
# preprocessed_ocr_filename = file_name_without_extension + "_prep_ocr.json"
# preprocessed_ocr_path = os.path.join(preprocessed_ocr_directory, preprocessed_ocr_filename)

# image = prepare_image_local(filein)

# client = vision.ImageAnnotatorClient()

# response = client.document_text_detection(
#     image=image, 
#     image_context={"language_hints": ["zh", "en"]}
# )

# # Load existing counter
# current_count = load_counter()

# # Increment counter
# current_count += 1

# # Save updated counter
# save_counter(current_count)

# document_json = MessageToJson(response.full_text_annotation._pb)

# if not os.path.exists(raw_ocr_directory):
#     os.makedirs(raw_ocr_directory)

# with open(raw_ocr_path, 'w', encoding='utf-8') as json_file:
#     json_file.write(document_json)