In [None]:
import json
import logging
import os
import re
import threading
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from pathlib import Path

import cv2
import easyocr
import pandas as pd
from google.cloud import storage
from google.cloud import vision
from PIL import Image
from tqdm import tqdm

# Point the Google Cloud client libraries at a credentials file.
# The path is relative, so it is resolved against the directory the kernel runs in.
# NOTE(review): this overwrites any value already present in the environment.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "fannilla-dev.json"

In [None]:
# Google Cloud API clients used by the helpers below
# (presumably authenticated through GOOGLE_APPLICATION_CREDENTIALS — confirm).
vision_client = vision.ImageAnnotatorClient()
storage_client = storage.Client()

# Name of the Cloud Storage bucket that the helpers read from and write to.
BUCKET_NAME = 'chum_bucket_stuff'

# Bucket handle shared by download_blob and save_file_to_storage.
BUCKET = storage_client.bucket(BUCKET_NAME)
# Root of the image tree on an external drive (absolute, machine-specific path).
SRC_DIR = Path("/Volumes/external_drive")
# Local working directory for downloaded images; created next to the notebook if missing.
TEMP_DIR = Path("./temp")
TEMP_DIR.mkdir(exist_ok=True)

In [None]:
# Load the image inventory and order it by category, then by gallery.
df = (
    pd.read_csv('datasets/images_high_res_dataset.csv')
    .sort_values(by=['gallery_category', 'gallery_name'])
)

In [None]:
# Build one path per row: <root>/<gallery_category>/<gallery_name>/<filename>.
# Disabled alternative: root the paths at SRC_DIR (external drive) instead of TEMP_DIR.
# df['blobs'] = SRC_DIR / df.gallery_category / df.gallery_name / df.filename
df['blobs'] = TEMP_DIR / df.gallery_category / df.gallery_name / df.filename
# Plain Python list of the per-row paths; filtered further down to pick the images to process.
all_blobs = df['blobs'].tolist()

In [None]:
def download_blob(filepath: Path) -> Path:
    """Download the bucket object that mirrors ``filepath`` and return the local path.

    A local file ``TEMP_DIR/<relative path>`` corresponds to the bucket object
    ``pics/<relative path>``. The object is written to ``filepath`` itself and
    missing parent directories are created first.

    Args:
        filepath: Destination path located under ``TEMP_DIR``.

    Returns:
        The local path the object was written to (``filepath``).

    Raises:
        ValueError: If ``filepath`` is not located under ``TEMP_DIR``.
    """
    try:
        relative_path = filepath.relative_to(TEMP_DIR)
    except ValueError as exc:
        raise ValueError(f"{filepath} is not located under {TEMP_DIR}") from exc

    # Only the leading TEMP_DIR component is exchanged for the bucket prefix. A plain
    # str.replace would also rewrite "temp" / "pics" inside gallery or file names and
    # send the download to the wrong object or the wrong local file.
    blob = BUCKET.blob(f"pics/{relative_path.as_posix()}")

    filepath.parent.mkdir(parents=True, exist_ok=True)
    blob.download_to_filename(filepath)
    return filepath


def request_ocr(blob_location: str) -> vision.AnnotateImageResponse:
    """Request text detection from the Vision API for an image referenced by URI.

    Args:
        blob_location: URI of the image, assigned to ``image.source.image_uri``.

    Returns:
        The response returned by ``vision_client.annotate_image``.
    """
    remote_image = vision.Image()
    remote_image.source.image_uri = blob_location

    text_detection = {"type_": vision.Feature.Type.TEXT_DETECTION}
    return vision_client.annotate_image(
        {"image": remote_image, "features": [text_detection]}
    )


def convert_string_to_vertices_list(vertices_string):
    """Extract ``x: <int> y: <int>`` pairs from a textual vertex dump.

    Args:
        vertices_string: Text containing zero or more ``x: N`` / ``y: M`` pairs,
            separated by arbitrary whitespace (including newlines).

    Returns:
        A list of ``{'x': int, 'y': int}`` dicts in order of appearance; an empty
        list when no complete pair is found.

    Note:
        A vertex is only recognised when both ``x:`` and ``y:`` are present in the
        text; a pair with one coordinate missing cannot be recovered from the string.
    """
    # The optional sign keeps vertices with negative coordinates, which the
    # digits-only pattern silently discarded.
    pattern = re.compile(r'x:\s*(-?\d+)\s*y:\s*(-?\d+)')
    return [{'x': int(x), 'y': int(y)} for x, y in pattern.findall(vertices_string)]


def get_vertices_from_response(response: vision.AnnotateImageResponse):
    """Return the bounding-polygon vertices of the first text annotation.

    Args:
        response: Vision API response for a text-detection request.

    Returns:
        ``None`` when the response has no text annotations; otherwise a list of
        ``{'x': int, 'y': int}`` dicts, one per vertex of
        ``response.text_annotations[0].bounding_poly``.
    """
    if not response.text_annotations:
        return None

    bounding_poly = response.text_annotations[0].bounding_poly

    # Read the coordinates straight from the message fields. Going through
    # str() and a regex loses every vertex whose x or y is 0 (the text form
    # omits default-valued fields) as well as negative coordinates.
    return [
        {'x': int(vertex.x), 'y': int(vertex.y)}
        for vertex in bounding_poly.vertices
    ]


def crop_image(local_image_path: Path, vertices: list) -> Path | None:
    """Cut the detected text band off an image and save the remainder.

    The vertical extent of the text is taken from the smallest and largest ``y``
    among ``vertices``. If the band starts in the upper half of the image, the
    part below the band is kept; otherwise the part above it is kept. The result
    is written to ``<image dir>/cropped/<image name>``.

    Args:
        local_image_path: Path of the image to crop.
        vertices: Dicts carrying at least a ``'y'`` entry.

    Returns:
        The path of the cropped file, or ``None`` when the band is taller than a
        quarter of the image or when cropping/saving fails.
    """
    try:
        y_values = [int(vertex['y']) for vertex in vertices]
        upper = min(y_values)
        lower = max(y_values)

        source_path = Path(local_image_path)
        cropped_path = source_path.parent / "cropped" / source_path.name

        # The context manager releases the file handle held by Image.open, which
        # otherwise stays open for every processed image.
        with Image.open(source_path) as image:
            width, height = image.size

            # A band covering more than 25% of the height is not treated as a caption.
            if abs(lower - upper) > 0.25 * height:
                return None

            if upper < height / 2:
                box = (0, lower, width, height)  # (left, upper, right, lower)
            else:
                box = (0, 0, width, upper)  # (left, upper, right, lower)

            cropped_path.parent.mkdir(parents=True, exist_ok=True)
            image.crop(box).save(cropped_path)

        return cropped_path

    except Exception as exc:
        # Best effort: one unreadable image must not abort a batch, but the
        # failure is reported instead of disappearing silently.
        logging.getLogger(__name__).warning("Could not crop %s: %s", local_image_path, exc)
        return None


def save_file_to_storage(local_file_path: str, gcp_file_path: str):
    """Upload a local file to ``BUCKET`` under the object name ``gcp_file_path``.

    Args:
        local_file_path: Path of the file to upload.
        gcp_file_path: Object name to create inside the bucket.
    """
    BUCKET.blob(gcp_file_path).upload_from_filename(local_file_path)


def _bucket_object_name(blob_name: Path) -> str:
    """Return the ``pics/...`` object name for a local path under SRC_DIR or TEMP_DIR."""
    for root in (SRC_DIR, TEMP_DIR):
        try:
            relative_path = blob_name.relative_to(root)
        except ValueError:
            continue
        return f"pics/{relative_path.as_posix()}"
    raise ValueError(f"{blob_name} is located under neither {SRC_DIR} nor {TEMP_DIR}")


def main_process(blob_name: Path):
    """Crop one image using Vision API text detection and cache the vertices.

    Vertices are loaded from ``<image dir>/cropped/<image stem>.json`` when that
    file exists; otherwise they are requested from the Vision API for the
    matching ``gs://`` object. When cropping succeeds, the vertices are written
    to that JSON file.

    Args:
        blob_name: Local path of the image, rooted at ``SRC_DIR`` or ``TEMP_DIR``.
            The file must exist locally because ``crop_image`` opens it.

    Raises:
        ValueError: If no cached vertices exist and ``blob_name`` is under
            neither ``SRC_DIR`` nor ``TEMP_DIR``.
    """
    cropped_file_path = blob_name.parent / 'cropped' / blob_name.name
    vertices_path = cropped_file_path.parent / f"{cropped_file_path.with_suffix('').name}.json"
    cropped_file_path.parent.mkdir(parents=True, exist_ok=True)

    if vertices_path.exists():
        with open(vertices_path, 'r') as json_file:
            vertices = json.load(json_file)
    else:
        # Resolve the object name from whichever root the path was built on.
        # Stripping only SRC_DIR produced "picstemp/..." for TEMP_DIR-based paths.
        gcp_file = f"gs://{BUCKET_NAME}/{_bucket_object_name(blob_name)}"
        response = request_ocr(gcp_file)
        vertices = get_vertices_from_response(response)

    if not vertices:  # OCR found no text
        return

    local_image_path_cropped = crop_image(blob_name, vertices)
    if local_image_path_cropped:  # cropping succeeded with the given vertices
        with open(vertices_path, 'w') as json_file:
            json.dump(vertices, json_file, indent=4)


# Shared EasyOCR reader, created on first use: constructing a Reader loads the
# models, which is far too slow to repeat for every image.
_OCR_READER = None
# Guards the lazy construction and the inference call, since remote_process may
# be driven from several worker threads.
_OCR_LOCK = threading.Lock()


def get_local_ocr(image_path: str):
    """Run EasyOCR on a local image and collect the corners of all detected boxes.

    Args:
        image_path: Location of the image file (a ``str`` or path-like object).

    Returns:
        A ``(vertices, image)`` tuple. ``vertices`` is a flat list of
        ``{'x': int, 'y': int}`` dicts, one per corner of every detected text
        box; ``image`` is the array returned by ``cv2.imread``. When the image
        cannot be read, ``([], None)`` is returned.
    """
    global _OCR_READER

    # str() keeps this working when a pathlib.Path is passed in.
    image = cv2.imread(str(image_path))
    if image is None:  # cv2.imread signals failure by returning None
        logging.getLogger(__name__).warning("Could not read image %s", image_path)
        return [], None

    # The original referenced an undefined `gray_image`; build it from the loaded image.
    gray_image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    with _OCR_LOCK:
        if _OCR_READER is None:
            _OCR_READER = easyocr.Reader(['en'])
        results = _OCR_READER.readtext(gray_image)

    # result[0] holds the corner points of one detection.
    vertices = [
        {'x': int(point[0]), 'y': int(point[1])}
        for result in results
        for point in result[0]
    ]
    return vertices, image


def crop_and_save_cv2_image(vertices: list, image, local_image_path) -> None:
    """Remove the detected text band from an image array and write the rest to disk.

    The band spans the smallest to the largest ``y`` in ``vertices``. Nothing is
    written when the band is taller than a quarter of the image. If the band
    starts in the upper half, the rows below it are kept; otherwise the rows
    above it are kept. Any error is swallowed and the function returns ``None``.

    Args:
        vertices: Dicts carrying at least a ``'y'`` entry.
        image: Array-like image whose ``shape`` starts with ``(height, width)``.
        local_image_path: Destination path; its parent directory is created.
    """
    try:
        ys = [int(v['y']) for v in vertices]
        top, bottom = min(ys), max(ys)

        height, width = image.shape[:2]

        # Bail out when the text band is more than 25% of the image height.
        if abs(bottom - top) > 0.25 * height:
            return None

        text_in_upper_half = top < height / 2
        if text_in_upper_half:
            kept_rows = image[bottom:height, 0:width]  # everything below the text
        else:
            kept_rows = image[0:top, 0:width]  # everything above the text

        local_image_path.parent.mkdir(parents=True, exist_ok=True)
        cv2.imwrite(local_image_path.as_posix(), kept_rows)

    except Exception:
        return None


def remote_process(blob_name: Path):
    """Download one image, run local OCR on it and store the detected vertices.

    The vertices are written as JSON to ``<image dir>/cropped/<image stem>.json``
    when OCR returns any; nothing is written otherwise.

    Args:
        blob_name: Local path of the image, passed to ``download_blob``.
    """
    downloaded_image_path = download_blob(blob_name)
    cropped_file_path = blob_name.parent / 'cropped' / blob_name.name
    vertices_path = cropped_file_path.parent / f"{cropped_file_path.with_suffix('').name}.json"
    cropped_file_path.parent.mkdir(parents=True, exist_ok=True)

    # get_local_ocr is declared to take a string path and hands it to cv2.imread,
    # so the Path returned by download_blob is converted explicitly.
    vertices, image = get_local_ocr(str(downloaded_image_path))

    # The crop step is currently disabled; only the vertices are persisted.
    # crop_and_save_cv2_image(
    #     vertices, image, cropped_file_path
    # )

    if vertices:  # OCR detected at least one text box
        with open(vertices_path, 'w') as json_file:
            json.dump(vertices, json_file, indent=4)

In [None]:
# Restrict the run to images that live under an "amateur" directory, in sorted path order.
blobs = [path for path in sorted(all_blobs) if '/amateur/' in path.as_posix()]
len(blobs)  # shown as the cell output

# GCP OCR

In [None]:
# Disabled batch run of the Vision API pipeline (main_process) over `blobs`.
# When enabled it submits work to 16 threads and, after every 1700 submissions,
# drains the pending futures and then waits until at least `pause` seconds have
# passed since the drain began.
# NOTE(review): presumably this throttles against an API quota — confirm before re-enabling.
# NOTE(review): futures submitted after the last multiple of 1700 are never passed to
# future.result(), so exceptions raised in that final partial batch would go unnoticed.
#
# pause = 60
# futures = []
# 
# with ThreadPoolExecutor(max_workers=16) as executor:
#     for i, blob in enumerate(blobs):
#         futures.append(executor.submit(main_process, blob))
# 
#         if i != 0 and i % 1700 == 0:
#             start = time.time()
#             for future in tqdm(as_completed(futures), total=len(futures), desc="Processing images"):
#                 future.result()
# 
#             while time.time() - start <= pause:
#                 time.sleep(2)
# 
#             futures = []

# Cloud processing

In [None]:
# Run the download + local OCR pipeline on the first selected image only.
remote_process(blobs[0])

In [None]:
# Process the first 100 selected images concurrently.
_blobs = blobs[:100]
with ThreadPoolExecutor() as executor:
    # Submit the work to the pool. Calling remote_process directly inside the
    # loop ran everything sequentially on the main thread and left the pool idle.
    futures = [executor.submit(remote_process, blob) for blob in _blobs]
    for future in tqdm(as_completed(futures), total=len(futures), desc="Processing images"):
        future.result()  # re-raise any exception from the worker