In [1]:
import os
import cv2
from google.cloud import vision
import boto3
import json
import random

from dotenv import load_dotenv

load_dotenv()

In [2]:
# Tell the Google client library which credentials file to use. The path has no
# directory part, so it is resolved against the notebook's working directory.
# NOTE(review): presumably a service-account key file — keep it out of version control.
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'lacuentax-adb671932cd7.json'

In [3]:
# Name of the S3 bucket used by the cells below.
bucket_name = 'la-cuentax'

# S3 client whose credentials and region are read from environment variables
# (load_dotenv() was called in the import cell).
s3_client = boto3.client(
    's3',
    region_name=os.getenv('region_name'),
    aws_access_key_id=os.getenv('aws_access_key_id'),
    aws_secret_access_key=os.getenv('aws_secret_access_key'),
)

In [4]:
def get_texts_coordinates_api(image_cv):
    """
    Run Google Cloud Vision text detection on an image.

    The image is JPEG-encoded in memory and sent to the EU Vision endpoint.

    Args:
        image_cv: Image as a cv2/numpy array.

    Returns:
        list[dict]: One entry per annotation returned by the API, in API order.
        Each entry is ``{'text': <description>, 'coo': [(x, y), ...]}`` where
        ``coo`` lists the vertices of the annotation's bounding polygon.

    Raises:
        ValueError: If the image cannot be encoded as JPEG.
        RuntimeError: If the Vision API reports an error for the request.
    """
    # imencode returns (success_flag, buffer); fail before touching the network.
    encoded_ok, jpeg_buffer = cv2.imencode('.jpeg', image_cv)
    if not encoded_ok:
        raise ValueError('Could not encode image as JPEG')

    client = vision.ImageAnnotatorClient(client_options={'api_endpoint': 'eu-vision.googleapis.com'})
    image_vision = vision.Image(content=jpeg_buffer.tobytes())

    response = client.text_detection(image=image_vision)

    # A failed request is reported inside the response, not raised. Without
    # this check an error would look like "no text found" and could be cached
    # by callers as a valid empty result.
    if response.error.message:
        raise RuntimeError(f'Vision API text detection failed: {response.error.message}')

    return [
        {
            'text': text.description,
            'coo': [(vertex.x, vertex.y) for vertex in text.bounding_poly.vertices],
        }
        for text in response.text_annotations
    ]

In [5]:
def save_text_coordinates(uid, image_cv):
    """
    Make sure the OCR result for one image is stored as JSON on disk.

    If ``texts_coordinates/<uid>.json`` already exists, nothing is done.
    Otherwise the image is sent to the OCR API through
    ``get_texts_coordinates_api`` and the result is written to that file.

    Args:
        uid (str): Identifier of the image; also the JSON file's base name.
        image_cv: OpenCV image object, only used when no cached file exists.

    Returns:
        bool: Always True once the JSON file is in place.
    """
    output_dir = 'texts_coordinates'
    json_path = f'{output_dir}/{uid}.json'

    # Cached result already on disk: skip the API call.
    if os.path.exists(json_path):
        return True

    # Create the folder before calling the API, so a missing directory cannot
    # make the write fail after the request has already been made.
    os.makedirs(output_dir, exist_ok=True)

    print(f"Getting new data from API for image {uid}")
    texts_coordinates = get_texts_coordinates_api(image_cv)

    # The first annotation is deliberately dropped.
    # NOTE(review): presumably because Vision returns the whole detected text
    # as the first entry — confirm.
    with open(json_path, 'w') as f:
        json.dump(texts_coordinates[1:], f)
    return True

In [6]:
def download_image_from_s3(s3_client, bucket_name, s3_key, local_path):
    """
    Best-effort download of one S3 object to a local file.

    The outcome is reported with ``print``; errors are not propagated, so
    callers must check for the local file themselves.

    Args:
        s3_client: Client object exposing ``download_file(bucket, key, path)``.
        bucket_name (str): Bucket to read from.
        s3_key (str): Key of the object to fetch.
        local_path (str): Destination path on disk.

    Returns:
        None
    """
    try:
        s3_client.download_file(bucket_name, s3_key, local_path)
        outcome = f"Successfully downloaded {s3_key} to {local_path}"
    except Exception as err:
        outcome = f"Error downloading file: {err}"
    print(outcome)


In [7]:
def process_batch_images(uids, s3_client, bucket_name):
    """
    Download (when missing) and OCR every image in ``uids``.

    Each image is expected locally at ``cuentas/<uid>.jpeg``. If it is absent,
    it is fetched from the S3 key ``<uid>.jpeg``. Images that can be read are
    passed to ``save_text_coordinates(uid, image_cv)``. Failures are reported
    with ``print`` and the UID is skipped.

    This function was previously also named ``save_text_coordinates``. That
    shadowed the per-image function and made the inner call recurse into this
    one with the wrong number of arguments.

    Args:
        uids (iterable of str): Image identifiers to process.
        s3_client: Boto3 S3 client used for downloads.
        bucket_name (str): Bucket holding the ``<uid>.jpeg`` objects.

    Returns:
        None
    """
    for uid in uids:
        local_image_path = f'cuentas/{uid}.jpeg'
        s3_key = f'{uid}.jpeg'

        # Download image if it doesn't exist locally
        if not os.path.exists(local_image_path):
            print(f"Downloading image for UID: {uid}")
            download_image_from_s3(s3_client, bucket_name, s3_key, local_image_path)

        # download_image_from_s3 prints on failure instead of raising, so the
        # file's presence is the only success signal.
        if not os.path.exists(local_image_path):
            print(f"Error: Image not found for UID: {uid}")
            continue

        # cv2.imread returns None for unreadable/corrupt files.
        image_cv = cv2.imread(local_image_path)
        if image_cv is None:
            print(f"Error: Could not read image for UID: {uid}")
            continue

        save_text_coordinates(uid, image_cv)

In [8]:
def get_random_jpeg_files_from_s3(s3_client, bucket_name, n, ids_already_saved=None):
    """
    Get n random JPEG files from the specified S3 bucket.

    Args:
        s3_client: Boto3 S3 client
        bucket_name (str): Name of the S3 bucket
        n (int): Number of random JPEG files to retrieve
        ids_already_saved (iterable of str, optional): Identifiers to exclude.
            A key is skipped when the part of the key before its first ``.``
            is in this collection.

    Returns:
        list: ``n`` randomly sampled ``.jpeg`` keys. If fewer than ``n``
        candidates remain, all of them are returned in listing order.
    """
    # list_objects_v2 returns one page per call (at most 1000 keys), so follow
    # the continuation token until the listing is complete. Otherwise larger
    # buckets are silently truncated.
    files = []
    request = {'Bucket': bucket_name}
    while True:
        response = s3_client.list_objects_v2(**request)
        files.extend(item['Key'] for item in response.get('Contents', []))
        if not response.get('IsTruncated'):
            break
        request['ContinuationToken'] = response['NextContinuationToken']

    # Filter for JPEG files
    jpeg_files = [f for f in files if f.endswith('.jpeg')]

    if ids_already_saved:
        # Set membership keeps the filter linear in the number of keys.
        excluded_ids = set(ids_already_saved)
        jpeg_files = [f for f in jpeg_files if f.split('.')[0] not in excluded_ids]

    # Select n random JPEG files
    if len(jpeg_files) < n:
        print(f"Requested {n} files, but only {len(jpeg_files)} JPEG files available. Returning all available files.")
        return jpeg_files
    return random.sample(jpeg_files, n)

In [None]:
# UIDs that already have an OCR result on disk (one file per UID)
files = os.listdir('./texts_coordinates')
uids_with_texts_coos = [name.split('.')[0] for name in files]
print('Nb images with OCR res:', len(uids_with_texts_coos))

# Hand-maintained UID lists that are excluded from sampling.
# NOTE(review): presumably unusable pictures — confirm.
bad_image = [
    '2720b437-bb67-49fc-a8cd-be1ad6b19322',
    '04bc47b0-6d5e-46e4-8cd6-444da957972b',
    '13d1f973-d1e7-4a58-b572-d396d253e42f',
]

# NOTE(review): presumably supermarket receipts, out of scope here — confirm.
supermarket_image = [
    '22fced9f-f189-4fee-a8d3-b577ddcfafc5',
    '2900fc9e-817a-469e-997a-eb3ddfab6e50',
    '0f7ee436-2254-44ee-af73-fbbe7d245f37',
    '3b4defca-8970-427f-979a-d5120aab2fc5',
    '18752e6d-f814-4329-8f79-409070b84164',
    '3f7b9c7f-7d0c-45de-acf0-ac290e3b2425',
]

# Sample 50 bucket images that are neither processed yet nor excluded above.
files_to_download = get_random_jpeg_files_from_s3(
    s3_client,
    bucket_name,
    50,
    uids_with_texts_coos + bad_image + supermarket_image,
)
uids_to_download = [key.split('.')[0] for key in files_to_download]
print(len(uids_to_download))

Nb images with OCR res: 172
50


In [11]:
# Run the batch over the sampled UIDs. The log below shows each image being
# downloaded from S3 and then sent to the OCR API.
process_batch_images(uids_to_download, s3_client, bucket_name)

Downloading image for UID: 1894a6ff-a8ce-4477-89a8-dde330d19f6c
Successfully downloaded 1894a6ff-a8ce-4477-89a8-dde330d19f6c.jpeg to cuentas/1894a6ff-a8ce-4477-89a8-dde330d19f6c.jpeg
Getting new data from API for image 1894a6ff-a8ce-4477-89a8-dde330d19f6c
Downloading image for UID: 1b76af5c-a34b-45c6-922c-c98ead5758a5
Successfully downloaded 1b76af5c-a34b-45c6-922c-c98ead5758a5.jpeg to cuentas/1b76af5c-a34b-45c6-922c-c98ead5758a5.jpeg
Getting new data from API for image 1b76af5c-a34b-45c6-922c-c98ead5758a5
Downloading image for UID: 2056c078-18a3-443e-883d-98ae79677298
Successfully downloaded 2056c078-18a3-443e-883d-98ae79677298.jpeg to cuentas/2056c078-18a3-443e-883d-98ae79677298.jpeg
Getting new data from API for image 2056c078-18a3-443e-883d-98ae79677298
Downloading image for UID: 11b9b924-ffff-42e1-bcba-2029993dcc3a
Successfully downloaded 11b9b924-ffff-42e1-bcba-2029993dcc3a.jpeg to cuentas/11b9b924-ffff-42e1-bcba-2029993dcc3a.jpeg
Getting new data from API for image 11b9b924-ffff