In [None]:
# Compress and filter

> Compress the images and save them in Parquet format (via pyarrow), so that they can be easily transferred and filtered

In [1]:
#| default_exp compress_and_filter

In [19]:
#| export 
import os
from PIL import Image
from pathlib import Path
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from typing import Union, List, Tuple, Callable
from tqdm.auto import tqdm
import os
from fastcore.all import *

In [21]:
#| export
def apply_functions(
        fn: Union[Path, str], # Name of the file
        functions:Union[List[Callable], None]=None # Callables invoked as `func(fn)`; `None` means "no functions"
    )->dict: # Maps each callable's name to its non-`None` result
    '''Apply a list of functions to a file and collect their results.

    Each callable is invoked with `fn` as its only argument. Results that are
    `None` are dropped. A callable that raises is reported with `print` and
    skipped, so one failing function does not abort the others.
    '''
    results = {}
    # The default is `None`; treat it as an empty list instead of iterating it.
    if functions is None:
        return results

    for func in functions:
        # Partials and callable instances have no `__name__`; fall back to the
        # repr so neither the result key nor the error message can raise.
        name = getattr(func, '__name__', repr(func))
        try:
            result = func(fn)
        except Exception as e:
            print(f'Error in {name} for {fn}: {e}')
            continue

        if result is not None:
            results[name] = result
    return results

In [16]:
#| export
def convert_images_to_parquet(
        image_directory:Union[str, Path],  # directory containing images
        output_file:Union[str, Path],      # output parquet file
        file_name_func:List[Callable]=None, # functions to apply to the filename
        file_exts:str='.png'               # extension filter forwarded to `Path.ls`
        ):
    '''Convert images in a directory to a parquet file.

    Every matching image is re-encoded as PNG and stored as raw bytes in the
    `image_data` column next to its `filename`. When `file_name_func` is given,
    each function is applied to the filename via `apply_functions` and the
    returned values are added as extra columns named after the functions.
    '''
    # Imported locally so the function does not rely on `io` leaking in
    # through a star import.
    import io

    # Accept plain strings as the signature promises; `.ls` is a Path method.
    images = Path(image_directory).ls(file_exts=file_exts)

    data = []
    for im in tqdm(images, total=len(images)):
        fn = im.name
        with Image.open(im) as img:
            buffer = io.BytesIO()
            img.save(buffer, format='PNG')

        data_entry = {'filename': fn, 'image_data': buffer.getvalue()}

        # Apply functions to the filename and merge the collected metadata
        if file_name_func is not None:
            data_entry.update(apply_functions(fn, file_name_func))

        data.append(data_entry)

    # Build the table once from the list of records and write it out
    df = pd.DataFrame(data)
    table = pa.Table.from_pandas(df)
    pq.write_table(table, output_file)

In [17]:
# Root folder of the datasets, read from the environment (None when DATA_PATH is unset)
data_path = os.environ.get('DATA_PATH')

In [18]:
# Source PNG folder and destination parquet file share the same parent folder
easy_front_dir = f'{data_path}/easy_front'
im_src = Path(f'{easy_front_dir}/Cropped_Images_png')

convert_images_to_parquet(im_src, f'{easy_front_dir}/Cropped_Images.parquet')

  0%|          | 0/1639 [00:00<?, ?it/s]