# In [3]:
import os
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from PIL import Image

def read_images_from_folder(folder_path):
    """Load every supported image in a folder as a flattened RGB array.

    Files are selected by extension, compared case-insensitively so that
    names such as ``PHOTO.PNG`` are not silently skipped, and are visited in
    sorted filename order so the row order is reproducible (``os.listdir``
    returns entries in arbitrary order). A file that cannot be opened or
    decoded is reported on stdout and skipped.

    Args:
        folder_path: Directory to scan. Sub-directories are not traversed.

    Returns:
        A ``(filenames, images)`` tuple. ``filenames`` lists the names of the
        files that loaded successfully; ``images`` is ``np.array`` applied to
        the list of flattened pixel arrays, in the same order.
    """
    extensions = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')
    images = []
    filenames = []
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(extensions):
            continue
        file_path = os.path.join(folder_path, filename)
        try:
            with Image.open(file_path) as img:
                # Normalise to 3-channel RGB so every image has the same
                # channel layout before flattening.
                rgb = img.convert('RGB')
                images.append(np.array(rgb).flatten())
                filenames.append(filename)
        except Exception as e:
            # Deliberate best-effort: one unreadable file must not abort
            # the whole folder.
            print(f"Error processing file {file_path}: {e}")

    # NOTE(review): the rows only stack into a regular 2-D array when every
    # image has the same dimensions; how NumPy treats mixed lengths depends
    # on its version -- confirm the dataset is uniformly sized.
    return filenames, np.array(images)

def create_dataframe(filenames, images):
    """Build a DataFrame from image data with the file names in front.

    Args:
        filenames: Values for the leading ``'filename'`` column.
        images: Data handed to ``pd.DataFrame``; its columns keep the
            default labels pandas assigns.

    Returns:
        The DataFrame, with ``'filename'`` inserted at position 0.
    """
    frame = pd.DataFrame(data=images)
    frame.insert(loc=0, column='filename', value=filenames)
    return frame

def write_to_parquet(df, output_file):
    """Write ``df`` to ``output_file`` as Parquet via a pyarrow Table.

    Args:
        df: DataFrame to convert with ``pa.Table.from_pandas``.
        output_file: Destination passed to ``pq.write_table``.
    """
    pq.write_table(pa.Table.from_pandas(df), output_file)

# Where the source images live, and where the Parquet output goes
folder_path = '../datasets/ASL_train/A_ASL'
output_file = '../datasets/ASL_train_A_ASL.parquet'

# Load the images, tabulate them, then save the table as Parquet
filenames, images = read_images_from_folder(folder_path)
df = create_dataframe(filenames, images)
write_to_parquet(df, output_file)


# NOTE(review): stray notebook output, presumably a warning/traceback excerpt:
#   table = pa.Table.from_pandas(df)
