**Data Augmentation:** Apply data augmentation techniques to increase the diversity of your training dataset. Common augmentations include:

- Random rotation
- Random cropping
- Random flipping (horizontal and vertical)
- Random lighting changes

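These augmentations help the model generalize better to variations in the data. Note that the generator configured below applies random rotation, width/height shifts, shear, zoom, and horizontal flips; vertical flipping, cropping, and lighting changes are not enabled.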

In [30]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.image import ImageDataGenerator
import pandas as pd
import numpy as np
import os
from PIL import Image
from sklearn.model_selection import train_test_split
import h5py  # We'll use the h5py library to save NumPy arrays efficiently
from tqdm import tqdm

In [21]:
# Folder holding the reduced image set.
image_folder = '../data/reduced_images_10000/'

# os.listdir returns entries in arbitrary, filesystem-dependent order. Sorting
# fixes the sample order, so any seeded shuffle or split that consumes this
# list gives the same result on every machine and every run.
reduced_images = sorted(os.listdir(image_folder))

# Label table; the matching cell reads its 'id' and 'label' columns.
reduced_labels = pd.read_csv("../data/reduced_labels_10000.csv")

In [22]:
# Pair every image file with its label through the 'id' column of
# reduced_labels, producing x_data (pixel arrays) and y_data (labels).

# Build the id -> label mapping once, so each file costs a dict lookup instead
# of a full boolean scan of the DataFrame. keep='first' preserves the earlier
# behaviour of taking the first matching row when an id appears more than once.
unique_label_rows = reduced_labels.drop_duplicates(subset='id', keep='first')
label_by_id = dict(zip(unique_label_rows['id'], unique_label_rows['label']))

x_data = []  # List to store the images
y_data = []  # List to store the corresponding labels

for filename in reduced_images:
    # The identifier is everything before the first '.' in the filename.
    image_id = filename.split('.')[0]

    # Files without a label row are skipped silently, as before.
    if image_id not in label_by_id:
        continue

    label = label_by_id[image_id]
    image_path = os.path.join(image_folder, filename)

    # The context manager closes the underlying file handle promptly instead
    # of leaving thousands of files open until garbage collection.
    with Image.open(image_path) as raw_image:
        # Forcing RGB gives every sample the same (96, 96, 3) shape, so the
        # list stacks into one 4-D array rather than a ragged object array.
        image = np.array(raw_image.convert('RGB').resize((96, 96)))

    # Pixel values are left in the 0-255 range here; normalization happens
    # in a later cell.
    x_data.append(image)
    y_data.append(label)

# Convert lists to NumPy arrays
x_data = np.array(x_data)
y_data = np.array(y_data)

In [23]:
# Random geometric augmentations, applied on the fly to each generated batch.
_augmentation_kwargs = {
    'rotation_range': 40,        # rotate by up to +/- 40 degrees
    'width_shift_range': 0.2,    # horizontal shift, as a fraction of the width
    'height_shift_range': 0.2,   # vertical shift, as a fraction of the height
    'shear_range': 0.2,          # shear intensity
    'zoom_range': 0.2,           # zoom factor drawn from [0.8, 1.2]
    'horizontal_flip': True,     # randomly mirror left/right
    'fill_mode': 'nearest',      # fill exposed pixels with the nearest value
}
datagen = ImageDataGenerator(**_augmentation_kwargs)

In [24]:
# Iterator yielding shuffled, randomly augmented (images, labels) batches of 32.
# NOTE(review): the generator is fed the complete x_data / y_data, i.e. before
# the train/validation/test split performed later in the notebook, so its
# batches include samples that end up in the validation and test sets.
# Confirm whether it should be built from the training split only.
train_generator = datagen.flow(
    x=x_data,
    y=y_data,
    seed=42,
    shuffle=True,
    batch_size=32,
)

#### Split reduced train data (from Kaggle) into training (70%), validation (15%), and test (15%) sets

In [25]:

# Stage 1: keep 70% of the samples for training and set the other 30% aside.
(x_train, x_temp, y_train, y_temp) = train_test_split(
    x_data,
    y_data,
    random_state=42,
    test_size=0.3,
)

# Stage 2: cut the held-out 30% in half -> 15% validation, 15% test.
(x_val, x_test, y_val, y_test) = train_test_split(
    x_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

#### Resize and Normalization

In [26]:
# Define the target size (e.g., 224x224)
target_size = (224, 224)


def _resize_images(images, size):
    """Resize every image array in `images` to `size` using PIL.

    Args:
        images: iterable of image arrays accepted by `Image.fromarray`.
        size: (width, height) tuple passed to `Image.resize`.

    Returns:
        A list of NumPy arrays, one per input image, in the same order.
    """
    return [np.array(Image.fromarray(image).resize(size)) for image in images]


# Resize each split with the same helper (previously three copy-pasted
# comprehensions).
x_train_resized = _resize_images(x_train, target_size)
x_val_resized = _resize_images(x_val, target_size)
x_test_resized = _resize_images(x_test, target_size)

# Normalize pixel values to [0, 1].
# Stacking as float32 before dividing keeps the result float32. Dividing a
# uint8 array by 255.0 promotes to float64 instead, which doubles both memory
# use and the size of anything saved from these arrays.
x_train_normalized = np.asarray(x_train_resized, dtype=np.float32) / 255.0
x_val_normalized = np.asarray(x_val_resized, dtype=np.float32) / 255.0
x_test_normalized = np.asarray(x_test_resized, dtype=np.float32) / 255.0

In [33]:
# NOTE(review): output_dir is not assigned in any cell visible in this part of
# the notebook. Make sure it is defined before this cell so that a
# Restart & Run All succeeds.

# h5py does not create missing parent directories, so create the target
# folder first. An empty output_dir means "current directory" and is skipped.
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Define file paths for train, validation, and test sets
train_file = os.path.join(output_dir, 'train.h5')
val_file = os.path.join(output_dir, 'validation.h5')
test_file = os.path.join(output_dir, 'test.h5')


def _write_h5(path, datasets):
    """Write each name -> array pair in `datasets` to a fresh HDF5 file.

    Args:
        path: destination file; mode 'w' truncates any existing file.
        datasets: mapping of dataset name to the array stored under it.
    """
    with h5py.File(path, 'w') as hf:
        for dataset_name, values in datasets.items():
            hf.create_dataset(dataset_name, data=values)


# Create HDF5 files for each set
_write_h5(train_file, {
    'x_train_normalized': x_train_normalized,
    'y_train': y_train,
})
_write_h5(val_file, {
    'x_val_normalized': x_val_normalized,
    'y_val': y_val,
})
_write_h5(test_file, {
    'x_test_normalized': x_test_normalized,
    'y_test': y_test,
})

Saving Train Data:   0%|          | 1/7000 [00:00<00:16, 433.83it/s]


ValueError: Unable to create dataset (name already exists)