In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from PIL import Image

# Read the metadata table; it is expected to provide a 'filename' column and
# a 'label' column.
df = pd.read_csv("labels.csv")

# Turn the class labels into integer ids and keep them on the DataFrame.
le = LabelEncoder()
label_ids = le.fit_transform(df['label'])
df['encoded_label'] = label_ids

# Also keep a one-hot version of the ids, stored as one plain Python list
# per row.
one_hot = to_categorical(df['encoded_label'])
df['encoded_label_cat'] = one_hot.tolist()

# Hold out 20% of the rows for validation. The split is stratified on the
# integer ids and uses a fixed seed so it is reproducible.
train_df, val_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['encoded_label'],
    random_state=42,
)

# Training images: multiply pixel values by 1/255 and apply random
# augmentations (rotation, width/height shifts, horizontal and vertical
# flips, brightness changes) with the ranges listed here.
augmentation_settings = {
    "rescale": 1.0 / 255,
    "rotation_range": 20,
    "width_shift_range": 0.1,
    "height_shift_range": 0.1,
    "horizontal_flip": True,
    "vertical_flip": True,
    "brightness_range": [0.8, 1.2],
}
train_datagen = ImageDataGenerator(**augmentation_settings)

# Validation images: same 1/255 rescaling, no random augmentation.
val_datagen = ImageDataGenerator(rescale=1.0 / 255)

# Flow from dataframe: build the batch iterators that load the images listed
# in train_df / val_df from 'train_images/'.
#
# The class list is taken from the fitted LabelEncoder and passed explicitly
# to both generators. Without `classes=`, each generator infers its own
# class -> index mapping from the labels present in its DataFrame, so a class
# that happens to be missing from the validation split would change the
# one-hot width and indices there, and neither mapping would be tied to
# `le.classes_`, which is what gets saved for inference.
class_names = le.classes_.tolist()

# Options shared by the training and validation generators.
flow_options = dict(
    directory='train_images/',
    x_col='filename',
    y_col='label',
    target_size=(300, 300),
    class_mode='categorical',
    classes=class_names,
    batch_size=32,
)

# Training batches are reshuffled; augmentation comes from train_datagen.
train_generator = train_datagen.flow_from_dataframe(
    dataframe=train_df,
    shuffle=True,
    **flow_options
)

# Validation batches keep the DataFrame order so predictions can be lined up
# with val_df rows.
val_generator = val_datagen.flow_from_dataframe(
    dataframe=val_df,
    shuffle=False,
    **flow_options
)

# Save label mapping for inference: position i holds the label that the
# LabelEncoder mapped to integer id i.
#
# When the labels are Python strings held in an object-dtype array, np.save
# can only write them via pickle, and np.load then refuses to read the file
# unless allow_pickle=True is passed. Converting to a fixed-width unicode
# array avoids pickle while keeping the same values and order. The
# conversion is applied only when every entry is a str, so any other label
# type is saved exactly as before.
label_classes = le.classes_
if label_classes.dtype == object and all(
    isinstance(name, str) for name in label_classes
):
    label_classes = label_classes.astype(str)
np.save("label_classes.npy", label_classes)
