In [8]:
import tensorflow as tf
import os
import numpy as np
import matplotlib.pyplot as plt
import tkinter as tk
from tkinter import filedialog
from sklearn.utils.class_weight import compute_class_weight

# --- Step 1: Configuration Constants ---
IMAGE_SIZE = (300, 300)
BATCH_SIZE = 32

In [9]:
def select_dataset_directory():
    """Opens a folder selection dialog and returns the selected path."""
    root = tk.Tk()
    root.withdraw() 
    print("Opening a dialog box to choose your dataset folder...")
    directory_path = filedialog.askdirectory(
        title="Please select your dataset folder (e.g., FINAL DATASET)"
    )
    root.destroy()
    return directory_path

In [10]:
def create_data_generators(dataset_dir):
    """Creates and returns training and validation data generators with a 80/20 split."""
    datagen = tf.keras.preprocessing.image.ImageDataGenerator(
        preprocessing_function=tf.keras.applications.efficientnet.preprocess_input,
        rotation_range=30,
        width_shift_range=0.2,
        height_shift_range=0.2,
        shear_range=0.2,
        zoom_range=0.2,
        horizontal_flip=True,
        fill_mode='nearest',
        validation_split=0.2  # Reserve 20% of data for validation
    )

    train_generator = datagen.flow_from_directory(
        directory=dataset_dir,
        target_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=True,
        subset='training'
    )

    validation_generator = datagen.flow_from_directory(
        directory=dataset_dir,
        target_size=IMAGE_SIZE,
        batch_size=BATCH_SIZE,
        class_mode='categorical',
        shuffle=False,
        subset='validation'
    )
    
    return train_generator, validation_generator

# --- Step 4: Class Weight Calculation ---
def calculate_class_weights(train_generator):
    """Calculates class weights to handle data imbalance."""
    class_labels = np.unique(train_generator.classes)
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=class_labels,
        y=train_generator.classes
    )
    class_weights_dict = dict(zip(class_labels, class_weights))
    return class_weights_dict

# --- Main Execution Block (for Testing) ---
if __name__ == "__main__":
    DATASET_DIR = select_dataset_directory()

    if DATASET_DIR:
        print(f"\n✅ Dataset folder selected: {DATASET_DIR}")

        # Test Step 3
        train_gen, val_gen = create_data_generators(DATASET_DIR)
        
        print("\n--- Test Results for Data Splitting ---")
        print(f"Found {train_gen.samples} images for training.")
        print(f"Found {val_gen.samples} images for validation.")
        print(f"Number of classes: {train_gen.num_classes}")
        
        # Test Step 4
        weights = calculate_class_weights(train_gen)
        
        print("\n--- Test Results for Class Balancing ---")
        print("Calculated Class Weights (to handle imbalance):")
        print(weights)
        print("✅ Data pipeline is ready!")

    else:
        print("\n❌ No folder was selected.")

Opening a dialog box to choose your dataset folder...

✅ Dataset folder selected: C:/Users/Jay Surieya/Downloads/FINAL DATASET
Found 47928 images belonging to 110 classes.
Found 11975 images belonging to 110 classes.

--- Test Results for Data Splitting ---
Found 47928 images for training.
Found 11975 images for validation.
Number of classes: 110

--- Test Results for Class Balancing ---
Calculated Class Weights (to handle imbalance):
{np.int32(0): np.float64(10.992660550458716), np.int32(1): np.float64(0.5496330275229357), np.int32(2): np.float64(0.5496330275229357), np.int32(4): np.float64(0.5496330275229357), np.int32(5): np.float64(0.5496330275229357), np.int32(6): np.float64(10.992660550458716), np.int32(7): np.float64(10.992660550458716), np.int32(8): np.float64(10.992660550458716), np.int32(9): np.float64(9.160550458715596), np.int32(10): np.float64(10.992660550458716), np.int32(11): np.float64(10.992660550458716), np.int32(12): np.float64(0.5496330275229357), np.int32(13): np.f