In [1]:
import os
import shutil
import numpy as np
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense, Flatten, Dropout
from tensorflow.keras.models import Model, load_model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# --- Configuration Section ---
# Filesystem locations. Both are absolute, machine-specific paths:
# adjust them to wherever the dataset was downloaded and unzipped.
dataset_dir = "C:\\Users\\Admin\\Downloads\\smart-sorting\\Fruit And Vegetable Diseases Dataset"
# Root folder that receives the train/val/test split of the dataset.
output_dir = "C:\\Users\\Admin\\Downloads\\smart-sorting\\output_dataset"

# Image size fed to the network (height, width) and images per batch.
IMG_SIZE = (224, 224)
BATCH_SIZE = 32

# Epoch budget, 7 in total:
#   phase 1 - base layers frozen, only the new head is trained
#   phase 2 - base layers unfrozen for fine-tuning
EPOCHS_INITIAL = 3
EPOCHS_FINE_TUNE = 4

# Fine-tuning uses a learning rate ten times smaller than the initial phase.
LEARNING_RATE_INITIAL = 1e-4
LEARNING_RATE_FINE_TUNE = 1e-5

# Epochs without val_accuracy improvement tolerated before stopping early.
EARLY_STOPPING_PATIENCE = 3

# File name under which the best checkpoint is written.
MODEL_SAVE_NAME = 'healthy_vs_rotten_best_model.h5'

# --- Step 1: Data Preparation ---
# Builds a fresh train/val/test copy of the dataset under output_dir:
# one sub-directory per class inside each of 'train', 'val' and 'test'.
print("--- Step 1: Data Preparation ---")

# The dataset root must be an existing directory; everything below reads from it.
if not os.path.isdir(dataset_dir):
    print(f"Error: Dataset directory not found at {dataset_dir}")
    print("Please ensure the path is correct and the dataset is unzipped.")
    # Non-zero status so a calling shell sees the failure (bare exit() reports success).
    raise SystemExit(1)

# Each sub-directory of the dataset root is one class. Plain files lying in the
# root are skipped (they would otherwise be treated as classes and crash the
# per-class listing below). Sorted because os.listdir() order is arbitrary.
classes = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)
if not classes:
    print(f"Error: No classes (subdirectories) found in {dataset_dir}")
    raise SystemExit(1)

print(f"Found classes: {classes}")

# Start from an empty output tree so images from a previous split cannot leak
# into the new one, then create <output_dir>/<subset>/<class> for every pair.
print(f"Cleaning up and creating dataset split directories in: {output_dir}")
shutil.rmtree(output_dir, ignore_errors=True)
os.makedirs(output_dir, exist_ok=True)
for subset in ['train', 'val', 'test']:
    for cls in classes:
        os.makedirs(os.path.join(output_dir, subset, cls), exist_ok=True)

# Copy images to respective directories and split
print("Splitting and copying images to train, validation, and test directories...")
for cls in classes:
    class_dir = os.path.join(dataset_dir, cls)

    # Only regular files are candidates (shutil.copy cannot copy a nested
    # directory). The list is sorted so that random_state=42 really yields the
    # same split on every run, independent of the filesystem's listing order.
    images = sorted(
        name for name in os.listdir(class_dir)
        if os.path.isfile(os.path.join(class_dir, name))
    )

    print(f"Processing {len(images)} images for class: {cls}")

    if len(images) < 3:
        # train_test_split raises when a requested subset would be empty, which
        # happens for fewer than 3 samples with the two-stage split below.
        print(f"Warning: class '{cls}' has only {len(images)} image(s); "
              "placing all of them in the training set.")
        train_images, val_images, test_images = images, [], []
    else:
        # Target proportions: 60% train, 20% validation, 20% test.
        # First split: 80% for train+val, 20% for test.
        train_and_val_images, test_images = train_test_split(
            images, test_size=0.2, random_state=42
        )
        # Second split: 25% of the remaining 80% becomes validation (0.25 * 0.8 = 0.2).
        train_images, val_images = train_test_split(
            train_and_val_images, test_size=0.25, random_state=42
        )

    for subset, subset_images in (
        ('train', train_images),
        ('val', val_images),
        ('test', test_images),
    ):
        subset_class_dir = os.path.join(output_dir, subset, cls)
        for img in subset_images:
            shutil.copy(os.path.join(class_dir, img), os.path.join(subset_class_dir, img))

print("Dataset split into training, validation, and test sets successfully!")

# --- Step 2: Image Data Generators ---
print("\n--- Step 2: Setting up Image Data Generators ---")

# Root folder of each split created in Step 1.
train_dir, val_dir, test_dir = (
    os.path.join(output_dir, subset) for subset in ('train', 'val', 'test')
)

# Random transformations applied to training images only.
augmentation_options = {
    'rotation_range': 20,
    'width_shift_range': 0.2,
    'height_shift_range': 0.2,
    'shear_range': 0.2,
    'zoom_range': 0.2,
    'horizontal_flip': True,
    'fill_mode': 'nearest',
}

# Options common to all three directory iterators.
flow_options = {
    'target_size': IMG_SIZE,
    'batch_size': BATCH_SIZE,
    'class_mode': 'categorical',
}

# Training pipeline: ResNet50 preprocessing plus augmentation.
train_datagen = ImageDataGenerator(
    preprocessing_function=preprocess_input,
    **augmentation_options,
)

# Validation/test pipeline: ResNet50 preprocessing only, no augmentation.
val_test_datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

train_generator = train_datagen.flow_from_directory(train_dir, **flow_options)

val_generator = val_test_datagen.flow_from_directory(val_dir, **flow_options)

# Test data is not shuffled so the sample order stays fixed for evaluation.
test_generator = val_test_datagen.flow_from_directory(
    test_dir,
    shuffle=False,
    **flow_options,
)

# Report the class-name -> index mapping seen by each generator.
print("\nTrain Generator Class Indices:", train_generator.class_indices)
print("Validation Generator Class Indices:", val_generator.class_indices)
print("Test Generator Class Indices:", test_generator.class_indices)

# Report split sizes and the number of classes.
print(f"\nNumber of training samples: {train_generator.samples}")
print(f"Number of validation samples: {val_generator.samples}")
print(f"Number of test samples: {test_generator.samples}")
print(f"Number of classes: {train_generator.num_classes}")

# --- Step 3: Model Building ---
print("\n--- Step 3: Building the Model ---")

# ResNet50 with ImageNet weights and without its own classification top;
# the input is IMG_SIZE with 3 channels.
base_model = ResNet50(
    include_top=False,
    weights='imagenet',
    input_shape=(*IMG_SIZE, 3),
)

# Freeze every base layer so the initial phase only trains the new head.
for layer in base_model.layers:
    layer.trainable = False

print("ResNet50 base model loaded and layers frozen.")

# New head stacked on the base output:
# Flatten -> Dense(512, relu) -> Dropout(0.5) -> softmax over the classes.
flattened = Flatten()(base_model.output)
hidden = Dense(512, activation='relu')(flattened)
regularized = Dropout(0.5)(hidden)
output_layer = Dense(train_generator.num_classes, activation='softmax')(regularized)

# Full network from the base model's input to the new softmax output.
model = Model(inputs=base_model.input, outputs=output_layer)

print("Custom classification layers added.")

# --- Step 4: Model Compilation and Training (Initial Training with Frozen Base Layers) ---
print("\n--- Step 4: Compiling and Training (Initial Phase) ---")

# Adam at the initial learning rate; categorical cross-entropy matches the
# 'categorical' class_mode of the generators.
optimizer_initial = Adam(learning_rate=LEARNING_RATE_INITIAL)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer_initial,
    metrics=['accuracy'],
)

model.summary()
print("Model compiled for initial training.")

# Both callbacks watch validation accuracy and treat higher as better.
monitor_options = {'monitor': 'val_accuracy', 'mode': 'max', 'verbose': 1}

# Writes the model to MODEL_SAVE_NAME only when val_accuracy improves.
checkpoint = ModelCheckpoint(
    MODEL_SAVE_NAME,
    save_best_only=True,
    **monitor_options,
)

# Stops after EARLY_STOPPING_PATIENCE epochs without improvement and rolls the
# model back to the weights of its best epoch.
early_stopping = EarlyStopping(
    patience=EARLY_STOPPING_PATIENCE,
    restore_best_weights=True,
    **monitor_options,
)

callbacks_list = [checkpoint, early_stopping]

print(f"\nStarting initial model training for up to {EPOCHS_INITIAL} epochs (with early stopping and model checkpoint)...")
history_initial = model.fit(
    train_generator,
    validation_data=val_generator,
    epochs=EPOCHS_INITIAL,
    callbacks=callbacks_list,
)

print("\nInitial model training complete.")

# --- Step 5: Fine-tuning (Unfreeze Base Model Layers and Retrain) ---
print("\n--- Step 5: Starting Fine-tuning Phase ---")

# Unfreeze all layers of the base model for fine-tuning.
# A common alternative is to unfreeze only a few top layers
# (e.g., base_model.layers[-50:]).
# NOTE(review): this also makes the base model's BatchNormalization layers
# trainable; the Keras transfer-learning guide recommends keeping those in
# inference mode while fine-tuning - confirm whether that matters here.
for layer in base_model.layers:
    layer.trainable = True

# Changing `trainable` only takes effect after compile(), so recompile, and do
# it with a much lower learning rate so the pre-trained weights are adjusted
# gently rather than overwritten.
optimizer_fine_tune = Adam(learning_rate=LEARNING_RATE_FINE_TUNE)
model.compile(optimizer=optimizer_fine_tune,
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.summary()  # Trainable parameter count increases now that the base is unfrozen

# Number of epochs phase 1 actually completed. This can be smaller than
# EPOCHS_INITIAL if early stopping ended it, and is 0 if it ran no epochs
# (history_initial.epoch is then empty, so indexing [-1] would fail).
fine_tune_start_epoch = history_initial.epoch[-1] + 1 if history_initial.epoch else 0
# `epochs` in fit() is the index of the final epoch, not a count, so derive it
# from the real starting point. That way fine-tuning runs at most
# EPOCHS_FINE_TUNE epochs, as announced, even when phase 1 stopped early.
fine_tune_end_epoch = fine_tune_start_epoch + EPOCHS_FINE_TUNE

print(f"\nStarting fine-tuning for up to {EPOCHS_FINE_TUNE} additional epochs (with early stopping and model checkpoint)...")
# Continue training with the unfrozen layers, numbering epochs from where the
# initial phase left off. The same callbacks are reused.
history_fine_tune = model.fit(
    train_generator,
    epochs=fine_tune_end_epoch,
    initial_epoch=fine_tune_start_epoch,
    validation_data=val_generator,
    callbacks=callbacks_list
)

print("\nFine-tuning complete.")

# --- Step 6: Final Model Evaluation and Saving ---
print("\n--- Step 6: Final Evaluation ---")

# Evaluate the checkpoint written by ModelCheckpoint (best validation accuracy).
# If it cannot be loaded for any reason, report the error and fall back to the
# in-memory model in its current state instead of aborting.
try:
    best_model = load_model(MODEL_SAVE_NAME)
except Exception as e:
    print(f"Error loading the best model: {e}")
    print("Proceeding with the last state of the trained model. Consider checking the save path and previous training run.")
    best_model = model
else:
    print(f"\nLoaded the best model from '{MODEL_SAVE_NAME}' for final evaluation.")

# evaluate() returns the loss followed by the compiled metrics (accuracy here).
test_metrics = best_model.evaluate(test_generator, verbose=1)
loss, accuracy = test_metrics
print(f"\nFinal Test Loss: {loss:.4f}")
print(f"Final Test Accuracy: {accuracy:.4f}")

print(f"\nTraining script finished. The best model is saved as '{MODEL_SAVE_NAME}'.")

--- Step 1: Data Preparation ---
Found classes: ['Apple__Healthy', 'Apple__Rotten', 'Banana__Healthy', 'Banana__Rotten', 'Bellpepper__Healthy', 'Bellpepper__Rotten', 'Carrot__Healthy', 'Carrot__Rotten', 'Cucumber__Healthy', 'Cucumber__Rotten', 'Grape__Healthy', 'Grape__Rotten', 'Guava__Healthy', 'Guava__Rotten', 'Jujube__Healthy', 'Jujube__Rotten', 'Mango__Healthy', 'Mango__Rotten', 'Orange__Healthy', 'Orange__Rotten', 'Pomegranate__Healthy', 'Pomegranate__Rotten', 'Potato__Healthy', 'Potato__Rotten', 'Strawberry__Healthy', 'Strawberry__Rotten', 'Tomato__Healthy', 'Tomato__Rotten']
Cleaning up and creating dataset split directories in: C:\Users\Admin\Downloads\smart-sorting\output_dataset
Splitting and copying images to train, validation, and test directories...
Processing 2438 images for class: Apple__Healthy
Processing 2930 images for class: Apple__Rotten
Processing 2000 images for class: Banana__Healthy
Processing 2800 images for class: Banana__Rotten
Processing 611 images for class

Model compiled for initial training.

Starting initial model training for up to 3 epochs (with early stopping and model checkpoint)...


  self._warn_if_super_not_called()


Epoch 1/3
2/549 ━━━━━━━━━━━━━━━━━━━━ 37:58 4s/step - accuracy: 0.0625 - loss: 7.4848



549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 4s/step - accuracy: 0.6345 - loss: 1.7161
Epoch 1: val_accuracy improved from -inf to 0.93562, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 3095s 6s/step - accuracy: 0.6347 - loss: 1.7148 - val_accuracy: 0.9356 - val_loss: 0.2151
Epoch 2/3
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 4s/step - accuracy: 0.8425 - loss: 0.5477
Epoch 2: val_accuracy improved from 0.93562 to 0.95338, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 3069s 6s/step - accuracy: 0.8425 - loss: 0.5476 - val_accuracy: 0.9534 - val_loss: 0.1684
Epoch 3/3
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 4s/step - accuracy: 0.8764 - loss: 0.4498
Epoch 3: val_accuracy improved from 0.95338 to 0.95355, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 3013s 5s/step - accuracy: 0.8764 - loss: 0.4498 - val_accuracy: 0.9536 - val_loss: 0.1514
Restoring model weights from the end of the best epoch: 3.

Initial model training complete.

--- Step 5: Starting Fine-tuning Phase ---



Starting fine-tuning for up to 4 additional epochs (with early stopping and model checkpoint)...
Epoch 4/7
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 20s/step - accuracy: 0.8712 - loss: 0.4496
Epoch 4: val_accuracy improved from 0.95355 to 0.96824, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 11495s 21s/step - accuracy: 0.8713 - loss: 0.4494 - val_accuracy: 0.9682 - val_loss: 0.1052
Epoch 5/7
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 19s/step - accuracy: 0.9356 - loss: 0.2126
Epoch 5: val_accuracy improved from 0.96824 to 0.97217, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 11126s 20s/step - accuracy: 0.9356 - loss: 0.2126 - val_accuracy: 0.9722 - val_loss: 0.0891
Epoch 6/7
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 19s/step - accuracy: 0.9501 - loss: 0.1639
Epoch 6: val_accuracy improved from 0.97217 to 0.97456, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 11198s 20s/step - accuracy: 0.9502 - loss: 0.1639 - val_accuracy: 0.9746 - val_loss: 0.0790
Epoch 7/7
549/549 ━━━━━━━━━━━━━━━━━━━━ 0s 19s/step - accuracy: 0.9564 - loss: 0.1415
Epoch 7: val_accuracy improved from 0.97456 to 0.97848, saving model to healthy_vs_rotten_best_model.h5




549/549 ━━━━━━━━━━━━━━━━━━━━ 10849s 20s/step - accuracy: 0.9564 - loss: 0.1415 - val_accuracy: 0.9785 - val_loss: 0.0766
Restoring model weights from the end of the best epoch: 7.

Fine-tuning complete.

--- Step 6: Final Evaluation ---





Loaded the best model from 'healthy_vs_rotten_best_model.h5' for final evaluation.
184/184 ━━━━━━━━━━━━━━━━━━━━ 634s 3s/step - accuracy: 0.9817 - loss: 0.0712

Final Test Loss: 0.0875
Final Test Accuracy: 0.9776

Training script finished. The best model is saved as 'healthy_vs_rotten_best_model.h5'.
