## Dataset Splitting Function
Define a function that splits the dataset into training, validation, and test sets (roughly 80% / 15% / 5% per class). The call to the function is currently commented out.

In [1]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['ABSL_LOG_LEVEL'] = 'FATAL'

import shutil
from sklearn.model_selection import train_test_split

# Source dataset root; split_dataset() treats every entry in it as one class folder.
base_dir = '../data/garbage-dataset'
# Class names as reported by the filesystem (os.listdir order, not sorted).
classes = os.listdir(base_dir)

# Destination roots for the three splits.
train_dir, val_dir, test_dir = (
    '../data/garbage-split/train',
    '../data/garbage-split/val',
    '../data/garbage-split/test',
)

def split_dataset():
    """Copy each class folder of ``base_dir`` into train/val/test folders.

    For every class sub-directory of ``base_dir`` the file names are split
    twice with ``train_test_split`` (``random_state=42``): first 80% / 20%,
    then the 20% remainder 75% / 25%. This yields roughly 80% train,
    15% validation and 5% test. Files are copied (not moved) to
    ``<split_root>/<class_name>/<file_name>`` under ``train_dir``,
    ``val_dir`` and ``test_dir``.

    Class and file listings are sorted before splitting: ``os.listdir``
    returns entries in arbitrary order, so without sorting the fixed
    ``random_state`` would not give the same split on every machine.
    Entries of ``base_dir`` that are not directories, and entries of a class
    folder that are not regular files, are skipped.

    Note: destination folders are created with ``exist_ok=True`` and are not
    cleared first, so files already present from an earlier run stay in place.
    """
    class_names = sorted(
        entry for entry in os.listdir(base_dir)
        if os.path.isdir(os.path.join(base_dir, entry))
    )
    for class_name in class_names:
        class_dir = os.path.join(base_dir, class_name)
        imgs = sorted(
            name for name in os.listdir(class_dir)
            if os.path.isfile(os.path.join(class_dir, name))
        )
        # First cut: 80% train, 20% held out.
        train_imgs, temp_imgs = train_test_split(imgs, test_size=0.2, random_state=42)
        # Second cut on the held-out part: 75% validation, 25% test.
        val_imgs, test_imgs = train_test_split(temp_imgs, test_size=0.25, random_state=42)
        for split, split_imgs in zip([train_dir, val_dir, test_dir], [train_imgs, val_imgs, test_imgs]):
            os.makedirs(os.path.join(split, class_name), exist_ok=True)
            for img in split_imgs:
                src = os.path.join(class_dir, img)
                dst = os.path.join(split, class_name, img)
                shutil.copyfile(src, dst)

# One-off step: uncomment the call below to build the train/val/test folders.
# split_dataset()

In [None]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
os.environ['ABSL_LOG_LEVEL'] = 'FATAL'

import numpy as np
from tensorflow.keras.models import load_model # type: ignore
from tensorflow.keras.preprocessing.image import ImageDataGenerator # type: ignore
from collections import defaultdict

# Mean and standard deviation (three values each) used by PyTorch pre-trained models
mean = np.array([0.485, 0.456, 0.406])
std = np.array([0.229, 0.224, 0.225])

def pytorch_normalize(img):
    """Scale ``img`` by 1/255, then standardize it with ``mean`` and ``std``.

    ``mean`` and ``std`` are broadcast against ``img``, so this assumes the
    last axis of ``img`` has length 3 (one entry per channel) and that pixel
    values are in the 0-255 range.

    Returns a new array; ``img`` itself is not modified.
    """
    unit_scaled = img / 255.0
    centered = unit_scaled - mean
    return centered / std

def evaluate_per_class(model, model_name, preprocessing_func,
                       test_dir='../data/garbage-split/test',
                       target_size=(224, 224), batch_size=32):
    """Evaluate ``model`` on a directory of test images and print a report.

    Images are read from ``test_dir`` (one sub-folder per class) through an
    ``ImageDataGenerator`` that applies ``preprocessing_func``. The report
    contains the overall accuracy and, for each class, the number of correct
    and wrong predictions and the class accuracy.

    Args:
        model: Object exposing ``predict(generator, verbose=...)`` that
            returns one row of class scores per image.
        model_name: Label used in the printed header.
        preprocessing_func: Callable passed to ``ImageDataGenerator`` as
            ``preprocessing_function``.
        test_dir: Root folder of the test split.
        target_size: Size the generator resizes images to.
        batch_size: Number of images per generator batch.

    Returns:
        Overall accuracy as a percentage (0-100).

    Raises:
        ValueError: If no test images are found under ``test_dir``.
    """
    test_datagen = ImageDataGenerator(preprocessing_function=preprocessing_func)

    test_generator = test_datagen.flow_from_directory(
        test_dir,
        target_size=target_size,
        batch_size=batch_size,
        class_mode='categorical',
        shuffle=False  # predictions are compared position-by-position with .classes
    )

    true_labels = np.asarray(test_generator.classes)
    class_indices = test_generator.class_indices
    inv_class_indices = {v: k for k, v in class_indices.items()}
    num_classes = len(class_indices)

    total = len(true_labels)
    if total == 0:
        # Fail early with a clear message instead of dividing by zero below.
        raise ValueError(f"No test images found under {test_dir!r}")

    print(f"\n=== {model_name} Model Evaluation ===")
    predictions = model.predict(test_generator, verbose=1)
    predicted_labels = np.argmax(predictions, axis=1)

    # Overall counters
    is_correct = predicted_labels == true_labels
    correct = np.sum(is_correct)
    overall_acc = correct / total * 100

    # Per-class counters, indexed by class id; minlength keeps a slot for
    # classes that have no test images.
    class_total = np.bincount(true_labels, minlength=num_classes)
    class_correct = np.bincount(true_labels[is_correct], minlength=num_classes)

    print(f"Total Images: {total}")
    print(f"Correct Predictions: {correct}")
    print(f"Wrong Predictions: {total - correct}")
    print(f"Overall Accuracy: {overall_acc:.2f}%")

    print("\nPer-class results:")
    for class_id in range(num_classes):
        total_cls = int(class_total[class_id])
        correct_cls = int(class_correct[class_id])
        wrong_cls = total_cls - correct_cls
        acc = (correct_cls / total_cls * 100) if total_cls > 0 else 0.0
        print(f"  {inv_class_indices[class_id]:<20} Correct: {correct_cls:<3}  Wrong: {wrong_cls:<3}  Accuracy: {acc:.2f}%")

    return overall_acc

# Restore the three saved models from saved_models/.
resnet50_model = load_model("saved_models/best_resnet50.keras")
custom_cnn_model = load_model("saved_models/best_custom_cnn.keras")
mobilenetv2_model = load_model("saved_models/best_mobilenetv2.keras")

# Score each model in turn; all three are fed through the same preprocessing function.
resnet50_acc = evaluate_per_class(
    model=resnet50_model,
    model_name="ResNet50",
    preprocessing_func=pytorch_normalize,
)
custom_cnn_acc = evaluate_per_class(
    model=custom_cnn_model,
    model_name="Custom CNN",
    preprocessing_func=pytorch_normalize,
)
mobilenetv2_acc = evaluate_per_class(
    model=mobilenetv2_model,
    model_name="MobileNetV2",
    preprocessing_func=pytorch_normalize,
)

# Recap of the overall accuracies, one line per model.
print(
    "\n=== Model Accuracy Summary ===",
    f"ResNet50 Accuracy:     {resnet50_acc:.2f}%",
    f"Custom CNN Accuracy:   {custom_cnn_acc:.2f}%",
    f"MobileNetV2 Accuracy:  {mobilenetv2_acc:.2f}%",
    sep="\n",
)


Found 993 images belonging to 10 classes.

=== ResNet50 Model Evaluation ===


  self._warn_if_super_not_called()
I0000 00:00:1753898735.955687   60580 service.cc:145] XLA service 0x7af524002b80 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1753898735.955796   60580 service.cc:153]   StreamExecutor device (0): NVIDIA GeForce GTX 1660 Ti with Max-Q Design, Compute Capability 7.5


[1m 2/32[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m2s[0m 90ms/step

I0000 00:00:1753898740.922593   60580 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 213ms/step
Total Images: 993
Correct Predictions: 965
Wrong Predictions: 28
Overall Accuracy: 97.18%

Per-class results:
  battery              Correct: 48   Wrong: 0    Accuracy: 100.00%
  biological           Correct: 47   Wrong: 3    Accuracy: 94.00%
  cardboard            Correct: 87   Wrong: 5    Accuracy: 94.57%
  clothes              Correct: 262  Wrong: 5    Accuracy: 98.13%
  glass                Correct: 150  Wrong: 4    Accuracy: 97.40%
  metal                Correct: 48   Wrong: 3    Accuracy: 94.12%
  paper                Correct: 82   Wrong: 2    Accuracy: 97.62%
  plastic              Correct: 97   Wrong: 3    Accuracy: 97.00%
  shoes                Correct: 98   Wrong: 1    Accuracy: 98.99%
  trash                Correct: 46   Wrong: 2    Accuracy: 95.83%
Found 993 images belonging to 10 classes.

=== Custom CNN Model Evaluation ===
[1m32/32[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 210ms/ste