In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import os

BASE_DIR = r"D:\SMART_VISION_AI\smartvision_dataset"
IMG_SIZE = (224, 224)
BATCH_SIZE = 32
IMG_SIZE   = (224, 224)

NUM_CLASSES = 25

train_dir = os.path.join(BASE_DIR, "classification", "train")
val_dir   = os.path.join(BASE_DIR, "classification", "val")
test_dir  = os.path.join(BASE_DIR, "classification", "test")

train_ds = tf.keras.utils.image_dataset_from_directory(
    train_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=True
)

val_ds = tf.keras.utils.image_dataset_from_directory(
    val_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)

test_ds = tf.keras.utils.image_dataset_from_directory(
    test_dir,
    image_size=IMG_SIZE,
    batch_size=BATCH_SIZE,
    shuffle=False
)

Found 1750 files belonging to 25 classes.
Found 375 files belonging to 25 classes.
Found 375 files belonging to 25 classes.


In [4]:
# 1.4. Data augmentation block (applied only on training data)
data_augmentation = keras.Sequential(
    [
        layers.RandomFlip("horizontal"),                # random horizontal flip
        layers.RandomRotation(0.04),                    # ~ ±15° (15/360 ≈ 0.04)
        layers.RandomZoom(0.1),                         # random zoom
        layers.RandomContrast(0.2),                     # ±20% contrast
        # Brightness jitter using Lambda + tf.image
        layers.Lambda(
            lambda x: tf.image.random_brightness(x, max_delta=0.2)
        ),
        # Optional: light color jitter via saturation
        layers.Lambda(
            lambda x: tf.image.random_saturation(x, lower=0.8, upper=1.2)
        ),
    ],
    name="data_augmentation",
)

# Normalization layer (0–1 scaling or ImageNet style)
normalization = layers.Rescaling(1./255)

In [5]:
# 2.1: Model 1 - VGG16

def build_vgg16_model():
    inputs = keras.Input(shape=(*IMG_SIZE, 3))
    x = data_augmentation(inputs)       # train only
    x = normalization(x)

    base_model = keras.applications.VGG16(
        include_top=False,
        weights="imagenet",
        input_tensor=x
    )
    base_model.trainable = False        # freeze convolutional base

    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x)

    model = keras.Model(inputs, outputs, name="VGG16_smartvision")
    return model
def compile_and_train(model, model_name, train_ds, val_ds, epochs=25, lr=1e-4):
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"]
    )

    callbacks = [
        keras.callbacks.ModelCheckpoint(
            filepath=f"{model_name}_best.h5",
            monitor="val_accuracy",
            save_best_only=True,
            mode="max"
        ),
        keras.callbacks.EarlyStopping(
            monitor="val_accuracy",
            patience=5,
            restore_best_weights=True
        ),
        keras.callbacks.ReduceLROnPlateau(
            monitor="val_loss",
            factor=0.5,
            patience=2,
            min_lr=1e-6,
            verbose=1
        )
    ]

    history = model.fit(
        train_ds,
        validation_data=val_ds,
        epochs=epochs,
        callbacks=callbacks
    )
    return history

vgg16_model = build_vgg16_model()
history_vgg16 = compile_and_train(vgg16_model, "vgg16", train_ds, val_ds, epochs=25)


Epoch 1/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.0313 - loss: 3.4989



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m630s[0m 11s/step - accuracy: 0.0309 - loss: 3.4421 - val_accuracy: 0.0293 - val_loss: 3.2345 - learning_rate: 1.0000e-04
Epoch 2/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13s/step - accuracy: 0.0512 - loss: 3.3057 



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m796s[0m 14s/step - accuracy: 0.0497 - loss: 3.2952 - val_accuracy: 0.0773 - val_loss: 3.1857 - learning_rate: 1.0000e-04
Epoch 3/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11s/step - accuracy: 0.0589 - loss: 3.2227 



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m763s[0m 14s/step - accuracy: 0.0549 - loss: 3.2327 - val_accuracy: 0.1200 - val_loss: 3.1538 - learning_rate: 1.0000e-04
Epoch 4/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12s/step - accuracy: 0.0548 - loss: 3.2219 



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m722s[0m 13s/step - accuracy: 0.0629 - loss: 3.2152 - val_accuracy: 0.1600 - val_loss: 3.1274 - learning_rate: 1.0000e-04
Epoch 5/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10s/step - accuracy: 0.0707 - loss: 3.1797 



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m714s[0m 13s/step - accuracy: 0.0777 - loss: 3.1695 - val_accuracy: 0.1973 - val_loss: 3.1023 - learning_rate: 1.0000e-04
Epoch 6/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12s/step - accuracy: 0.0944 - loss: 3.1293 



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1187s[0m 22s/step - accuracy: 0.0937 - loss: 3.1314 - val_accuracy: 0.2133 - val_loss: 3.0780 - learning_rate: 1.0000e-04
Epoch 7/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.1248 - loss: 3.0998



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m353s[0m 6s/step - accuracy: 0.1131 - loss: 3.0977 - val_accuracy: 0.2160 - val_loss: 3.0539 - learning_rate: 1.0000e-04
Epoch 8/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.1347 - loss: 3.0714



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m287s[0m 5s/step - accuracy: 0.1331 - loss: 3.0629 - val_accuracy: 0.2320 - val_loss: 3.0272 - learning_rate: 1.0000e-04
Epoch 9/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.1383 - loss: 3.0420



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m292s[0m 5s/step - accuracy: 0.1343 - loss: 3.0423 - val_accuracy: 0.2347 - val_loss: 3.0063 - learning_rate: 1.0000e-04
Epoch 10/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5s/step - accuracy: 0.1552 - loss: 3.0362



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m294s[0m 5s/step - accuracy: 0.1531 - loss: 3.0279 - val_accuracy: 0.2640 - val_loss: 2.9857 - learning_rate: 1.0000e-04
Epoch 11/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.1666 - loss: 2.9923



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m285s[0m 5s/step - accuracy: 0.1646 - loss: 2.9947 - val_accuracy: 0.2880 - val_loss: 2.9620 - learning_rate: 1.0000e-04
Epoch 12/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m284s[0m 5s/step - accuracy: 0.1783 - loss: 2.9715 - val_accuracy: 0.2693 - val_loss: 2.9436 - learning_rate: 1.0000e-04
Epoch 13/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m283s[0m 5s/step - accuracy: 0.1697 - loss: 2.9516 - val_accuracy: 0.2800 - val_loss: 2.9200 - learning_rate: 1.0000e-04
Epoch 14/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.1914 - loss: 2.9245



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m270s[0m 5s/step - accuracy: 0.1926 - loss: 2.9296 - val_accuracy: 0.3040 - val_loss: 2.9006 - learning_rate: 1.0000e-04
Epoch 15/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.1862 - loss: 2.9033



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m257s[0m 5s/step - accuracy: 0.1943 - loss: 2.9099 - val_accuracy: 0.3067 - val_loss: 2.8765 - learning_rate: 1.0000e-04
Epoch 16/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 5s/step - accuracy: 0.2046 - loss: 2.8864 - val_accuracy: 0.2827 - val_loss: 2.8566 - learning_rate: 1.0000e-04
Epoch 17/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4s/step - accuracy: 0.2050 - loss: 2.8514



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m274s[0m 5s/step - accuracy: 0.2057 - loss: 2.8519 - val_accuracy: 0.3227 - val_loss: 2.8289 - learning_rate: 1.0000e-04
Epoch 18/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m323s[0m 6s/step - accuracy: 0.2251 - loss: 2.8530 - val_accuracy: 0.3173 - val_loss: 2.8120 - learning_rate: 1.0000e-04
Epoch 19/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 6s/step - accuracy: 0.2303 - loss: 2.8209 - val_accuracy: 0.3200 - val_loss: 2.7889 - learning_rate: 1.0000e-04
Epoch 20/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m320s[0m 6s/step - accuracy: 0.2389 - loss: 2.7991 - val_accuracy: 0.3200 - val_loss: 2.7680 - learning_rate: 1.0000e-04
Epoch 21/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1705s[0m 31s/step - accuracy: 0.2480 - loss: 2.7800 - val_accuracy: 0.3173 - val_loss: 2.7520 - learning_rate: 1.0000e-04
Epoch 22/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 4s/step - accuracy: 0.2514 - loss: 2.7607 - val_accuracy: 0.3413 - val_loss: 2.7255 - learning_rate: 1.0000e-04
Epoch 23/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 5s/step - accuracy: 0.2537 - loss: 2.7403 - val_accuracy: 0.3387 - val_loss: 2.7055 - learning_rate: 1.0000e-04
Epoch 24/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3s/step - accuracy: 0.2521 - loss: 2.7091



[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m237s[0m 4s/step - accuracy: 0.2389 - loss: 2.7280 - val_accuracy: 0.3653 - val_loss: 2.6848 - learning_rate: 1.0000e-04
Epoch 25/25
[1m55/55[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 5s/step - accuracy: 0.2514 - loss: 2.7004 - val_accuracy: 0.3520 - val_loss: 2.6663 - learning_rate: 1.0000e-04


In [6]:
class_names = train_ds.class_names
NUM_CLASSES = len(class_names)
print(class_names)

['airplane', 'bed', 'bench', 'bicycle', 'bird', 'bottle', 'bowl', 'bus', 'cake', 'car', 'cat', 'chair', 'couch', 'cow', 'cup', 'dog', 'elephant', 'horse', 'motorcycle', 'person', 'pizza', 'potted plant', 'stop sign', 'traffic light', 'truck']


In [7]:
import numpy as np
import time
import json
import os
from sklearn.metrics import classification_report, confusion_matrix, precision_recall_fscore_support

def evaluate_and_collect_metrics(model, model_name, test_ds, class_names, weights_path=None):
    # If you saved best weights, load them
    if weights_path is not None and os.path.exists(weights_path):
        model.load_weights(weights_path)
        print(f"✅ Loaded best weights from {weights_path}")

    y_true = []
    y_pred = []
    y_pred_probs = []

    # ----- measure inference time -----
    total_time = 0.0
    total_images = 0

    for images, labels in test_ds:
        images_np = images.numpy()
        batch_size = images_np.shape[0]

        start = time.perf_counter()
        probs = model.predict(images_np, verbose=0)
        end = time.perf_counter()

        total_time += (end - start)
        total_images += batch_size

        preds = np.argmax(probs, axis=1)

        y_true.extend(labels.numpy())
        y_pred.extend(preds)
        y_pred_probs.append(probs)

    y_true = np.array(y_true)
    y_pred = np.array(y_pred)
    y_pred_probs = np.concatenate(y_pred_probs, axis=0)

    # ----- basic metrics -----
    acc = (y_true == y_pred).mean()

    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average="weighted", zero_division=0
    )

    # ----- top-5 accuracy -----
    top5_correct = 0
    for i, label in enumerate(y_true):
        top5 = np.argsort(y_pred_probs[i])[-5:]
        if label in top5:
            top5_correct += 1
    top5_acc = top5_correct / len(y_true)

    # ----- inference time -----
    avg_time_per_image = total_time / total_images  # seconds
    imgs_per_second = 1.0 / avg_time_per_image if avg_time_per_image > 0 else 0.0

    # ----- model size -----
    # Save weights temporarily to compute size
    temp_weights = f"{model_name}_temp_for_size.weights.h5" 
    model.save_weights(temp_weights)
    size_mb = os.path.getsize(temp_weights) / (1024 * 1024)
    os.remove(temp_weights)

    # ----- classification report & confusion matrix (for plots) -----
    print(f"\n=== {model_name.upper()} – Classification Report ===")
    print(classification_report(y_true, y_pred, target_names=class_names, zero_division=0))

    cm = confusion_matrix(y_true, y_pred)
    print(f"\nConfusion matrix shape: {cm.shape}")

    metrics = {
        "model_name": model_name,
        "accuracy": float(acc),
        "precision_weighted": float(precision),
        "recall_weighted": float(recall),
        "f1_weighted": float(f1),
        "top5_accuracy": float(top5_acc),
        "avg_inference_time_sec_per_image": float(avg_time_per_image),
        "images_per_second": float(imgs_per_second),
        "model_size_mb": float(size_mb),
        "num_parameters": int(model.count_params()),
    }
    return metrics, cm

In [8]:
vgg_metrics, vgg_cm = evaluate_and_collect_metrics(
    vgg16_model, "vgg16", test_ds, class_names, "vgg16_best.h5"
)
with open("vgg16_metrics.json", "w") as f:
    json.dump(vgg_metrics, f, indent=2)

✅ Loaded best weights from vgg16_best.h5

=== VGG16 – Classification Report ===
               precision    recall  f1-score   support

     airplane       0.57      0.53      0.55        15
          bed       0.29      0.33      0.31        15
        bench       0.00      0.00      0.00        15
      bicycle       0.62      0.33      0.43        15
         bird       0.20      0.60      0.30        15
       bottle       0.24      0.47      0.32        15
         bowl       0.19      0.27      0.22        15
          bus       0.23      0.47      0.30        15
         cake       0.56      0.33      0.42        15
          car       0.08      0.20      0.12        15
          cat       0.29      0.33      0.31        15
        chair       0.43      0.40      0.41        15
        couch       0.00      0.00      0.00        15
          cow       0.26      0.33      0.29        15
          cup       1.00      0.13      0.24        15
          dog       0.67      0.13     

In [None]:
# 2.2: Model 2 - ResNet50
def build_resnet50_model():
    inputs = keras.Input(shape=(*IMG_SIZE, 3))
    x = data_augmentation(inputs)
    x = normalization(x)

    base_model = keras.applications.ResNet50(
        include_top=False,
        weights="imagenet",
        input_tensor=x
    )

    # Freeze all, then unfreeze last 20 layers
    for layer in base_model.layers:
        layer.trainable = False
    for layer in base_model.layers[-20:]:
        layer.trainable = True

    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.5)(x)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x)

    model = keras.Model(inputs, outputs, name="ResNet50_smartvision")
    return model

resnet_model = build_resnet50_model()
history_resnet = compile_and_train(resnet_model, "resnet50", train_ds, val_ds, epochs=25, lr=1e-4)

In [None]:
# 2.3: Model 3 - MobileNetV2

def build_mobilenetv2_model():
    inputs = keras.Input(shape=(*IMG_SIZE, 3))
    x = data_augmentation(inputs)
    x = normalization(x)

    base_model = keras.applications.MobileNetV2(
        include_top=False,
        weights="imagenet",
        input_tensor=x
    )
    base_model.trainable = False  # keep it light & fast

    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.Dense(128, activation="relu")(x)
    x = layers.Dropout(0.3)(x)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x)

    model = keras.Model(inputs, outputs, name="MobileNetV2_smartvision")
    return model

mobilenet_model = build_mobilenetv2_model()
history_mobilenet = compile_and_train(mobilenet_model, "mobilenetv2", train_ds, val_ds, epochs=20, lr=1e-4)

In [None]:
# 2.4: Model 4 - EfficientNetB0

from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy("mixed_float16")   # for GPU speed

def build_efficientnetb0_model():
    inputs = keras.Input(shape=(*IMG_SIZE, 3))
    x = data_augmentation(inputs)
    x = normalization(x)

    base_model = keras.applications.EfficientNetB0(
        include_top=False,
        weights="imagenet",
        input_tensor=x
    )

    # Fine-tune: unfreeze some top layers
    for layer in base_model.layers[:-30]:
        layer.trainable = False
    for layer in base_model.layers[-30:]:
        layer.trainable = True

    x = layers.GlobalAveragePooling2D()(base_model.output)
    x = layers.BatchNormalization()(x)
    x = layers.Dense(256, activation="relu")(x)
    x = layers.Dropout(0.4)(x)
    outputs = layers.Dense(NUM_CLASSES, activation="softmax", dtype="float32")(x)  # force float32 at output

    model = keras.Model(inputs, outputs, name="EfficientNetB0_smartvision")
    return model

effnet_model = build_efficientnetb0_model()
history_effnet = compile_and_train(effnet_model, "efficientnetb0", train_ds, val_ds, epochs=30, lr=5e-5)

In [None]:
# 2.5: Model Comparison & Selection

from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

def evaluate_on_test(model, test_ds, model_name):
    y_true = []
    y_pred = []

    for images, labels in test_ds:
        preds = model.predict(images)
        y_true.extend(labels.numpy())
        y_pred.extend(np.argmax(preds, axis=1))

    print(f"\n=== {model_name} TEST REPORT ===")
    print(classification_report(y_true, y_pred, target_names=class_names))

    cm = confusion_matrix(y_true, y_pred)
    plt.figure(figsize=(10, 8))
    sns.heatmap(cm, annot=False, cmap="Blues",
                xticklabels=class_names,
                yticklabels=class_names)
    plt.title(f"{model_name} - Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.show()

# Load best weights if needed and evaluate
vgg16_model.load_weights("vgg16_best.h5")
resnet_model.load_weights("resnet50_best.h5")
mobilenet_model.load_weights("mobilenetv2_best.h5")
effnet_model.load_weights("efficientnetb0_best.h5")

evaluate_on_test(vgg16_model, test_ds, "VGG16")
evaluate_on_test(resnet_model, test_ds, "ResNet50")
evaluate_on_test(mobilenet_model, test_ds, "MobileNetV2")
evaluate_on_test(effnet_model, test_ds, "EfficientNetB0")