In [1]:
import os
import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, optimizers
import numpy as np
import pathlib
import cv2
import json

2025-10-13 20:34:21.184477: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-13 20:34:21.402956: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-10-13 20:34:21.556083: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1760367861.703861    9031 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1760367861.757145    9031 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1760367862.097743    9031 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linkin

In [4]:
# ---------------------------
# Configuration: every value a reader might want to tune lives here.
# ---------------------------
DATA_DIR = "../data/raw/ASL_Dataset/"   # <<-- change this to your dataset path
IMG_SIZE = 64                 # images are resized to IMG_SIZE x IMG_SIZE
BATCH_SIZE = 32
AUTOTUNE = tf.data.AUTOTUNE   # let tf.data choose parallelism / prefetch depth
EPOCHS = 40                   # upper bound on training epochs
PATIENCE = 6                  # early-stopping patience, in epochs
# Kept identical to the path the training cell saves to and that
# convert_to_tflite() reads by default, so the three never disagree.
MODEL_SAVE_PATH = "../model/gesture_model.h5"
SEED = 123

# SEED is passed explicitly to the dataset split; seeding Python, NumPy and
# TensorFlow globally as well makes weight initialisation, dropout and the
# random augmentation layers repeatable across "Restart & Run All".
tf.keras.utils.set_random_seed(SEED)


def _load_split(subset):
    """Load one side of the 80/20 train/validation split found under DATA_DIR.

    Args:
        subset: "training" or "validation", forwarded to Keras unchanged.

    Returns:
        A batched tf.data.Dataset of (images, one-hot labels) whose
        `class_names` attribute lists the inferred class folders.
    """
    # tf.keras.utils is the supported home of this loader; the
    # tf.keras.preprocessing namespace is deprecated.
    return tf.keras.utils.image_dataset_from_directory(
        DATA_DIR,
        labels="inferred",
        label_mode="categorical",   # one-hot labels
        batch_size=BATCH_SIZE,
        image_size=(IMG_SIZE, IMG_SIZE),
        # shuffle and seed must be the same for both subsets: Keras requires
        # a seed when shuffling is combined with validation_split so that the
        # training and validation subsets do not overlap.
        shuffle=True,
        seed=SEED,
        validation_split=0.2,
        subset=subset,
    )


train_ds = _load_split("training")
val_ds = _load_split("validation")

# Class order follows the dataset's `class_names`; the softmax output indices
# of the model map back to labels through this list.
class_names = train_ds.class_names
NUM_CLASSES = len(class_names)
print("Classes:", class_names, "NUM_CLASSES=", NUM_CLASSES)


def rgb_to_gray_and_scale(images, labels):
    """Collapse RGB batches to one channel and rescale pixels to [0, 1].

    Args:
        images: image batch, expected as float32 in [0, 255] with a trailing
            RGB channel axis (what the dataset loader above yields).
        labels: passed through untouched.

    Returns:
        (grayscale_images, labels) where the images keep their batch/height/
        width axes and have a single channel.
    """
    gray = tf.image.rgb_to_grayscale(images)       # channel axis 3 -> 1
    scaled = tf.cast(gray, tf.float32) / 255.0     # [0, 255] -> [0, 1]
    return scaled, labels


# Random, label-preserving perturbations applied to training batches only.
# Rotation, zoom and translation factors are fractions: rotation is a fraction
# of a full turn, zoom/translation are fractions of the image size.
data_augmentation = tf.keras.Sequential(
    [
        layers.RandomFlip(mode="horizontal"),
        layers.RandomRotation(factor=0.08),          # 0.08 * 360 deg ~= +-29 degrees
        layers.RandomZoom(height_factor=0.08),       # +-8%; width follows to keep aspect ratio
        layers.RandomTranslation(height_factor=0.05, width_factor=0.05),
        layers.RandomContrast(factor=0.1),
    ],
    name="data_augmentation",
)

def augment(images, labels):
    """Apply the random augmentation stack to a batch of images.

    Args:
        images: batch of images to perturb.
        labels: passed through untouched.

    Returns:
        (augmented_images, labels).
    """
    # training=True is passed explicitly: the random layers are identity
    # functions in inference mode, and inside a tf.data map there is no
    # fit() call context to switch them on.
    return data_augmentation(images, training=True), labels


# Training pipeline: grayscale + [0, 1] scaling, then random augmentation.
# The helpers defined above are reused instead of repeating them as lambdas.
train_ds = (
    train_ds
    .map(rgb_to_gray_and_scale, num_parallel_calls=AUTOTUNE)
    .map(augment, num_parallel_calls=AUTOTUNE)
    # RandomContrast stretches pixels around the image mean, so values can
    # leave [0, 1] on already-scaled data; clamp so training inputs stay in
    # the same range as the validation and inference inputs.
    .map(lambda x, y: (tf.clip_by_value(x, 0.0, 1.0), y), num_parallel_calls=AUTOTUNE)
    .prefetch(AUTOTUNE)
)

# Validation pipeline: identical deterministic preprocessing, no augmentation.
val_ds = (
    val_ds
    .map(rgb_to_gray_and_scale, num_parallel_calls=AUTOTUNE)
    .prefetch(AUTOTUNE)
)

Found 165782 files belonging to 28 classes.
Using 132626 files for training.
Found 165782 files belonging to 28 classes.
Using 33156 files for validation.
Classes: ['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'Nothing', 'O', 'P', 'Q', 'R', 'S', 'Space', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z'] NUM_CLASSES= 28


In [None]:
def build_light_cnn(input_shape=(IMG_SIZE, IMG_SIZE, 1), num_classes=NUM_CLASSES):
    """Build the small three-stage CNN used for gesture classification.

    Args:
        input_shape: (height, width, channels) of one input image.
        num_classes: size of the softmax output layer.

    Returns:
        An uncompiled Keras functional model named "LightweightGestureCNN".
    """
    inputs = layers.Input(shape=input_shape)

    # Three conv stages with 16 -> 32 -> 64 filters. Each stage is
    # Conv(3x3, no activation) -> BatchNorm -> ReLU -> 2x2 max-pool.
    features = inputs
    for n_filters in (16, 32, 64):
        features = layers.Conv2D(n_filters, (3, 3), padding="same", activation=None)(features)
        features = layers.BatchNormalization()(features)
        features = layers.Activation("relu")(features)
        features = layers.MaxPooling2D((2, 2))(features)

    # Classifier head: flatten, one hidden layer with dropout, softmax output.
    features = layers.Flatten()(features)
    features = layers.Dense(128, activation="relu")(features)
    features = layers.Dropout(0.4)(features)
    outputs = layers.Dense(num_classes, activation="softmax")(features)

    return models.Model(inputs, outputs, name="LightweightGestureCNN")

model = build_light_cnn()
model.compile(optimizer=optimizers.Adam(learning_rate=1e-3),
              loss="categorical_crossentropy",
              metrics=["accuracy"])

model.summary()

# ---------------------------
# Output locations
# ---------------------------
# Everything that can fail for filesystem reasons is done BEFORE the long
# training run, so a missing directory is discovered now rather than after
# the final epoch.
MODEL_SAVE_PATH = "../model/gesture_model.h5"
os.makedirs(os.path.dirname(MODEL_SAVE_PATH), exist_ok=True)

# Save the class names mapping up front: the checkpoint callback writes
# best_gesture_model.h5 during training, and that file is only usable for
# inference together with this mapping, even if training is interrupted.
with open("class_names.json", "w") as f:
    json.dump(class_names, f)

# ---------------------------
# Callbacks
# ---------------------------
# Note: the checkpoint tracks val_accuracy while early stopping and the LR
# schedule track val_loss, so "best" checkpoint and restored weights may come
# from different epochs.
checkpoint_cb = callbacks.ModelCheckpoint("best_gesture_model.h5", save_best_only=True, monitor="val_accuracy", mode="max")
earlystop_cb = callbacks.EarlyStopping(monitor="val_loss", patience=PATIENCE, restore_best_weights=True)
reduce_lr_cb = callbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=3, min_lr=1e-6)

# ---------------------------
# Train
# ---------------------------
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=[checkpoint_cb, earlystop_cb, reduce_lr_cb]
)

# ---------------------------
# Save final model
# ---------------------------
model.save(MODEL_SAVE_PATH)
print(f"Saved model to {MODEL_SAVE_PATH}")

Epoch 1/40
[1m   1/4145[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m4:59:34[0m 4s/step - accuracy: 0.0312 - loss: 4.3408

2025-10-13 20:40:45.416202: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25362432 exceeds 10% of free system memory.
2025-10-13 20:40:45.416284: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25362432 exceeds 10% of free system memory.


[1m   2/4145[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m31:45[0m 460ms/step - accuracy: 0.0469 - loss: 4.6280

2025-10-13 20:40:45.946039: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25362432 exceeds 10% of free system memory.
2025-10-13 20:40:45.946436: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25362432 exceeds 10% of free system memory.


[1m   3/4145[0m [37m━━━━━━━━━━━━━━━━━━━━[0m [1m26:26[0m 383ms/step - accuracy: 0.0486 - loss: 4.8607

2025-10-13 20:40:46.277621: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 25362432 exceeds 10% of free system memory.


[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step - accuracy: 0.2037 - loss: 2.5118



[1m4145/4145[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m487s[0m 116ms/step - accuracy: 0.2847 - loss: 2.1351 - val_accuracy: 0.6419 - val_loss: 1.0181 - learning_rate: 0.0010
Epoch 2/40
[1m 596/4145[0m [32m━━[0m[37m━━━━━━━━━━━━━━━━━━[0m [1m6:37[0m 112ms/step - accuracy: 0.4111 - loss: 1.6660

In [None]:
def convert_to_tflite(h5_path="../model/gesture_model.h5", tflite_path="../model/gesture_model.tflite"):
    """Convert a saved Keras model file into a TFLite flatbuffer on disk.

    Args:
        h5_path: path of the Keras model file to load.
        tflite_path: destination path for the converted model.

    Returns:
        None. The converted model is written to `tflite_path` and a
        confirmation line is printed.
    """
    import tensorflow as tf

    keras_model = tf.keras.models.load_model(h5_path)

    tflite_converter = tf.lite.TFLiteConverter.from_keras_model(keras_model)
    # Turn on the converter's default optimization set.
    tflite_converter.optimizations = [tf.lite.Optimize.DEFAULT]
    flatbuffer = tflite_converter.convert()

    with open(tflite_path, "wb") as out_file:
        out_file.write(flatbuffer)
    print("Saved TFLite model to", tflite_path)


In [None]:
def run_webcam_inference(model_path="best_gesture_model.h5", class_map_path="class_names.json"):
    """Classify live webcam frames and overlay the top prediction.

    Each frame is converted to grayscale, resized to the model's input size,
    scaled to [0, 1] and classified; the predicted label and probability are
    drawn on the original frame. Press 'q' in the window to stop.

    Args:
        model_path: path of the Keras model file to load.
        class_map_path: path of the JSON list mapping output index -> label.

    Returns:
        None. Prints an error and returns early if the webcam cannot be opened.
    """
    # Local imports keep the function usable when copied into a plain script.
    import tensorflow as tf
    import cv2
    import numpy as np
    import json

    model = tf.keras.models.load_model(model_path)
    with open(class_map_path, "r") as f:
        classes = json.load(f)

    # Take the frame size from the loaded model instead of a notebook global,
    # so inference works on a fresh kernel and with models of other sizes.
    # input_shape is (batch, height, width, channels) for this model.
    in_height, in_width = model.input_shape[1:3]

    cap = cv2.VideoCapture(0)  # 0 = default webcam
    if not cap.isOpened():
        print("Error: Could not open webcam.")
        return

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                break

            # Preprocess frame: ROI cropping optional, here the entire frame is resized.
            frame_gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
            # cv2.resize takes (width, height).
            frame_resized = cv2.resize(frame_gray, (in_width, in_height))
            img_in = frame_resized.astype("float32") / 255.0
            img_in = np.expand_dims(img_in, axis=(0, -1))  # shape (1, H, W, 1)

            # Call the model directly: for a single small batch inside a loop
            # this avoids predict()'s per-call overhead and progress-bar output.
            probs = np.asarray(model(img_in, training=False))[0]
            top_idx = int(np.argmax(probs))
            top_prob = probs[top_idx]
            label = classes[top_idx]

            # Draw text on original frame
            text = f"{label}: {top_prob:.2f}"
            cv2.putText(frame, text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1.0, (0,255,0), 2)
            cv2.imshow("Gesture Recognition", frame)

            # quit
            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
    finally:
        # Always free the camera and close the window, including when the
        # loop is left through an exception or a kernel interrupt.
        cap.release()
        cv2.destroyAllWindows()