In [1]:
# Mixed precision for speed + lower VRAM
import tensorflow as tf
tf.keras.mixed_precision.set_global_policy("mixed_float16")

print("Mixed precision:", tf.keras.mixed_precision.global_policy())


Mixed precision: <DTypePolicy "mixed_float16">


In [None]:
pip install mediapipe==0.10.21


Collecting mediapipe==0.10.21
  Downloading mediapipe-0.10.21-cp312-cp312-manylinux_2_28_x86_64.whl.metadata (9.7 kB)
Collecting numpy<2 (from mediapipe==0.10.21)
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m61.0/61.0 kB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting protobuf<5,>=4.25.3 (from mediapipe==0.10.21)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.21)
  Downloading sounddevice-0.5.3-py3-none-any.whl.metadata (1.6 kB)
INFO: pip is looking at multiple versions of jax to determine which version is compatible with other requirements. This could take a while.
Collecting jax (from mediapipe==0.10.21)
  Downloading jax-0.8.2-py3-none-any.whl.metadata (13 kB)
Collecting jaxlib (from media

In [3]:
import os, cv2, json, random
import numpy as np
from pathlib import Path
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import layers, models

import mediapipe as mp


AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

In [4]:
DATASET_ROOT = r"D:\Samvad_Setu_final\datasets\WLASL"
VIDEO_DIR = os.path.join(DATASET_ROOT, "videos")
JSON_PATH = os.path.join(DATASET_ROOT, "nslt_2000.json")

CACHE_DIR = os.path.join(DATASET_ROOT, "cache_fast")
MODEL_DIR = r"D:\Samvad_Setu_final\notebooks\Saved_models"

os.makedirs(CACHE_DIR, exist_ok=True)
os.makedirs(MODEL_DIR, exist_ok=True)

IMG_SIZE = 160
MAX_FRAMES = 16
BATCH_SIZE = 4
EPOCHS_STAGE1 = 12
EPOCHS_STAGE2 = 20
BASE_LR = 3e-4


In [5]:
with open(JSON_PATH, "r") as f:
    data = json.load(f)

video_meta = {}
for vid, meta in data.items():
    vid = vid.zfill(5)
    label = meta["action"][0]
    subset = meta["subset"]
    video_meta[vid] = (label, subset)

num_classes = len(set(v[0] for v in video_meta.values()))
print("Classes:", num_classes)


Classes: 2000


In [6]:
mp_hands = mp.solutions.hands
hands = mp_hands.Hands(
    static_image_mode=False,
    max_num_hands=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

def extract_hand_skeleton(frame):
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    res = hands.process(rgb)
    if not res.multi_hand_landmarks:
        return np.zeros((42,), dtype=np.float16)

    coords = []
    for lm in res.multi_hand_landmarks[0].landmark:
        coords.extend([lm.x, lm.y])

    return np.array(coords, dtype=np.float16)


In [7]:
def process_video_ultrafast(video_path):
    cap = cv2.VideoCapture(video_path)
    total = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

    if total <= 0:
        cap.release()
        return None, None

    idxs = np.linspace(0, total-1, MAX_FRAMES, dtype=int)

    frames, skeletons = [], []

    for i, idx in enumerate(idxs):
        cap.set(cv2.CAP_PROP_POS_FRAMES, idx)
        ret, frame = cap.read()
        if not ret:
            break

        frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        frames.append(frame / 255.0)

        # Skeleton only every alternate frame (latency win)
        if i % 2 == 0:
            skeletons.append(extract_hand_skeleton(frame))
        else:
            skeletons.append(np.zeros((42,), dtype=np.float16))

    cap.release()

    if len(frames) != MAX_FRAMES:
        return None, None

    return (
        np.array(frames, dtype=np.float16),
        np.array(skeletons, dtype=np.float16)
    )


In [8]:
def ultrafast_cache(video_meta, limit=None):
    cached = set(f.replace(".npz","") for f in os.listdir(CACHE_DIR))
    todo = [vid for vid in video_meta if vid not in cached]

    if limit:
        todo = todo[:limit]

    print(f"Cached: {len(cached)} | Remaining: {len(todo)}")

    for vid in tqdm(todo, desc="Ultra-fast caching"):
        vp = os.path.join(VIDEO_DIR, f"{vid}.mp4")
        if not os.path.exists(vp):
            continue

        frames, skel = process_video_ultrafast(vp)
        if frames is None:
            continue

        label, _ = video_meta[vid]
        np.savez_compressed(
            os.path.join(CACHE_DIR, f"{vid}.npz"),
            frames=frames,
            skeleton=skel,
            label=label
        )


In [12]:
ultrafast_cache(video_meta, limit=2000)


Cached: 1102 | Remaining: 2000


Ultra-fast caching: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2000/2000 [06:24<00:00,  5.20it/s]


In [13]:
def load_npz(path):
    d = np.load(path.numpy().decode())
    return d["frames"], d["skeleton"], d["label"]

def tf_loader(path):
    f, s, l = tf.py_function(
        load_npz, [path],
        [tf.float16, tf.float16, tf.int32]
    )
    f.set_shape((MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3))
    s.set_shape((MAX_FRAMES, 42))
    l.set_shape(())
    return (f, s), l


In [14]:
paths = [os.path.join(CACHE_DIR, f) for f in os.listdir(CACHE_DIR)]
random.shuffle(paths)

split1 = int(0.7 * len(paths))
split2 = int(0.85 * len(paths))

train_p = paths[:split1]
val_p   = paths[split1:split2]
test_p  = paths[split2:]

train_ds = (
    tf.data.Dataset.from_tensor_slices(train_p)
    .map(tf_loader, num_parallel_calls=tf.data.AUTOTUNE)
    .shuffle(1024)
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)

val_ds = (
    tf.data.Dataset.from_tensor_slices(val_p)
    .map(tf_loader, num_parallel_calls=tf.data.AUTOTUNE)
    .batch(BATCH_SIZE)
    .repeat()
    .prefetch(tf.data.AUTOTUNE)
)

STEPS_PER_EPOCH = max(1, len(train_p)//BATCH_SIZE)
VAL_STEPS = max(1, len(val_p)//BATCH_SIZE)

print("Steps:", STEPS_PER_EPOCH, VAL_STEPS)


Steps: 199 42


In [16]:
video_in = layers.Input((MAX_FRAMES, IMG_SIZE, IMG_SIZE, 3))

base = tf.keras.applications.MobileNetV3Small(
    include_top=False, weights="imagenet", pooling="avg"
)
base.trainable = False

x = layers.TimeDistributed(base)(video_in)
x = layers.GRU(128)(x)
x = layers.Dense(256, activation="relu")(x)
out = layers.Dense(num_classes, activation="softmax", dtype="float32")(x)

stage1_model = models.Model(video_in, out)


Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v3/weights_mobilenet_v3_small_224_1.0_float_no_top_v2.h5
[1m4334752/4334752[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m3s[0m 1us/step


In [None]:

stage1_model.compile(
    optimizer=tf.keras.optimizers.AdamW(BASE_LR),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy", tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5)]
)

stage1_model.fit(
    train_ds.map(lambda x,y:(x[0],y)),
    validation_data=val_ds.map(lambda x,y:(x[0],y)),
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VAL_STEPS,
    epochs=30
)

Epoch 1/30
[1m199/199[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2219s[0m 9s/step - accuracy: 0.0038 - loss: 7.4799 - sparse_top_k_categorical_accuracy: 0.0088 - val_accuracy: 0.0060 - val_loss: 7.2689 - val_sparse_top_k_categorical_accuracy: 0.0119
Epoch 2/30
[1m199/199[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m2812s[0m 14s/step - accuracy: 0.0025 - loss: 6.5041 - sparse_top_k_categorical_accuracy: 0.0240 - val_accuracy: 0.0000e+00 - val_loss: 7.5552 - val_sparse_top_k_categorical_accuracy: 0.0060
Epoch 3/30
[1m  3/199[0m [37m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [1m17:44[0m 5s/step - accuracy: 0.0000e+00 - loss: 6.6456 - sparse_top_k_categorical_accuracy: 0.0000e+00

In [None]:
class GraphConv(layers.Layer):
    def __init__(self, units):
        super().__init__()
        self.fc = layers.Dense(units)

    def call(self, x):
        return self.fc(x)

    def compute_output_shape(self, input_shape):
        # input_shape will be (batch_size, num_nodes, feature_dim)
        # The Dense layer transforms the last dimension
        return (input_shape[0], input_shape[1], self.fc.units)

In [None]:
skel_in = layers.Input((MAX_FRAMES, 42))
s = layers.Reshape((MAX_FRAMES, 21, 2))(skel_in)
s = layers.TimeDistributed(GraphConv(64))(s)
s = layers.TimeDistributed(layers.GlobalAveragePooling1D())(s)
s = layers.GRU(64)(s)

fusion = layers.Concatenate()([stage1_model.output, s])
fusion = layers.Dense(256, activation="relu")(fusion)
out = layers.Dense(num_classes, activation="softmax", dtype="float32")(fusion)

final_model = models.Model(
    inputs=[video_in, skel_in],
    outputs=out
)

In [None]:
final_model.compile(
    optimizer=tf.keras.optimizers.AdamW(BASE_LR),
    loss="sparse_categorical_crossentropy",
    metrics=[
        "accuracy",
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=5, name="top5")
    ]
)

callbacks = [
    tf.keras.callbacks.ModelCheckpoint(
        os.path.join(MODEL_DIR, "epoch_{epoch:03d}.keras"),
        save_best_only=False,
        verbose=1
    ),
    tf.keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.3)
]

final_model.fit(
    train_ds,
    validation_data=val_ds,
    steps_per_epoch=STEPS_PER_EPOCH,
    validation_steps=VAL_STEPS,
    epochs=EPOCHS_STAGE2,
    callbacks=callbacks
)

Epoch 1/20
[1m170/170[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 447ms/step - accuracy: 0.0000e+00 - loss: 7.5813 - top5: 0.0048
Epoch 1: saving model to /content/drive/MyDrive/SAMVAD_SETU/Saved-models/epoch_001.keras
[1m170/170[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m236s[0m 734ms/step - accuracy: 0.0000e+00 - loss: 7.5809 - top5: 0.0048 - val_accuracy: 0.0000e+00 - val_loss: 7.1641 - val_top5: 0.0069 - learning_rate: 3.0000e-04
Epoch 2/20
[1m170/170[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m0s[0m 530ms/step - accuracy: 0.0019 - loss: 6.5346 - top5: 0.0188
Epoch 2: saving model to /content/drive/MyDrive/SAMVAD_SETU/Saved-models/epoch_002.keras
[1m170/170[0m [32m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m[37m[0m [1m112s[0m 659ms/step - accuracy: 0.0019 - loss: 6.5342 - top5: 0.0188 - val_accuracy: 0.0000e+00 - va

<keras.src.callbacks.history.History at 0x78183897eb40>

In [None]:
final_model.save("/content/drive/MyDrive/SAMVAD_SETU/Saved-models/wlasl/wlasl-final.keras")

In [None]:
from IPython.display import display, Javascript
from google.colab.output import eval_js
import base64, cv2, numpy as np

def init_webcam():
    display(Javascript("""
    async function init() {
      const video = document.createElement('video');
      video.setAttribute('autoplay', '');
      video.setAttribute('playsinline', '');
      document.body.appendChild(video);

      const stream = await navigator.mediaDevices.getUserMedia({video: true});
      video.srcObject = stream;

      // wait until video is ready
      await new Promise(resolve => video.onloadedmetadata = resolve);

      window._video = video;
    }
    init();
    """))

def capture_frame():
    data = eval_js("""
    (() => {
      if (!window._video) return null;
      const canvas = document.createElement('canvas');
      canvas.width = _video.videoWidth;
      canvas.height = _video.videoHeight;
      const ctx = canvas.getContext('2d');
      ctx.drawImage(_video, 0, 0);
      return canvas.toDataURL('image/jpeg');
    })()
    """)
    if data is None:
        return None

    img_bytes = base64.b64decode(data.split(',')[1])
    return cv2.imdecode(np.frombuffer(img_bytes, np.uint8), cv2.IMREAD_COLOR)


In [None]:
init_webcam()
print("Webcam initializing... wait 3‚Äì5 seconds, then run next cell")


<IPython.core.display.Javascript object>

Webcam initializing... wait 3‚Äì5 seconds, then run next cell


In [None]:
buf_f, buf_s = [], []

print("Manual testing started (Ctrl+C to stop)")

try:
    while True:
        frame = capture_frame()
        if frame is None:
            continue

        frame = cv2.flip(frame, 1)

        fr = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        buf_f.append(fr / 255.0)
        buf_s.append(extract_hand_skeleton(fr))

        # Draw ROI
        h, w, _ = frame.shape
        size = 300
        x1, y1 = w//2 - size//2, h//2 - size//2
        x2, y2 = x1 + size, y1 + size
        cv2.rectangle(frame, (x1,y1), (x2,y2), (0,255,0), 2)

        if len(buf_f) == MAX_FRAMES:
            video = np.expand_dims(buf_f, 0)
            skel  = np.expand_dims(buf_s, 0)

            if np.sum(video) < 1e-3:
                pred = final_model.predict(
                    [np.zeros_like(video), skel],
                    verbose=0
                )
            else:
                pred = final_model.predict(
                    [video, skel],
                    verbose=0
                )

            cls = int(np.argmax(pred))
            conf = float(np.max(pred)) * 100

            cv2.putText(
                frame,
                f"Pred: {cls} ({conf:.1f}%)",
                (20,40),
                cv2.FONT_HERSHEY_SIMPLEX,
                1,
                (0,255,0),
                2
            )

            buf_f.pop(0)
            buf_s.pop(0)

        display(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

except KeyboardInterrupt:
    print("Stopped manual testing")


Manual testing started (Ctrl+C to stop)
Stopped manual testing


In [None]:
from IPython.display import clear_output
import time

MAX_FRAMES = 12        # üî• faster feedback
CONF_THRESH = 0.30    # ignore junk predictions

buf_f, buf_s = [], []

print("üü¢ Manual testing started (Ctrl+C to stop)")
time.sleep(2)

try:
    while True:
        frame = capture_frame()
        if frame is None:
            continue

        frame = cv2.flip(frame, 1)

        # ---------- Preprocess ----------
        fr = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
        fr = fr / 255.0
        sk = extract_hand_skeleton(fr)

        buf_f.append(fr)
        buf_s.append(sk)

        # Keep rolling buffer
        if len(buf_f) > MAX_FRAMES:
            buf_f.pop(0)
            buf_s.pop(0)

        # ---------- UI ----------
        h, w, _ = frame.shape
        cv2.rectangle(
            frame,
            (w//2-150, h//2-150),
            (w//2+150, h//2+150),
            (0,255,0), 2
        )

        cv2.putText(
            frame,
            f"Buffer: {len(buf_f)}/{MAX_FRAMES}",
            (20,30),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.9, (255,255,0), 2
        )

        # ---------- Predict ----------
        if len(buf_f) == MAX_FRAMES:
            video = np.expand_dims(np.array(buf_f), 0)
            skel  = np.expand_dims(np.array(buf_s), 0)

            # Fallback logic
            if np.sum(video) < 1e-3 or np.sum(skel) < 1e-3:
                pred = final_model.predict(
                    [np.zeros_like(video), skel],
                    verbose=0
                )
                fallback = "Skeleton"
            else:
                pred = final_model.predict(
                    [video, skel],
                    verbose=0
                )
                fallback = "Video+Skeleton"

            cls = int(np.argmax(pred))
            conf = float(np.max(pred))

            if conf > CONF_THRESH:
                label = ID2GLOSS.get(cls, f"Class-{cls}")
                txt = f"{label}  {conf*100:.1f}%  [{fallback}]"
            else:
                txt = "Low confidence..."

            cv2.putText(
                frame, txt,
                (20,70),
                cv2.FONT_HERSHEY_SIMPLEX,
                0.9, (0,255,0), 2
            )

        # ---------- Display ----------
        clear_output(wait=True)
        display(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))

except KeyboardInterrupt:
    print("üõë Manual testing stopped")


üü¢ Manual testing started (Ctrl+C to stop)
üõë Manual testing stopped


In [None]:
final_model.input


[<KerasTensor shape=(None, 16, 160, 160, 3), dtype=float32, sparse=False, ragged=False, name=keras_tensor>,
 <KerasTensor shape=(None, 16, 42), dtype=float32, sparse=False, ragged=False, name=keras_tensor_357>]

In [None]:
sk = extract_hand_skeleton(fr)
print(np.sum(sk))


NameError: name 'fr' is not defined

In [None]:
frame = capture_frame()
frame = cv2.flip(frame, 1)

fr = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
fr = fr / 255.0

sk = extract_hand_skeleton(fr)

print("Skeleton sum:", np.sum(sk))


error: OpenCV(4.11.0) /io/opencv/modules/imgproc/src/resize.cpp:4208: error: (-215:Assertion failed) !ssize.empty() in function 'resize'
