Preprocessing

In [8]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

DATASET_DIR = "D:/SFT_ATTEMPT2/raw_video"  # <-- adjust full path to your dataset
TARGET_WORDS = ["Halo", "Kamu", "Apa", "Dimana", "Duduk"]
IMG_SIZE = 96
FRAMES_PER_VIDEO = 20  # sample fixed length from each video

def load_videos():
    X, y = [], []
    for word in TARGET_WORDS:
        word_path = os.path.join(DATASET_DIR, word)
        for vid_file in os.listdir(word_path):
            vid_path = os.path.join(word_path, vid_file)
            cap = cv2.VideoCapture(vid_path)

            frames = []
            while True:
                ret, frame = cap.read()
                if not ret:
                    break
                frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
                frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame = frame.astype("float32") / 255.0
                frames.append(frame)

            cap.release()

            # Uniformly sample FRAMES_PER_VIDEO frames
            if len(frames) >= FRAMES_PER_VIDEO:
                idxs = np.linspace(0, len(frames)-1, FRAMES_PER_VIDEO, dtype=int)
                frames = [frames[i] for i in idxs]
                X.append(frames)
                y.append(TARGET_WORDS.index(word))

    return np.array(X), np.array(y)

X, y = load_videos()
print("Shape:", X.shape, y.shape)  # (num_samples, FRAMES_PER_VIDEO, 96, 96, 3)

# Split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


Shape: (250, 20, 96, 96, 3) (250,)


In [9]:
from tensorflow.keras import layers, models

FRAMES_PER_VIDEO = X_train.shape[1]  # should be 20
IMG_SIZE = X_train.shape[2]          # should be 96
NUM_CLASSES = len(TARGET_WORDS)

def build_model():
    model = models.Sequential([
        # CNN applied to each frame (via TimeDistributed)
        layers.TimeDistributed(layers.Conv2D(32, (3,3), activation="relu"),
                               input_shape=(FRAMES_PER_VIDEO, IMG_SIZE, IMG_SIZE, 3)),
        layers.TimeDistributed(layers.MaxPooling2D((2,2))),
        layers.TimeDistributed(layers.Conv2D(64, (3,3), activation="relu")),
        layers.TimeDistributed(layers.MaxPooling2D((2,2))),
        layers.TimeDistributed(layers.Flatten()),

        # LSTM across frames
        layers.LSTM(128),
        layers.Dense(64, activation="relu"),
        layers.Dense(NUM_CLASSES, activation="softmax")
    ])
    return model

model = build_model()
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
model.summary()


  super().__init__(**kwargs)


In [10]:
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=8
)


Epoch 1/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m71s[0m 2s/step - accuracy: 0.1999 - loss: 1.6835 - val_accuracy: 0.5200 - val_loss: 1.1038
Epoch 2/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 2s/step - accuracy: 0.7295 - loss: 0.8012 - val_accuracy: 1.0000 - val_loss: 0.1872
Epoch 3/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m74s[0m 3s/step - accuracy: 0.9806 - loss: 0.1604 - val_accuracy: 1.0000 - val_loss: 0.0254
Epoch 4/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 3s/step - accuracy: 1.0000 - loss: 0.0171 - val_accuracy: 1.0000 - val_loss: 0.0070
Epoch 5/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 3s/step - accuracy: 1.0000 - loss: 0.0055 - val_accuracy: 1.0000 - val_loss: 0.0033
Epoch 6/20
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m92s[0m 3s/step - accuracy: 1.0000 - loss: 0.0026 - val_accuracy: 1.0000 - val_loss: 0.0021
Epoch 7/20
[1m25/25[0m [32m━━━━━━━━━━

In [11]:
model.save("bisindo_words_model.h5")




In [14]:
class_names = TARGET_WORDS  # same list you used for training


In [15]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Predict on validation set
y_pred_probs = model.predict(X_val)
y_pred = np.argmax(y_pred_probs, axis=1)

# If y_val is already integer encoded, just use it directly
y_true = y_val  

# Report
print(classification_report(y_true, y_pred, target_names=class_names))

# Confusion matrix
cm = confusion_matrix(y_true, y_pred)
print(cm)


[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 442ms/step
              precision    recall  f1-score   support

        Halo       1.00      1.00      1.00        13
        Kamu       1.00      1.00      1.00         7
         Apa       1.00      1.00      1.00        10
      Dimana       1.00      1.00      1.00        11
       Duduk       1.00      1.00      1.00         9

    accuracy                           1.00        50
   macro avg       1.00      1.00      1.00        50
weighted avg       1.00      1.00      1.00        50

[[13  0  0  0  0]
 [ 0  7  0  0  0]
 [ 0  0 10  0  0]
 [ 0  0  0 11  0]
 [ 0  0  0  0  9]]


In [17]:
import cv2
import numpy as np
import tensorflow as tf
from collections import deque

# Load trained model
model = tf.keras.models.load_model("bisindo_words_model.h5")
class_names = ["Halo", "Kamu", "Apa", "Dimana", "Duduk"]

# Parameters
IMG_SIZE = 96
FRAMES_PER_VIDEO = 20
CONF_THRESHOLD = 0.6  # adjust as needed

# Buffer to store recent frames
frame_buffer = deque(maxlen=FRAMES_PER_VIDEO)

def preprocess_frame(frame):
    frame = cv2.resize(frame, (IMG_SIZE, IMG_SIZE))
    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    frame = frame.astype("float32") / 255.0
    return frame

def predict_sequence(frames):
    X_input = np.expand_dims(np.array(frames), axis=0)  # (1, 20, 96, 96, 3)
    probs = model.predict(X_input, verbose=0)
    pred_idx = np.argmax(probs, axis=1)[0]
    confidence = np.max(probs)
    if confidence < CONF_THRESHOLD:
        return "No Gesture", confidence
    return class_names[pred_idx], confidence

# Open webcam
cap = cv2.VideoCapture(0)

print("🎥 Starting real-time gesture recognition. Press 'q' to quit.")

while True:
    ret, frame = cap.read()
    if not ret:
        break

    # Preprocess and store frame
    processed = preprocess_frame(frame)
    frame_buffer.append(processed)

    prediction_text = "Waiting..."
    conf = 0

    # Once we have enough frames, predict
    if len(frame_buffer) == FRAMES_PER_VIDEO:
        pred, conf = predict_sequence(list(frame_buffer))
        prediction_text = f"{pred} ({conf:.2f})"

    # Show on screen
    cv2.putText(frame, prediction_text, (30, 40),
                cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
    cv2.imshow("Real-Time BISINDO", frame)

    # Quit key
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()




🎥 Starting real-time gesture recognition. Press 'q' to quit.


In [12]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from collections import deque

# Load trained model
model = tf.keras.models.load_model("bisindo_words_model.h5")
class_names = ["Halo", "Kamu", "Apa", "Dimana", "Duduk"]

# Parameters
SEQ_LENGTH = 30
CONF_THRESHOLD = 0.8
sentence = []
pred_buffer = deque(maxlen=5)

# MediaPipe hands
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(max_num_hands=1, min_detection_confidence=0.5, min_tracking_confidence=0.5)

# Detect model expected input shape
model_input_shape = model.input_shape  # e.g. (None, 30, 63) or (None, 1890)
print("Model expects input shape:", model_input_shape)

def format_input(keypoints):
    """Format landmark array based on model input shape"""
    flat = np.array(keypoints).flatten()  # shape (63,)
    if len(model_input_shape) == 3:
        # Shape (batch, timesteps, features)
        timesteps, feat = model_input_shape[1], model_input_shape[2]
        frame_data = flat.reshape(1, 1, -1)        # (1, 1, 63)
        return np.repeat(frame_data, timesteps, axis=1)  # (1, 30, 63)
    elif len(model_input_shape) == 2:
        # Shape (batch, features)
        return flat.reshape(1, -1)  # (1, 1890)
    else:
        raise ValueError(f"Unsupported input shape: {model_input_shape}")

cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            keypoints = []
            for lm in hand_landmarks.landmark:
                keypoints.extend([lm.x, lm.y, lm.z])

            # Prepare input for model
            try:
                input_data = format_input(keypoints)
                preds = model.predict(input_data, verbose=0)[0]
                confidence = np.max(preds)
                predicted_label = class_names[np.argmax(preds)]

                # Only accept confident predictions
                if confidence > CONF_THRESHOLD:
                    pred_buffer.append(predicted_label)
                    if len(pred_buffer) == pred_buffer.maxlen and all(p == pred_buffer[0] for p in pred_buffer):
                        sentence.append(predicted_label)
                        pred_buffer.clear()
            except Exception as e:
                cv2.putText(frame, f"ERR {str(e)}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0,0,255), 2)

    # Display prediction and sentence
    if sentence:
        cv2.putText(frame, "Sentence: " + " ".join(sentence[-10:]), (10, 30),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

    cv2.imshow("Real-Time Sign Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




Model expects input shape: (None, 20, 96, 96, 3)


In [13]:
import cv2
import numpy as np
import mediapipe as mp
from collections import deque

# -------------------
# Setup
# -------------------
SEQUENCE_LENGTH = 20
frame_buffer = deque(maxlen=SEQUENCE_LENGTH)

cap = cv2.VideoCapture(0)

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False,
                       max_num_hands=2,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)
mp_draw = mp.solutions.drawing_utils

# Load your model + labels
from tensorflow.keras.models import load_model
model = load_model("bisindo_words_model.h5")
labels = ["apa", "dimana", "kamu", "halo", "duduk", "NO"]  # adjust to your dataset

# State variables
sentence = []
last_label = None
same_count = 0
CONF_THRESH = 0.7
STABILITY_COUNT = 5

prev_frame_gray = None

# -------------------
# Main Loop
# -------------------
while True:
    ret, frame = cap.read()
    if not ret:
        break

    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(frame_rgb)

    # -------------------
    # Hand presence detection
    # -------------------
    hand_present = results.multi_hand_landmarks is not None
    if hand_present:
        for lm in results.multi_hand_landmarks:
            mp_draw.draw_landmarks(frame, lm, mp_hands.HAND_CONNECTIONS)

    # -------------------
    # Motion detection
    # -------------------
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    motion_val = 0.0
    if prev_frame_gray is not None:
        diff = cv2.absdiff(prev_frame_gray, gray)
        motion_val = np.mean(diff) / 255.0
    prev_frame_gray = gray

    # -------------------
    # Preprocess and buffer frames
    # -------------------
    img = cv2.resize(frame, (96, 96))
    img = img.astype("float32") / 255.0
    frame_buffer.append(img)

    accepted_label, accepted_conf = "NO", 1.0

    # -------------------
    # Prediction (only if we have 20 frames + hands detected)
    # -------------------
    if len(frame_buffer) == SEQUENCE_LENGTH:
        input_data = np.expand_dims(frame_buffer, axis=0)  # (1,20,96,96,3)
        preds = model.predict(input_data, verbose=0)[0]

        pred_idx = np.argmax(preds)
        pred_conf = preds[pred_idx]
        pred_label = labels[pred_idx]

        # Decide if it's "NO gesture"
        if not hand_present or motion_val < 0.01:
            accepted_label = "NO"
            accepted_conf = 1.0
        elif pred_conf >= CONF_THRESH:
            if pred_label == last_label:
                same_count += 1
            else:
                same_count = 1
                last_label = pred_label

            if same_count >= STABILITY_COUNT:
                accepted_label = pred_label
                accepted_conf = pred_conf
                sentence.append(pred_label)

    # -------------------
    # Display
    # -------------------
    cv2.putText(frame, f"Pred: {accepted_label} {accepted_conf:.2f}", (10, 30),
                cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2)
    cv2.putText(frame, "Sentence: " + " ".join(sentence[-5:]), (10, 60),
                cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)
    cv2.putText(frame, f"Motion:{motion_val:.4f} Hands:{int(hand_present)}", (10, 90),
                cv2.FONT_HERSHEY_SIMPLEX, 0.6, (200, 200, 0), 2)

    cv2.imshow("Sign Detection", frame)
    if cv2.waitKey(1) & 0xFF == ord("q"):
        break

cap.release()
cv2.destroyAllWindows()




In [14]:
import cv2
import mediapipe as mp
import numpy as np
import tensorflow as tf
from collections import deque

# ---- Load trained model ----
model = tf.keras.models.load_model("bisindo_words_model.h5")
class_names = ["Halo", "Kamu", "Apa", "Dimana", "Duduk"]

# ---- Parameters ----
SEQ_LENGTH = 20   # your model expects 20 frames
sentence = []
pred_buffer = deque(maxlen=5)   # for stable final labels
prob_buffer = deque(maxlen=10)  # smooth probabilities

# Running averages for auto-threshold
class_running_mean = {cls: 0.5 for cls in class_names}
alpha = 0.1        # smoothing (0.1 = slow, 0.3 = fast)
base_margin = 0.1  # margin above average confidence
NO_GESTURE_LABEL = "NoGesture"

# ---- MediaPipe setup ----
mp_hands = mp.solutions.hands
mp_drawing = mp.solutions.drawing_utils
hands = mp_hands.Hands(max_num_hands=1,
                       min_detection_confidence=0.5,
                       min_tracking_confidence=0.5)

cap = cv2.VideoCapture(0)

while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    results = hands.process(rgb)

    if results.multi_hand_landmarks:
        for hand_landmarks in results.multi_hand_landmarks:
            mp_drawing.draw_landmarks(frame, hand_landmarks, mp_hands.HAND_CONNECTIONS)

            # ---- Extract hand ROI ----
            h, w, _ = frame.shape
            x_coords = [lm.x for lm in hand_landmarks.landmark]
            y_coords = [lm.y for lm in hand_landmarks.landmark]
            x_min, x_max = int(min(x_coords) * w), int(max(x_coords) * w)
            y_min, y_max = int(min(y_coords) * h), int(max(y_coords) * h)

            # Add padding
            x_min, y_min = max(0, x_min - 20), max(0, y_min - 20)
            x_max, y_max = min(w, x_max + 20), min(h, y_max + 20)

            roi = frame[y_min:y_max, x_min:x_max]
            if roi.size == 0:
                continue
            roi_resized = cv2.resize(roi, (96, 96))
            roi_norm = roi_resized.astype("float32") / 255.0

            # ---- Format input (1,20,96,96,3) ----
            input_data = np.expand_dims(roi_norm, axis=0)   # (1,96,96,3)
            input_data = np.repeat(input_data[np.newaxis, ...], SEQ_LENGTH, axis=1)

            # ---- Prediction ----
            preds = model.predict(input_data, verbose=0)[0]
            prob_buffer.append(preds)

            # Average probs across buffer
            avg_preds = np.mean(prob_buffer, axis=0)

            # Top-2 predictions
            sorted_idx = np.argsort(avg_preds)[::-1]
            top1, top2 = sorted_idx[0], sorted_idx[1]
            pred_idx = top1
            confidence = avg_preds[pred_idx]
            predicted_label = class_names[pred_idx]

            # ---- Auto-adjust threshold ----
            prev_mean = class_running_mean[predicted_label]
            new_mean = (1 - alpha) * prev_mean + alpha * confidence
            class_running_mean[predicted_label] = new_mean

            dynamic_thresh = new_mean + base_margin

            # ---- Decide final prediction ----
            final_label = NO_GESTURE_LABEL
            if confidence > dynamic_thresh:
                pred_buffer.append(predicted_label)

                if len(pred_buffer) == pred_buffer.maxlen and all(p == pred_buffer[0] for p in pred_buffer):
                    sentence.append(predicted_label)
                    pred_buffer.clear()
                final_label = predicted_label

            # ---- Debug text ----
            debug_text = (f"{class_names[top1]}:{avg_preds[top1]:.2f} (th:{dynamic_thresh:.2f}) | "
                          f"{class_names[top2]}:{avg_preds[top2]:.2f}")
            cv2.putText(frame, debug_text, (10, 30),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.65, (0, 255, 0), 2)

            cv2.putText(frame, f"Final: {final_label}", (10, 60),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.8, (255, 0, 0), 2)

    # ---- Show sentence ----
    if sentence:
        cv2.putText(frame, "Sentence: " + " ".join(sentence[-10:]), (10, 90),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.8, (200, 100, 200), 2)

    cv2.imshow("Real-Time Sign Recognition", frame)
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

cap.release()
cv2.destroyAllWindows()




In [15]:
# preprocess.py
import os, glob, json
import numpy as np
import cv2
import mediapipe as mp
from tqdm import tqdm

DATASET_DIR = "D:/SFT_ATTEMPT2/raw_video"
TARGET_WORDS = ["Halo","Kamu","Apa","Dimana","Duduk"]
CACHE_DIR = "cache_landmarks"
SEQ_LEN = 20
TARGET_FPS = 24

os.makedirs(CACHE_DIR, exist_ok=True)

mp_hands = mp.solutions.hands
hands = mp_hands.Hands(static_image_mode=False, model_complexity=0,
                       max_num_hands=2, min_detection_confidence=0.5, min_tracking_confidence=0.5)

def normalize_hand(landmarks, w, h):
    """Return 63-d vector for 21 landmarks (x,y,z) normalized relative to wrist & scale.
       If landmarks is None return zeros."""
    if landmarks is None:
        return np.zeros(63, dtype=np.float32)
    pts = np.array([(lm.x, lm.y, lm.z) for lm in landmarks], dtype=np.float32)
    # center at wrist (index 0)
    center = pts[0].copy()
    pts[:, :2] -= center[:2]
    px = pts[:,0]*w
    py = pts[:,1]*h
    scale = max(px.max()-px.min(), py.max()-py.min(), 1e-3)
    pts[:, :2] /= (scale / max(w,h))
    return pts.flatten().astype(np.float32)

def extract_sequences_from_video(path, seq_len=SEQ_LEN, target_fps=TARGET_FPS):
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        return []
    src_fps = cap.get(cv2.CAP_PROP_FPS) or 30
    step = max(int(round(src_fps / target_fps)), 1)

    frames_feats = []
    i = 0
    while True:
        ret, frame = cap.read()
        if not ret:
            break
        if i % step != 0:
            i += 1
            continue
        h,w = frame.shape[:2]
        rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        res = hands.process(rgb)

        left = None; right = None
        if res.multi_hand_landmarks and res.multi_handedness:
            # pair them
            for lm, handed in zip(res.multi_hand_landmarks, res.multi_handedness):
                if handed.classification[0].label.lower() == "left":
                    left = lm.landmark
                else:
                    right = lm.landmark

        L = normalize_hand(left, w, h)
        R = normalize_hand(right, w, h)
        present = np.array([1.0 if left is not None else 0.0, 1.0 if right is not None else 0.0], dtype=np.float32)
        feat = np.concatenate([L, R, present])    # length = 63+63+2 = 128
        frames_feats.append(feat)
        i += 1

    cap.release()
    if len(frames_feats) < seq_len:
        return []

    seqs = []
    for start in range(0, len(frames_feats) - seq_len + 1, seq_len):
        seqs.append(np.stack(frames_feats[start:start+seq_len], axis=0))
    return seqs

# Scan dataset
X=[]; y=[]
label_map = {w:i for i,w in enumerate(TARGET_WORDS)}
for w in TARGET_WORDS:
    folder = os.path.join(DATASET_DIR, w)
    if not os.path.isdir(folder): 
        print("Missing folder:", folder); continue
    vids = glob.glob(os.path.join(folder, "*.mp4"))
    print(f"Found {len(vids)} videos for {w}")
    for v in tqdm(vids):
        seqs = extract_sequences_from_video(v)
        for s in seqs:
            X.append(s)   # shape (T,128)
            y.append(label_map[w])

X = np.array(X, dtype=np.float32)
y = np.array(y, dtype=np.int64)

print("Built dataset shapes:", X.shape, y.shape)
np.save(os.path.join(CACHE_DIR, "X.npy"), X)
np.save(os.path.join(CACHE_DIR, "y.npy"), y)
with open(os.path.join(CACHE_DIR, "labels.json"), "w") as f:
    json.dump(TARGET_WORDS, f)
print("Saved to", CACHE_DIR)


Found 50 videos for Halo


100%|██████████| 50/50 [00:55<00:00,  1.11s/it]


Found 50 videos for Kamu


100%|██████████| 50/50 [00:39<00:00,  1.27it/s]


Found 50 videos for Apa


100%|██████████| 50/50 [00:33<00:00,  1.47it/s]


Found 50 videos for Dimana


100%|██████████| 50/50 [00:45<00:00,  1.10it/s]


Found 50 videos for Duduk


100%|██████████| 50/50 [01:20<00:00,  1.62s/it]

Built dataset shapes: (773, 20, 128) (773,)
Saved to cache_landmarks





In [17]:
# train.py
import os
import math
import json
import numpy as np
from tensorflow import keras
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight

# -------------------------
# Config
# -------------------------
CACHE_DIR = "cache_landmarks"
SEQ_LEN = 20
MODEL_PATH = "bisindo_landmarks.keras"   # recommended Keras v3 format
BATCH_SIZE = 32
EPOCHS = 80

# -------------------------
# Load data
# -------------------------
X = np.load(os.path.join(CACHE_DIR, "X.npy"))  # expected shape (N, T, F)
y = np.load(os.path.join(CACHE_DIR, "y.npy"))
with open(os.path.join(CACHE_DIR, "labels.json"), "r", encoding="utf-8") as f:
    labels = json.load(f)

print("Loaded data shapes:", X.shape, y.shape)
num_classes = len(labels)

# -------------------------
# Train / val split
# -------------------------
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)
print("Train:", X_train.shape, y_train.shape, "Val:", X_val.shape, y_val.shape)

# -------------------------
# Augmentation helper
# -------------------------
def augment_batch(X_batch, prob=0.5):
    """Add small gaussian jitter to landmarks with probability per sample."""
    Xb = X_batch.copy()
    for i in range(len(Xb)):
        if np.random.rand() < prob:
            noise = np.random.normal(0, 0.01, Xb[i].shape).astype(np.float32)
            Xb[i] = Xb[i] + noise
    return Xb

# -------------------------
# Compute class weights and map to sample weights
# -------------------------
class_weights_arr = compute_class_weight("balanced", classes=np.unique(y_train), y=y_train)
class_weights = dict(enumerate(class_weights_arr))
print("Class weights:", class_weights)

# -------------------------
# Generator that yields (Xb, yb, sample_weight)
# -------------------------
def gen_with_weights(Xa, ya, batch_size=BATCH_SIZE):
    n = len(Xa)
    idx = np.arange(n)
    while True:
        np.random.shuffle(idx)
        for i in range(0, n, batch_size):
            b = idx[i:i+batch_size]
            Xb = Xa[b].copy()
            Xb = augment_batch(Xb, prob=0.5)
            yb = ya[b]
            # sample weight per item based on its class
            sw = np.array([class_weights[int(lbl)] for lbl in yb], dtype=np.float32)
            yield Xb, yb, sw

# -------------------------
# Build model (LSTM on landmarks)
# -------------------------
input_shape = X_train.shape[1:]  # (T, F)
inputs = keras.Input(shape=input_shape)
x = keras.layers.Masking(mask_value=0.0)(inputs)
x = keras.layers.LSTM(128, return_sequences=True)(x)
x = keras.layers.Dropout(0.25)(x)
x = keras.layers.LSTM(64)(x)
x = keras.layers.Dropout(0.25)(x)
x = keras.layers.Dense(64, activation="relu")(x)
outputs = keras.layers.Dense(num_classes, activation="softmax")(x)

model = keras.Model(inputs, outputs)
model.compile(optimizer=keras.optimizers.Adam(1e-3),
              loss="sparse_categorical_crossentropy",
              metrics=["accuracy"])
model.summary()

# -------------------------
# Callbacks
# -------------------------
cb = [
    keras.callbacks.ModelCheckpoint(MODEL_PATH, save_best_only=True, monitor="val_accuracy", mode="max", verbose=1),
    keras.callbacks.ReduceLROnPlateau(patience=3, factor=0.5, verbose=1),
    keras.callbacks.EarlyStopping(patience=8, restore_best_weights=True, verbose=1)
]

# -------------------------
# Fit using generator that yields sample weights
# -------------------------
steps_per_epoch = max(1, math.ceil(len(X_train) / BATCH_SIZE))
print("Steps per epoch:", steps_per_epoch)

history = model.fit(
    gen_with_weights(X_train, y_train, batch_size=BATCH_SIZE),
    steps_per_epoch=steps_per_epoch,
    validation_data=(X_val, y_val),
    epochs=EPOCHS,
    callbacks=cb,
    verbose=2
)

# ensure final save in Keras format
model.save(MODEL_PATH)
print("Training finished. Model saved to", MODEL_PATH)


Loaded data shapes: (773, 20, 128) (773,)
Train: (618, 20, 128) (618,) Val: (155, 20, 128) (155,)
Class weights: {0: 0.8524137931034482, 1: 1.0747826086956522, 2: 1.0564102564102564, 3: 1.03, 4: 1.0214876033057851}


Steps per epoch: 20
Epoch 1/80

Epoch 1: val_accuracy improved from -inf to 0.81290, saving model to bisindo_landmarks.keras
20/20 - 5s - 254ms/step - accuracy: 0.6440 - loss: 1.1415 - val_accuracy: 0.8129 - val_loss: 0.6547 - learning_rate: 0.0010
Epoch 2/80

Epoch 2: val_accuracy improved from 0.81290 to 0.87097, saving model to bisindo_landmarks.keras
20/20 - 0s - 22ms/step - accuracy: 0.8738 - loss: 0.4228 - val_accuracy: 0.8710 - val_loss: 0.3642 - learning_rate: 0.0010
Epoch 3/80

Epoch 3: val_accuracy improved from 0.87097 to 0.89032, saving model to bisindo_landmarks.keras
20/20 - 0s - 21ms/step - accuracy: 0.9207 - loss: 0.2708 - val_accuracy: 0.8903 - val_loss: 0.3343 - learning_rate: 0.0010
Epoch 4/80

Epoch 4: val_accuracy improved from 0.89032 to 0.89677, saving model to bisindo_landmarks.keras
20/20 - 0s - 23ms/step - accuracy: 0.9482 - loss: 0.1781 - val_accuracy: 0.8968 - val_loss: 0.2977 - learning_rate: 0.0010
Epoch 5/80

Epoch 5: val_accuracy improved from 0.89677 to

In [18]:
import cv2, numpy as np, json, tensorflow as tf
from pathlib import Path
# assumes you have the normalize function from preprocess.py (or reuse code)

# quick helper: extract first seq of length SEQ_LEN from a video using mediapipe landmarks
def extract_seq_from_video(video_path, seq_len=20, target_fps=24):
    import mediapipe as mp
    cap = cv2.VideoCapture(video_path)
    mp_hands = mp.solutions.hands
    hands = mp_hands.Hands(static_image_mode=False, max_num_hands=2,
                           min_detection_confidence=0.5, min_tracking_confidence=0.5)
    feats=[]
    i=0
    src_fps = cap.get(cv2.CAP_PROP_FPS) or 30
    step = max(int(round(src_fps/target_fps)),1)
    while True:
        ret, frame = cap.read()
        if not ret: break
        if i % step != 0:
            i+=1; continue
        h,w = frame.shape[:2]
        res = hands.process(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        left=None; right=None
        if res.multi_hand_landmarks and res.multi_handedness:
            for lm, handed in zip(res.multi_hand_landmarks, res.multi_handedness):
                if handed.classification[0].label.lower()=="left": left=lm.landmark
                else: right=lm.landmark
        # use normalize_hand function from preprocess.py (copy it here)
        def normalize_hand(landmarks,w,h):
            if landmarks is None: return np.zeros(63,dtype=np.float32)
            pts=np.array([(lm.x,lm.y,lm.z) for lm in landmarks],dtype=np.float32)
            center=pts[0].copy(); pts[:,:2]-=center[:2]
            px=pts[:,0]*w; py=pts[:,1]*h
            scale=max(px.max()-px.min(), py.max()-py.min(), 1e-3)
            pts[:,:2]/=(scale/max(w,h))
            return pts.flatten().astype(np.float32)
        L=normalize_hand(left,w,h); R=normalize_hand(right,w,h)
        pres=np.array([1.0 if left else 0.0, 1.0 if right else 0.0], dtype=np.float32)
        feats.append(np.concatenate([L,R,pres]))
        i+=1
        if len(feats)>=seq_len: break
    cap.release()
    hands.close()
    if len(feats)<seq_len: return None
    return np.stack(feats[:seq_len], axis=0)

model = tf.keras.models.load_model("bisindo_landmarks.keras")
labels = json.load(open("cache_landmarks/labels.json"))

seq = extract_seq_from_video("D:/SFT_ATTEMPT2/raw_video/Duduk/Duduk_001.mp4", seq_len=20)
if seq is None:
    print("Not enough frames or missing landmarks")
else:
    pred = model.predict(np.expand_dims(seq,0))[0]
    print("Top probs:", sorted([(labels[i], float(pred[i])) for i in range(len(pred))], key=lambda x:-x[1])[:5])


Not enough frames or missing landmarks
