In [1]:
import os
import time
import warnings
from dataclasses import dataclass
from pathlib import Path
from typing import Dict, List, Tuple, Optional

import cv2
import numpy as np
import tensorflow as tf
from ultralytics import YOLO
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
import pickle
import face_recognition

2025-11-21 01:58:32.710167: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-21 01:58:32.722158: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763661512.732455  319762 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763661512.735605  319762 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1763661512.745743  319762 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
@dataclass
class Config:
    """Central configuration: model paths, input video, face dataset and layout sizes.

    Every path default below is computed from ``BASE_DIR`` while the class body
    is executed and stored as a plain string, so passing a different
    ``BASE_DIR`` to the constructor does NOT re-derive the other paths.
    """

    # BASE_DIR: directory of this file (when run as a .py) or the CWD (in Jupyter)
    BASE_DIR: Path = (
        Path(__file__).resolve().parent
        if "__file__" in globals()
        else Path(os.getcwd())
    )

    # YOLO models (.pt or .engine)
    PERSON_MODEL_PATH: str = str(BASE_DIR / "model" / "yolo" / "yolo12n.pt")
    WEAPON_MODEL_PATH: str = str(BASE_DIR / "model" / "yolo" / "yolov12n-weapon.pt")
    FACE_MODEL_PATH:   str = str(BASE_DIR / "model" / "yolo" / "yolov12n-face.pt")

    # LSTM artifacts.
    # NOTE(review): the original comment said "uses lstm_s v2", but the file
    # name below is lstm_model_v1.pkl — confirm which version is intended.
    LSTM_ARTIFACT_PKL: str = str(
        BASE_DIR / "model" / "trained" / "lstm_s" / "lstm_model_v1.pkl"
    )
    LSTM_MODEL_DIR: str = str(BASE_DIR / "model" / "trained" / "lstm_s")

    # Input video
    VIDEO_PATH: str = str(
        BASE_DIR
        / "assets"
        / "dataset"
        / "v2"
        / "WhatsApp Video 2025-11-21 at 01.23.21.mp4"
    )

    # Face dataset (folder containing one sub-folder per person: rusdi, nawfal, rio, ...)
    KNOWN_FACES_DIR: str = str(BASE_DIR / "assets" / "dataset" / "face")

    # Layout / sequence
    FIXED_SIZE: int = 640           # normalised height used for display (optional)
    SMALL_SIZE: Tuple[int, int] = (150, 150)
    NUM_FRAMES: int = 15            # LSTM sequence length


# Module-level configuration instance used by the cells below.
cfg = Config()

# Short alias for the MediaPipe Holistic solution module.
mp_holistic = mp.solutions.holistic

In [3]:
def load_or_build_trt_engine(model_path: str, device: str = "0", half: bool = True):
    """
    Return a YOLO model backed by a TensorRT ``.engine`` file.

    - A path ending in ``.engine`` is loaded directly.
    - A path ending in ``.pt`` is mapped to the sibling engine file (same
      directory and stem, ``.engine`` extension). If that file already exists
      it is loaded; otherwise the ``.pt`` weights are exported to an engine
      (task='detect') and the result is loaded.

    Args:
        model_path: path ending in ``.pt`` or ``.engine``.
        device: device identifier forwarded to the export call.
        half: forwarded to the export call as the ``half`` option.

    Returns:
        A ``YOLO`` instance constructed from the engine path.

    Raises:
        ValueError: if ``model_path`` ends in neither ``.pt`` nor ``.engine``.
    """
    if model_path.endswith(".engine"):
        print(f"[TensorRT] Loading existing .engine: {model_path}")
        return YOLO(model_path)

    pt_suffix = ".pt"
    if not model_path.endswith(pt_suffix):
        raise ValueError(f"Model path bukan .pt atau .engine: {model_path}")

    # Swap only the trailing extension. str.replace(".pt", ".engine") would
    # also rewrite any earlier ".pt" in the path (e.g. a "run.pt_models/"
    # directory) and point at a file that does not exist.
    engine_path = model_path[: -len(pt_suffix)] + ".engine"

    if os.path.exists(engine_path):
        print(f"[TensorRT] Engine sudah ada, loading: {engine_path}")
        return YOLO(engine_path)

    print(f"[TensorRT] Converting {model_path} → {engine_path}")
    model = YOLO(model_path)
    model.export(
        format="engine",
        device=device,
        half=half,
        task="detect",
    )
    print(f"[TensorRT] DONE converting → {engine_path}")
    return YOLO(engine_path)

In [4]:
def smart_yolo_loader(path: str) -> Optional[YOLO]:
    """
    Defensive model loader that never raises for a missing/unsupported file.

    - Empty path or file not on disk → prints a notice and returns None.
    - ``.pt`` file → delegated to ``load_or_build_trt_engine``.
    - ``.engine`` file → loaded directly with ``YOLO``.
    - Any other extension → prints a notice and returns None.
    """
    if not path:
        print("[YOLO] Path kosong, skip.")
        return None

    file_is_missing = not os.path.exists(path)
    if file_is_missing:
        print(f"[YOLO] File model tidak ditemukan, skip: {path}")
        return None

    # The two supported suffixes are mutually exclusive, so check order is free.
    if path.endswith(".pt"):
        print(f"[YOLO] Loading/Converting PT: {path}")
        return load_or_build_trt_engine(path)

    if path.endswith(".engine"):
        print(f"[YOLO] Loading engine: {path}")
        return YOLO(path)

    print(f"[YOLO] Format model tidak didukung: {path}")
    return None

In [5]:
def load_all_yolo_models(cfg: Config) -> Dict[str, Optional[YOLO]]:
    """
    Load the person, weapon and face detectors named in ``cfg``.

    Returns a dict keyed by "person", "weapon" and "face"; each value is
    whatever ``smart_yolo_loader`` returns for the corresponding path.
    """
    role_to_path = (
        ("person", cfg.PERSON_MODEL_PATH),
        ("weapon", cfg.WEAPON_MODEL_PATH),
        ("face", cfg.FACE_MODEL_PATH),
    )
    return {role: smart_yolo_loader(model_path) for role, model_path in role_to_path}

In [6]:
def load_lstm_artifacts(cfg: Config):
    """
    Load the scaler, label encoder and Keras LSTM model described by the pkl.

    Any directory stored alongside the model file name inside the pkl is
    ignored: only the file name is kept and it is resolved against
    ``cfg.LSTM_MODEL_DIR``.

    Args:
        cfg: object providing ``LSTM_ARTIFACT_PKL`` and ``LSTM_MODEL_DIR``.

    Returns:
        dict with keys "scaler", "label_encoder", "model_pred" and
        "num_classes". "model_pred" is None when the Keras model file exists
        but fails to load (the error is printed); "num_classes" is None when
        the pkl does not contain it.

    Raises:
        FileNotFoundError: if the pkl or the resolved Keras model file is missing.
    """
    if not os.path.exists(cfg.LSTM_ARTIFACT_PKL):
        raise FileNotFoundError(
            f"LSTM artifact pkl tidak ditemukan: {cfg.LSTM_ARTIFACT_PKL}"
        )

    # SECURITY: pickle.load can execute arbitrary code; only use artifact
    # files that come from a trusted source.
    with open(cfg.LSTM_ARTIFACT_PKL, "rb") as f:
        artifacts = pickle.load(f)

    scaler = artifacts["scaler"]
    label_encoder = artifacts["label_encoder"]
    keras_model_path = artifacts["model_filename"]  # may contain a stale path
    num_classes = artifacts.get("num_classes", None)

    # The stored path may have been written on Windows ("D:\\...\\x.keras").
    # os.path.basename on POSIX does not treat "\\" as a separator, so
    # normalise the separators first to really keep only the file name.
    keras_filename = os.path.basename(str(keras_model_path).replace("\\", "/"))
    full_model_path = os.path.normpath(os.path.join(cfg.LSTM_MODEL_DIR, keras_filename))

    print(f"[LSTM] Will load model from: {full_model_path}")

    if not os.path.exists(full_model_path):
        raise FileNotFoundError(
            f"File model LSTM Keras tidak ditemukan: {full_model_path}"
        )

    # Best effort: a model that cannot be deserialised disables motion
    # prediction downstream instead of aborting the whole pipeline.
    try:
        model_pred = tf.keras.models.load_model(full_model_path)
        print(f"[LSTM] Model Keras loaded from: {full_model_path}")
    except Exception as e:
        print(f"[LSTM] Error loading Keras model: {e}")
        model_pred = None

    return {
        "scaler": scaler,
        "label_encoder": label_encoder,
        "model_pred": model_pred,
        "num_classes": num_classes,
    }

In [7]:
def detect_faces_yolo(
    frame: np.ndarray, face_model: Optional[YOLO], conf: float = 0.25
):
    """
    Run the YOLO face detector on ``frame``.

    Returns a list of ``(top, right, bottom, left, conf)`` tuples, i.e. the
    coordinate order used by face_recognition. Boxes are clamped to the frame
    and boxes that collapse to zero width/height are dropped. Returns an empty
    list when no model is given or nothing is detected.
    """
    if face_model is None:
        return []

    frame_h, frame_w, _ = frame.shape
    prediction = face_model(frame, conf=conf, verbose=False)[0]

    if prediction.boxes is None or len(prediction.boxes) == 0:
        return []

    face_boxes = []
    for det in prediction.boxes:
        left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())

        # clamp to the image rectangle
        left = max(0, left)
        top = max(0, top)
        right = min(frame_w, right)
        bottom = min(frame_h, bottom)

        is_degenerate = right <= left or bottom <= top
        if is_degenerate:
            continue

        # face_recognition order: top, right, bottom, left
        face_boxes.append((top, right, bottom, left, float(det.conf)))

    return face_boxes

In [8]:
def load_face_database(
    cfg: Config,
    face_model: Optional[YOLO],
    exts=(".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"),
):
    """
    Scan ``cfg.KNOWN_FACES_DIR/<name>/...`` and encode every detected face.

    - Faces are first located with the YOLO face model (``detect_faces_yolo``).
    - Each located box is then encoded with ``face_recognition``.

    The sub-folder name is used as the label (rusdi, nawfal, rio, ...).
    Folders and files are visited in sorted order so the resulting database
    is the same on every run.

    Args:
        cfg: object providing ``KNOWN_FACES_DIR``.
        face_model: YOLO face detector, or None.
        exts: accepted image extensions; matched case-insensitively.
            Files whose name contains no "." are still attempted.

    Returns:
        ``{"encodings": np.ndarray (N, 128) float32, "names": List[str]}``,
        or None when the folder is missing, no face model is available, or no
        encoding could be collected.
    """
    base_dir = cfg.KNOWN_FACES_DIR
    if not os.path.isdir(base_dir):
        print(f"[FaceDB] Folder tidak ditemukan: {base_dir}")
        return None

    if face_model is None:
        print("[FaceDB] YOLO face model tidak tersedia, tidak bisa build DB.")
        return None

    # Compare extensions case-insensitively so ".Jpg"/".Png" are not skipped.
    if isinstance(exts, str):
        exts = (exts,)
    allowed_exts = tuple(ext.lower() for ext in exts)

    all_encodings = []
    all_names = []

    print(f"[FaceDB] Building face DB from: {base_dir}")
    for person_name in sorted(os.listdir(base_dir)):
        person_dir = os.path.join(base_dir, person_name)
        if not os.path.isdir(person_dir):
            continue

        for fname in sorted(os.listdir(person_dir)):
            fpath = os.path.join(person_dir, fname)
            if not os.path.isfile(fpath):
                continue

            # rough filter by extension (extension-less files are still tried)
            if "." in fname and not fname.lower().endswith(allowed_exts):
                continue

            # One unreadable/odd image must not abort the whole DB build.
            try:
                img_bgr = cv2.imread(fpath)
                if img_bgr is None:
                    print(f"[FaceDB] Gagal baca gambar: {fpath}")
                    continue

                # Locate faces with YOLO-face
                face_boxes = detect_faces_yolo(img_bgr, face_model, conf=0.4)
                if not face_boxes:
                    print(f"[FaceDB] Tidak ada wajah terdeteksi (YOLO): {fpath}")
                    continue

                img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
                boxes_fr = [(t, r, b, l) for (t, r, b, l, c) in face_boxes]

                encs = face_recognition.face_encodings(
                    img_rgb,
                    known_face_locations=boxes_fr,
                    model="large",
                )

                if not encs:
                    print(f"[FaceDB] face_recognition tidak menemukan wajah: {fpath}")
                    continue

                # Every face found in the image is labelled with the folder name.
                for enc in encs:
                    all_encodings.append(enc)
                    all_names.append(person_name)

                print(f"[FaceDB] Encoded {person_name}: {fname} (faces={len(encs)})")

            except Exception as e:
                print(f"[FaceDB] Skip {fpath}: {e}")

    if not all_encodings:
        print("[FaceDB] No face encodings collected.")
        return None

    encodings_arr = np.array(all_encodings, dtype=np.float32)
    print(f"[FaceDB] Total encodings: {len(all_names)}")
    return {"encodings": encodings_arr, "names": all_names}

In [9]:
def recognize_faces_in_frame(
    frame_bgr: np.ndarray,
    face_boxes_tr: List[tuple],
    face_db: Optional[dict],
    match_threshold: float = 0.2,
):
    """
    Identify each detected face against ``face_db`` and annotate the frame.

    Args:
        frame_bgr: BGR image; boxes and labels are drawn on it in place.
        face_boxes_tr: list of ``(top, right, bottom, left, conf)`` from the
            YOLO face detector.
        face_db: ``{"encodings": np.ndarray(N, 128), "names": List[str]}`` or None.
        match_threshold: a face is given the nearest known name only when the
            Euclidean distance to that encoding is strictly below this value;
            otherwise it is labelled "Unknown".

    Returns:
        The same ``frame_bgr`` object (unchanged when ``face_db`` is None or
        there are no boxes).
    """
    if face_db is None or not face_boxes_tr:
        return frame_bgr

    known_encodings = face_db["encodings"]
    known_names = face_db["names"]
    has_known_faces = known_encodings.size > 0  # loop-invariant

    boxes_fr = [(t, r, b, l) for (t, r, b, l, c) in face_boxes_tr]
    confs = [c for (_, _, _, _, c) in face_boxes_tr]

    try:
        frame_rgb = cv2.cvtColor(frame_bgr, cv2.COLOR_BGR2RGB)
    except Exception:
        # Channel-reversed slicing yields a negative-stride view; make it
        # contiguous because dlib (used by face_recognition) rejects
        # non-contiguous arrays.
        frame_rgb = np.ascontiguousarray(frame_bgr[:, :, ::-1])

    encodings = face_recognition.face_encodings(
        frame_rgb,
        known_face_locations=boxes_fr,
        model="large",
    )

    for enc, (top, right, bottom, left), conf in zip(encodings, boxes_fr, confs):
        name = "Unknown"
        dist_val = None
        if has_known_faces:
            enc_vec = enc.astype(np.float32)
            dists = np.linalg.norm(known_encodings - enc_vec, axis=1)
            idx_min = int(np.argmin(dists))
            dist_val = float(dists[idx_min])
            if dist_val < match_threshold:
                name = known_names[idx_min]

        color = (0, 255, 0) if name != "Unknown" else (0, 0, 255)
        label = (
            f"{name}" if dist_val is None else f"{name} ({dist_val:.2f}) ({conf:.2f})"
        )

        cv2.rectangle(frame_bgr, (left, top), (right, bottom), color, 2)

        # Keep the text baseline inside the image when the box touches the
        # top edge; otherwise the label would be drawn off-frame.
        label_y = max(top - 10, 20)
        cv2.putText(
            frame_bgr,
            label,
            (left, label_y),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            color,
            2,
        )

    return frame_bgr

In [10]:
# Class ids emitted by the weapon model that are reported, with their labels.
WEAPON_CLASS_NAMES = {
    0: "knife",
    1: "long_weapon",
    2: "pistol",
}


def detect_weapons_yolo(
    frame: np.ndarray,
    weapon_model: Optional[YOLO],
    conf: float = 0.35,
):
    """
    Run the weapon detector on the given frame.

    Returns a list of ``(x1, y1, x2, y2, conf, label)`` tuples. Detections
    whose class id is not in ``WEAPON_CLASS_NAMES`` are ignored, coordinates
    are clamped to the frame, and zero-area boxes are dropped. Returns an
    empty list when no model is given or nothing is detected.
    """
    found = []
    if weapon_model is None:
        return found

    frame_h, frame_w, _ = frame.shape
    prediction = weapon_model(frame, conf=conf, verbose=False)[0]

    if prediction.boxes is None or len(prediction.boxes) == 0:
        return found

    for det in prediction.boxes:
        class_id = int(det.cls)
        if class_id not in WEAPON_CLASS_NAMES:
            continue

        left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())
        left = max(0, left)
        top = max(0, top)
        right = min(frame_w, right)
        bottom = min(frame_h, bottom)

        has_area = right > left and bottom > top
        if not has_area:
            continue

        found.append(
            (left, top, right, bottom, float(det.conf), WEAPON_CLASS_NAMES[class_id])
        )

    return found

In [11]:
def draw_weapon_detections(frame: np.ndarray, detections):
    """
    Draw each weapon box on ``frame`` with an upper-case label and confidence.

    ``detections`` is an iterable of ``(x1, y1, x2, y2, conf, label)``.
    Pistols and long weapons are drawn in red, anything else in orange.
    The frame is modified in place; nothing is returned.
    """
    for left, top, right, bottom, score, kind in detections:
        # red for firearms/long weapons, orange otherwise (e.g. knife)
        box_color = (0, 0, 255) if kind in ("pistol", "long_weapon") else (0, 165, 255)

        # outline of the detection
        cv2.rectangle(frame, (left, top), (right, bottom), box_color, 3)

        caption = f"{kind.upper()}  {score:.2f}"

        # filled strip behind the caption so the text stays readable
        (text_w, text_h), _ = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)
        strip_top_left = (left, top - text_h - 8)
        strip_bottom_right = (left + text_w + 8, top)
        cv2.rectangle(frame, strip_top_left, strip_bottom_right, box_color, -1)

        # caption in white on top of the strip
        cv2.putText(
            frame,
            caption,
            (left + 4, top - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 255, 255),
            2,
        )

In [12]:
def detect_person_yolo(
    frame: np.ndarray, person_model: Optional[YOLO], conf: float = 0.35
):
    """
    Detect people (class 0 only) on the full frame.

    Returns a list of ``(x1, y1, x2, y2, conf)`` tuples with integer
    coordinates; an empty list when no model is given or nothing is found.
    """
    if person_model is None:
        return []

    prediction = person_model(frame, classes=[0], conf=conf, verbose=False)[0]
    people = []

    if prediction.boxes is None or len(prediction.boxes) == 0:
        return people

    for det in prediction.boxes:
        left, top, right, bottom = (int(v) for v in det.xyxy[0].cpu().numpy())
        people.append((left, top, right, bottom, float(det.conf)))

    return people

In [13]:
def draw_person_detections(frame: np.ndarray, detections):
    """
    Draw each person box on ``frame`` with a 'PERSON <conf>' caption.

    ``detections`` is an iterable of ``(x1, y1, x2, y2, conf)``. The frame is
    modified in place; nothing is returned.
    """
    for left, top, right, bottom, score in detections:
        box_color = (0, 150, 255)  # orange tone in BGR

        cv2.rectangle(frame, (left, top), (right, bottom), box_color, 2)

        caption = f"PERSON {score:.2f}"
        (text_w, text_h), _ = cv2.getTextSize(caption, cv2.FONT_HERSHEY_SIMPLEX, 0.7, 2)

        # filled strip above the box, then the caption in white on top of it
        strip_top_left = (left, top - text_h - 8)
        strip_bottom_right = (left + text_w + 8, top)
        cv2.rectangle(frame, strip_top_left, strip_bottom_right, box_color, -1)
        cv2.putText(
            frame,
            caption,
            (left + 4, top - 5),
            cv2.FONT_HERSHEY_SIMPLEX,
            0.7,
            (255, 255, 255),
            2,
        )

In [14]:
def select_person_for_motion(person_dets):
    """
    Choose the single person box to feed into motion analysis.

    The box with the largest area wins (the earliest one on ties).
    ``person_dets`` items are ``(x1, y1, x2, y2, conf)``. Returns the chosen
    box as a tuple, or None when the list is empty or no box has a positive
    area.
    """
    if not person_dets:
        return None

    def _area(det):
        x1, y1, x2, y2, _conf = det
        return (x2 - x1) * (y2 - y1)

    # max() keeps the first of several equally large boxes
    largest = max(person_dets, key=_area)
    if _area(largest) <= 0:
        return None

    x1, y1, x2, y2, conf = largest
    return (x1, y1, x2, y2, conf)

In [15]:
def fit_to_canvas(img: np.ndarray, fixed_size: int):
    """
    Letterbox ``img`` onto a white ``fixed_size`` x ``fixed_size`` canvas.

    The image is scaled so its longer side equals ``fixed_size`` (aspect
    ratio preserved) and centred on the canvas.

    Args:
        img: image whose first two dimensions are (height, width); the canvas
            is 3-channel, so a 3-channel image is expected.
        fixed_size: side length of the square output canvas in pixels.

    Returns:
        ``np.ndarray`` of shape ``(fixed_size, fixed_size, 3)``, dtype uint8.
    """
    h, w = img.shape[:2]
    scale = fixed_size / max(w, h)

    # int() truncation can produce 0 for extremely thin crops (e.g. 1 x 700),
    # which makes cv2.resize fail; keep at least one pixel per side.
    new_w = max(1, int(w * scale))
    new_h = max(1, int(h * scale))
    resized = cv2.resize(img, (new_w, new_h), interpolation=cv2.INTER_AREA)

    canvas = np.full((fixed_size, fixed_size, 3), 255, dtype=np.uint8)

    # centre the resized image; leftover pixels stay white
    top = (fixed_size - new_h) // 2
    left = (fixed_size - new_w) // 2
    canvas[top : top + new_h, left : left + new_w] = resized
    return canvas

In [16]:
def side_state(state: int, lndmrkX: Dict[str, float]) -> int:
    """
    Classify the face direction from where the nose X sits among the landmarks.

    Codes: 0 = centre, 1 = left, 2 = right. With ``state == 0`` the nose being
    the smallest X means left and the largest X means right; for any other
    ``state`` the two are swapped. When the nose is both the minimum and the
    maximum, the minimum rule applies.
    """
    nose_x = lndmrkX["nose"]
    smallest_x = min(lndmrkX.values())
    largest_x = max(lndmrkX.values())

    # Direction as seen with state == 0.
    if nose_x == smallest_x:
        unflipped = 1  # left
    elif nose_x == largest_x:
        unflipped = 2  # right
    else:
        return 0  # centre is the same for every state

    # Any non-zero state mirrors left <-> right (1 <-> 2).
    return unflipped if state == 0 else 3 - unflipped

In [17]:
def hand_state(state: int, lndmrkZ: Dict[str, float]) -> int:
    """
    Decide whether the hands count as 'shown', based on Z (depth) values.

    Both wrists having a smaller Z than the nose is the base condition.
    With ``state == 0`` that condition yields 1 (else 0); for any other
    ``state`` the result is inverted.
    """
    nose_depth = lndmrkZ["nose"]
    both_wrists_nearer = (
        lndmrkZ["wrist_r"] < nose_depth and lndmrkZ["wrist_l"] < nose_depth
    )

    if state != 0:
        return 0 if both_wrists_nearer else 1
    return 1 if both_wrists_nearer else 0

In [18]:
def get_ext_feature_value(lndmrk) -> Tuple[int, int, int]:
    """
    Derive the three extra per-frame features from five landmarks.

    ``lndmrk`` maps 'nose', 'ear_l', 'ear_r', 'wrist_r' and 'wrist_l' to
    landmark objects exposing ``.x`` and ``.z``.

    Returns ``(side, state, hand)`` where ``state`` is 0 when the nose Z is
    smaller than both ear Z values and 1 otherwise, and ``side`` / ``hand``
    come from ``side_state`` / ``hand_state`` evaluated with that state.
    """
    landmark_keys = ("nose", "ear_l", "ear_r", "wrist_r", "wrist_l")

    # Per-axis lookup tables in the shape the helper functions expect.
    x_by_name = {key: lndmrk[key].x for key in landmark_keys}
    z_by_name = {key: lndmrk[key].z for key in landmark_keys}

    nose_in_front_of_ears = z_by_name["nose"] < min(
        z_by_name["ear_l"], z_by_name["ear_r"]
    )
    state = 0 if nose_in_front_of_ears else 1

    side = side_state(state, x_by_name)
    hand = hand_state(state, z_by_name)

    return side, state, hand

In [19]:
class MotionSequenceBuffer:
    """
    Sliding window over the most recent ``num_frames`` samples.

    Five parallel lists are kept in step:
    - ``frames``: small thumbnail frames,
    - ``detected``: the landmark list captured for each frame,
    - ``face_direction`` / ``face_shown`` / ``hand_shown``: the extra features.
    """

    def __init__(self, num_frames: int):
        self.num_frames = num_frames
        self.frames: List[np.ndarray] = []
        self.detected: List[List[landmark_pb2.NormalizedLandmark]] = []
        self.face_direction: List[int] = []
        self.face_shown: List[int] = []
        self.hand_shown: List[int] = []

    def _tracks(self):
        """Return the five parallel lists in a fixed order."""
        return (
            self.frames,
            self.detected,
            self.face_direction,
            self.face_shown,
            self.hand_shown,
        )

    def add(
        self,
        frame_small: np.ndarray,
        landmarks: List[landmark_pb2.NormalizedLandmark],
        ext_feature: Tuple[int, int, int],
    ):
        """Append one sample, evicting the oldest one when the window is full."""
        side, state, hand = ext_feature
        tracks = self._tracks()

        window_is_full = len(self.frames) >= self.num_frames
        if window_is_full:
            for track in tracks:
                track.pop(0)

        new_values = (frame_small, landmarks, side, state, hand)
        for track, value in zip(tracks, new_values):
            track.append(value)

    def is_full(self) -> bool:
        """True once exactly ``num_frames`` samples are stored."""
        return self.num_frames == len(self.frames)

    def build_landmark_list(self) -> Optional[landmark_pb2.NormalizedLandmarkList]:
        """Concatenate all stored landmarks into one list message, or None if empty."""
        if not self.detected:
            return None

        every_landmark = []
        for per_frame in self.detected:
            every_landmark.extend(per_frame)

        merged = landmark_pb2.NormalizedLandmarkList()
        merged.landmark.extend(every_landmark)
        return merged

In [20]:
def build_layout(
    buffer: MotionSequenceBuffer,
    base_frame: np.ndarray,
    cfg: Config,
) -> np.ndarray:
    """
    Compose the final display image.

    - While the buffer is not full, ``base_frame`` is returned unchanged.
    - Once full, the stored thumbnails are grouped three per column, each
      column is stacked vertically and resized to
      ``(cfg.SMALL_SIZE[0], cfg.FIXED_SIZE)``, and the columns are appended
      to the right of ``base_frame``. With the default 15-frame buffer this
      gives 5 history columns; other buffer lengths give
      ``ceil(len / 3)`` columns instead of failing on an empty slice.

    ``base_frame`` must have height ``cfg.FIXED_SIZE`` for the horizontal
    stack to succeed.
    """
    if not buffer.is_full():
        return base_frame

    frames_per_column = 3
    small_w, _ = cfg.SMALL_SIZE
    history = buffer.frames

    stacked_columns = []
    # Step through the history in chunks so no slice is ever empty
    # (np.vstack raises on an empty sequence).
    for start in range(0, len(history), frames_per_column):
        column_frames = history[start : start + frames_per_column]
        stacked = np.vstack(column_frames)
        stacked = cv2.resize(stacked, (small_w, cfg.FIXED_SIZE))
        stacked_columns.append(stacked)

    final_layout = np.hstack([base_frame] + stacked_columns)
    return final_layout

In [21]:
def build_motion_feature_vector(
    buffer: MotionSequenceBuffer,
    landmark_list: landmark_pb2.NormalizedLandmarkList,
) -> List[float]:
    """
    Flatten the landmark sequence plus per-frame extras into one feature row.

    Layout:
    - every landmark contributes x, y, z, visibility;
    - after every 5th landmark (one frame) the next unused
      (face_direction, face_shown, hand_shown) triple from ``buffer`` is
      appended, for as long as the buffer still has one.
    """
    feature_row = []
    next_frame = 0
    frames_available = len(buffer.face_direction)

    for seen, point in enumerate(landmark_list.landmark, start=1):
        feature_row += [point.x, point.y, point.z, point.visibility]

        frame_completed = seen % 5 == 0
        if not frame_completed or next_frame >= frames_available:
            continue

        feature_row += [
            buffer.face_direction[next_frame],
            buffer.face_shown[next_frame],
            buffer.hand_shown[next_frame],
        ]
        next_frame += 1

    return feature_row

In [22]:
def predict_motion_class(
    motion_row: List[float],
    scaler,
    model_pred,
    label_encoder,
    min_conf: float = 0.0,
) -> Tuple[Optional[str], Optional[float]]:
    """
    Scale the feature row, shape it for the LSTM and predict the motion class.

    Returns:
      ``(motion_class, conf)`` where ``conf`` is the highest class
      probability, or ``(None, None)`` when there is no model, when that
      probability is below ``min_conf``, or when prediction raises (the error
      is printed).
    """
    if model_pred is None:
        return None, None

    # (features,) -> (1, features) -> scaled -> (1, 1, features)
    feature_matrix = np.array(motion_row).reshape(1, -1)
    scaled = scaler.transform(feature_matrix)
    lstm_input = scaled.reshape(1, 1, scaled.shape[1])

    try:
        # predict() output has shape (1, num_classes); take the only row
        class_probs = model_pred.predict(lstm_input, verbose=0)[0]
        best_index = int(np.argmax(class_probs))
        best_prob = float(class_probs[best_index])

        if best_prob < min_conf:
            return None, None

        predicted_label = label_encoder.inverse_transform([best_index])[0]
        return predicted_label, best_prob
    except Exception as e:
        print(f"[Predict] Error: {e}")
        return None, None

In [23]:
def _show_and_should_quit(image: np.ndarray, window_name: str = "Video") -> bool:
    """Display ``image`` in ``window_name``; return True if the user pressed 'q'."""
    cv2.imshow(window_name, image)
    return (cv2.waitKey(1) & 0xFF) == ord("q")


def process_video(cfg: Config):
    """
    Run the full detection pipeline on ``cfg.VIDEO_PATH`` and display it.

    Per frame: resize to ``cfg.FIXED_SIZE`` height, detect and draw weapons
    and persons, optionally recognise faces, crop the largest person, extract
    pose landmarks with MediaPipe Holistic, push them into a sliding buffer
    and, once the buffer is full, predict and overlay the motion class.
    Press 'q' in the window to stop.

    The capture and all OpenCV windows are released even if an exception is
    raised while processing.
    """
    # === Load all models ===
    yolo_models = load_all_yolo_models(cfg)
    lstm_artifacts = load_lstm_artifacts(cfg)
    scaler = lstm_artifacts["scaler"]
    label_encoder = lstm_artifacts["label_encoder"]
    model_pred = lstm_artifacts["model_pred"]

    # Face DB (rusdi / nawfal / rio) built with YOLO-face
    face_db = load_face_database(cfg, yolo_models["face"])

    warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")

    cap = cv2.VideoCapture(cfg.VIDEO_PATH)
    if not cap.isOpened():
        print(f"Error opening video file: {cfg.VIDEO_PATH}")
        cap.release()
        return

    buffer = MotionSequenceBuffer(cfg.NUM_FRAMES)

    try:
        with mp_holistic.Holistic(
            min_detection_confidence=0.5, min_tracking_confidence=0.5
        ) as holistic:

            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                # === 1) FULL FRAME FOR DISPLAY ===
                frame_disp = frame.copy()
                h, w = frame_disp.shape[:2]
                if h != cfg.FIXED_SIZE:
                    scale = cfg.FIXED_SIZE / float(h)
                    new_w = int(w * scale)
                    frame_disp = cv2.resize(frame_disp, (new_w, cfg.FIXED_SIZE))

                # === 2) WEAPON DETECTION ON THE FULL FRAME ===
                weapon_dets = detect_weapons_yolo(
                    frame_disp,
                    yolo_models["weapon"],
                    conf=0.5,
                )
                draw_weapon_detections(frame_disp, weapon_dets)

                # === 2.5) PERSON DETECTION ON THE FULL FRAME (once) ===
                person_dets = detect_person_yolo(
                    frame_disp,
                    yolo_models["person"],
                    conf=0.35,
                )
                draw_person_detections(frame_disp, person_dets)

                # pick one person for motion analysis (largest bbox)
                best_person = select_person_for_motion(person_dets)

                # === 3) (OPTIONAL) FACE RECOGNITION ON THE FULL FRAME ===
                face_boxes = detect_faces_yolo(
                    frame_disp, yolo_models["face"], conf=0.4
                )
                if face_boxes and face_db is not None:
                    recognize_faces_in_frame(
                        frame_disp, face_boxes, face_db, match_threshold=0.3
                    )

                # === 4) CROP THE PERSON (ONLY FOR THE MOTION MODEL) ===
                # NOTE(review): the crop is taken from the already-annotated
                # frame, so drawn boxes/labels are part of the pose input —
                # confirm this matches how the LSTM training data was built.
                person_crop = None
                if best_person is not None:
                    x1, y1, x2, y2, _ = best_person
                    person_raw = frame_disp[y1:y2, x1:x2]
                    if person_raw.size > 0:
                        person_crop = fit_to_canvas(person_raw, cfg.FIXED_SIZE)

                if person_crop is None:
                    # No person detected → still show the full frame with weapon/face overlays
                    if _show_and_should_quit(frame_disp):
                        break
                    continue

                # === 5) MOTION PIPELINE ON person_crop ===
                frame_small = cv2.resize(person_crop, cfg.SMALL_SIZE)

                # MediaPipe Holistic on the person crop
                crop_rgb = cv2.cvtColor(person_crop, cv2.COLOR_BGR2RGB)
                crop_rgb.flags.writeable = False
                results = holistic.process(crop_rgb)
                crop_rgb.flags.writeable = True

                if not results.pose_landmarks:
                    # Pose failed, but weapon/face overlays are still shown
                    if _show_and_should_quit(frame_disp):
                        break
                    continue

                pose_lm = results.pose_landmarks.landmark
                nose = pose_lm[mp_holistic.PoseLandmark.NOSE]
                ear_r = pose_lm[mp_holistic.PoseLandmark.RIGHT_EAR]
                ear_l = pose_lm[mp_holistic.PoseLandmark.LEFT_EAR]
                wrist_r = pose_lm[mp_holistic.PoseLandmark.RIGHT_WRIST]
                wrist_l = pose_lm[mp_holistic.PoseLandmark.LEFT_WRIST]

                # Order matters: it defines the per-frame feature layout.
                current_landmarks: List[landmark_pb2.NormalizedLandmark] = [
                    nose,
                    ear_l,
                    ear_r,
                    wrist_r,
                    wrist_l,
                ]

                # === 6) EXTRA FEATURES (side/state/hand) FROM THE CROP LANDMARKS ===
                ext_feature = get_ext_feature_value(
                    {
                        "nose": nose,
                        "ear_r": ear_r,
                        "ear_l": ear_l,
                        "wrist_r": wrist_r,
                        "wrist_l": wrist_l,
                    }
                )

                # === 7) UPDATE THE SEQUENCE BUFFER FOR THE LSTM ===
                buffer.add(frame_small, current_landmarks, ext_feature)

                # Layout: full frame + history of person crops
                final_layout = build_layout(buffer, frame_disp, cfg)

                # === 8) PREDICT MOTION ONCE THE BUFFER IS FULL ===
                if buffer.is_full():
                    landmark_list = buffer.build_landmark_list()
                    if landmark_list is not None:
                        motion_row = build_motion_feature_vector(buffer, landmark_list)
                        motion_class, motion_conf = predict_motion_class(
                            motion_row,
                            scaler,
                            model_pred,
                            label_encoder,
                            min_conf=0.5,  # tunable
                        )

                        if motion_class is not None:
                            cv2.putText(
                                final_layout,
                                f"Class: {motion_class} ({motion_conf:.2f})",
                                (10, 30),
                                cv2.FONT_HERSHEY_SIMPLEX,
                                1,
                                (0, 0, 255),
                                2,
                            )

                # === 9) DISPLAY ===
                if _show_and_should_quit(final_layout):
                    break
    finally:
        # Always free the capture handle and close windows, even on error.
        cap.release()
        cv2.destroyAllWindows()


# Entry point: runs the pipeline with the module-level ``cfg`` when executed
# as a script. The captured output below this cell shows it also runs when
# the cell is executed in the notebook.
if __name__ == "__main__":
    process_video(cfg)

[YOLO] Loading/Converting PT: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolo12n.pt
[TensorRT] Engine sudah ada, loading: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolo12n.engine
[YOLO] Loading/Converting PT: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-weapon.pt
[TensorRT] Engine sudah ada, loading: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-weapon.engine
[YOLO] Loading/Converting PT: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-face.pt
[TensorRT] Engine sudah ada, loading: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-face.engine
[LSTM] Will load model from: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/trained/lstm_s/lstm_weights_v1.keras


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
I0000 00:00:1763661519.781775  319762 gpu_device.cc:2019] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 5561 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


[LSTM] Model Keras loaded from: /mnt/d/Programming/PycharmProjects/suspicious_detection/model/trained/lstm_s/lstm_weights_v1.keras
[FaceDB] Building face DB from: /mnt/d/Programming/PycharmProjects/suspicious_detection/assets/dataset/face
Loading /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-face.engine for TensorRT inference...
[11/21/2025-01:58:40] [TRT] [I] Loaded engine size: 10 MiB
[11/21/2025-01:58:40] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +13, now: CPU 1, GPU 18 (MiB)
[FaceDB] Encoded nawfal: 9800a5c1-4aa4-4118-82ed-c53a62763ad7.jpg (faces=1)
[FaceDB] Encoded rio: 73ae6ec6-85ed-4e0c-82b3-ad13dfacf6f8.jpg (faces=1)
[FaceDB] Encoded rusdi: 11211005_Foto_Ahmad Rusdianto Andarina Syakbani Square.jpg (faces=1)
[FaceDB] Total encodings: 3
Loading /mnt/d/Programming/PycharmProjects/suspicious_detection/model/yolo/yolov12n-weapon.engine for TensorRT inference...
[11/21/2025-01:58:42] [TRT] [I] The 

I0000 00:00:1763661522.652218  319762 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1763661522.674707  320005 gl_context.cc:369] GL version: 3.1 (OpenGL ES 3.1 Mesa 23.2.1-1ubuntu3.1~22.04.3), renderer: D3D12 (NVIDIA GeForce RTX 4060 Laptop GPU)
INFO: Created TensorFlow Lite XNNPACK delegate for CPU.
W0000 00:00:1763661522.748120  319968 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763661522.785291  319980 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763661522.791854  319971 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1763661522.791992  319975 inference_feedback_manager.cc:114] Feedback manager requires a model with a s

[11/21/2025-01:58:42] [TRT] [I] The logger passed into createInferRuntime differs from one already provided for an existing builder, runtime, or refitter. Uses of the global logger, returned by nvinfer1::getLogger(), will return the existing value.
[11/21/2025-01:58:42] [TRT] [I] Loaded engine size: 10 MiB
[11/21/2025-01:58:42] [TRT] [I] [MemUsageChange] TensorRT-managed allocation in IExecutionContext creation: CPU +1, GPU +13, now: CPU 3, GPU 53 (MiB)


W0000 00:00:1763661523.125280  319990 landmark_projection_calculator.cc:186] Using NORM_RECT without IMAGE_DIMENSIONS is only supported for the square ROI. Provide IMAGE_DIMENSIONS or use PROJECTION_MATRIX.
I0000 00:00:1763661524.461780  319911 service.cc:152] XLA service 0x7f3648003770 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1763661524.461804  319911 service.cc:160]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Laptop GPU, Compute Capability 8.9
2025-11-21 01:58:44.470548: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1763661524.509823  319911 cuda_dnn.cc:529] Loaded cuDNN version 90600
I0000 00:00:1763661524.647592  319911 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.
