**Video Annotation Pipeline Using MediaPipe and SVM**

In [1]:
!pip install mediapipe==0.10.14

Collecting mediapipe==0.10.14
  Downloading mediapipe-0.10.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.7 kB)
Collecting protobuf<5,>=4.25.3 (from mediapipe==0.10.14)
  Downloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)
Collecting sounddevice>=0.4.4 (from mediapipe==0.10.14)
  Downloading sounddevice-0.5.5-py3-none-any.whl.metadata (1.4 kB)
Downloading mediapipe-0.10.14-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (35.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m35.7/35.7 MB[0m [31m24.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading protobuf-4.25.8-cp37-abi3-manylinux2014_x86_64.whl (294 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m294.9/294.9 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sounddevice-0.5.5-py3-none-any.whl (32 kB)
Installing collected packages: protobuf, sounddevice, mediapipe
  Attempting uninstall: protobuf
    Found existing in

In [2]:
# ── Run mode and file locations ─────────────────────────────────────────────
COLAB = True                              # True → run_colab() (upload / download flow)
INPUT_VIDEO = "input_video.mp4"           # only read when COLAB is False
OUTPUT_VIDEO = "output_annotated.mp4"     # annotated result is written here
MODEL_PATH = "/content/svm_winner.pkl"    # joblib-serialised classifier

# ── MediaPipe Hands settings (passed to mp_hands.Hands) ─────────────────────
MAX_HANDS = 2
MIN_DETECT_CONF = 0.7
MIN_TRACK_CONF = 0.6

# ── Label display and temporal smoothing ────────────────────────────────────
MIN_PROBA_SHOW = 0.3          # only show label when confidence >= this value
SMOOTH_WINDOW = 10            # predictions kept per hand for the majority vote
LABEL_PERSIST_FRAMES = 8      # `persist` argument of PredictionSmoother

# ── Text rendering for gesture labels ───────────────────────────────────────
FONT_SCALE = 0.85
THICKNESS = 2



import subprocess, sys


def _pip(pkg):
    """Quietly install `pkg` with pip, using the interpreter that runs this kernel.

    Raises subprocess.CalledProcessError if pip exits with a non-zero status.
    """
    subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", pkg])


# pip requirement → name of the module it provides. The two differ for OpenCV
# (the distribution "opencv-python-headless" is imported as "cv2"), so the
# module name cannot be derived from the requirement string. MediaPipe is
# pinned to the same version as the install cell at the top of the notebook.
_REQUIRED_PACKAGES = {
    "mediapipe==0.10.14": "mediapipe",
    "opencv-python-headless": "cv2",
}

if COLAB:
    for _pkg, _module in _REQUIRED_PACKAGES.items():
        try:
            __import__(_module)
        except ImportError:
            # Only install what is genuinely missing.
            _pip(_pkg)

import cv2
import mediapipe as mp
import numpy as np
import joblib
import os
import time
from collections import deque, Counter


# Short aliases for the two MediaPipe solution modules used below:
# drawing helpers for the overlay, and the hand-landmark detector itself.
mp_drawing = mp.solutions.drawing_utils
mp_hands = mp.solutions.hands

def _load_joblib(path, label):
    if path and os.path.exists(path):
        obj = joblib.load(path)
        print(f"  [OK] {label:<20} loaded from '{path}'")
        return obj
    print(f"  [--] {label:<20} NOT found at '{path}'")
    return None

print("\n── Loading model files ──────────────────────────────")

# The classifier is optional: without it the pipeline still draws landmarks,
# it just skips gesture prediction.
model = _load_joblib(MODEL_PATH, "SVM model")
USE_MODEL = model is not None

print(
    "  [OK] Gesture prediction ENABLED  (raw string labels, no LabelEncoder, no scaler)"
    if USE_MODEL
    else "  [--] Gesture prediction DISABLED  (landmark drawing only)"
)
print("─────────────────────────────────────────────────────\n")




class PredictionSmoother:
    """
    Keeps a rolling window of recent (label, proba) predictions per hand.
    Returns the majority-vote label and the average probability of that label.
    Prevents a single noisy frame from flipping the displayed gesture.

    State is stored per hand slot in four parallel lists (`history`,
    `last_label`, `last_proba`, `frames_gone`). Two slots are created up
    front; further slots are added on demand, so any non-negative
    `hand_idx` is valid regardless of how many hands the detector reports.

    Args:
        window:  number of most recent predictions kept per hand.
        persist: number of consecutive get_persisted() calls for which the
                 last label is still returned after the hand's last update().
    """
    def __init__(self, window=10, persist=8):
        self.window      = window
        self.persist     = persist
        self.history     = [deque(maxlen=window), deque(maxlen=window)]
        self.last_label  = [None, None]
        self.last_proba  = [0.0,  0.0]
        self.frames_gone = [0,    0]

    def _ensure_slot(self, hand_idx):
        """Grow the per-hand lists until index `hand_idx` exists."""
        while len(self.history) <= hand_idx:
            self.history.append(deque(maxlen=self.window))
            self.last_label.append(None)
            self.last_proba.append(0.0)
            self.frames_gone.append(0)

    def update(self, hand_idx, label, proba):
        """
        Record a new prediction for `hand_idx` and return the smoothed result.

        Returns:
            (winner, avg_proba): the most frequent label in the window and the
            mean probability of the entries carrying that label.
        """
        self._ensure_slot(hand_idx)
        self.history[hand_idx].append((label, proba))
        self.frames_gone[hand_idx] = 0          # hand is visible again
        labels = [l for l, _ in self.history[hand_idx]]
        winner = Counter(labels).most_common(1)[0][0]
        avg_p  = np.mean([p for l, p in self.history[hand_idx] if l == winner])
        self.last_label[hand_idx] = winner
        self.last_proba[hand_idx] = float(avg_p)
        return winner, float(avg_p)

    def get_persisted(self, hand_idx):
        """
        Advance the "hand missing" counter for `hand_idx` by one and return the
        last smoothed (label, proba) while the counter is still <= `persist`;
        otherwise (or if there is no last label) return (None, 0.0).
        """
        self._ensure_slot(hand_idx)
        self.frames_gone[hand_idx] += 1
        if (self.last_label[hand_idx] is not None
                and self.frames_gone[hand_idx] <= self.persist):
            return self.last_label[hand_idx], self.last_proba[hand_idx]
        return None, 0.0

    def reset_hand(self, hand_idx):
        """Forget everything stored for `hand_idx` (history, label, counters)."""
        self._ensure_slot(hand_idx)
        self.history[hand_idx].clear()
        self.last_label[hand_idx]  = None
        self.last_proba[hand_idx]  = 0.0
        self.frames_gone[hand_idx] = 0

# Single module-level smoother; process_frame() reads and updates it.
smoother = PredictionSmoother(
    window=SMOOTH_WINDOW,
    persist=LABEL_PERSIST_FRAMES,
)





def preprocess_hand(hand_landmarks):
    """
    Turn one detected hand into the flat feature vector fed to the classifier.

    According to the original author's note this mirrors the training-time
    preprocessing in Project_ML.ipynb (Cell 16) — not verifiable from this file.

    Pipeline:
      1. Collect (x, y, z) of every entry in `hand_landmarks.landmark`
         (21 landmarks expected → shape (21, 3)).
      2. Recenter x and y on landmark 0 (the wrist), which therefore becomes (0, 0).
      3. Divide x and y by the distance from the origin to landmark 12, measured
         on the recentered coordinates.
      4. Flatten to one row. z is passed through untouched — it is neither
         recentered nor scaled, and no StandardScaler is applied.

    Returns:
        float32 array (shape (63,) for 21 landmarks), or None when any
        coordinate is non-finite or the scale distance is below 1e-6.
    """
    coords = np.array(
        [(point.x, point.y, point.z) for point in hand_landmarks.landmark],
        dtype=np.float32,
    )

    # Reject NaN / inf coordinates up front.
    if not np.all(np.isfinite(coords)):
        return None

    # Translate so the wrist sits at the origin (x and y only).
    wrist_xy = coords[0, :2].copy()
    coords[:, :2] -= wrist_xy

    # Hand size proxy: wrist → landmark 12 distance in the x/y plane.
    ref_x = coords[12, 0]
    ref_y = coords[12, 1]
    scale = float(np.sqrt(ref_x ** 2 + ref_y ** 2))
    if scale < 1e-6:
        return None          # degenerate hand, would divide by ~0

    coords[:, :2] /= scale
    return coords.flatten().astype(np.float32)



def predict_raw(features):
    """
    Feed the feature vector directly into the module-level `model`.

    `model.predict()` is expected to return the gesture name itself (the
    status message printed at load time says the model uses raw string
    labels, with no LabelEncoder and no StandardScaler).

    Confidence is estimated, in order of preference, from:
      * `predict_proba` — the probability of the *predicted* class (looked up
        through `model.classes_`). The row maximum is only a fallback, because
        for some estimators (e.g. SVC with Platt scaling) the arg-max of the
        probabilities can differ from what `predict` returns.
      * `decision_function` — softmax over the scores; a single binary score
        is mapped through a sigmoid of its magnitude instead, since a softmax
        over one value is always 1.0.
      * otherwise a constant 1.0.

    Args:
        features: 1-D array of features for one hand.

    Returns:
        (label_str, proba_float). On any failure a RuntimeWarning is issued
        and ("Unknown", 0.0) is returned so video processing can continue.
    """
    try:
        x         = features.reshape(1, -1)
        label_str = model.predict(x)[0]

        # Confidence estimate
        if hasattr(model, "predict_proba"):
            probs   = np.asarray(model.predict_proba(x)).ravel()
            classes = list(getattr(model, "classes_", ()))
            if len(classes) == probs.size and label_str in classes:
                proba = float(probs[classes.index(label_str)])
            else:
                proba = float(probs.max())
        elif hasattr(model, "decision_function"):
            scores = np.asarray(model.decision_function(x), dtype=np.float64).ravel()
            if scores.size == 1:
                # Binary case: distance from the boundary → (0.5, 1.0).
                proba = float(1.0 / (1.0 + np.exp(-abs(scores[0]))))
            else:
                exp   = np.exp(scores - scores.max())   # shifted for stability
                proba = float(exp.max() / exp.sum())
        else:
            proba  = 1.0

        return label_str, proba
    except Exception as exc:
        # Best effort: keep the video going, but do not hide the failure.
        import warnings
        warnings.warn(f"Gesture prediction failed: {exc!r}", RuntimeWarning, stacklevel=2)
        return "Unknown", 0.0



# Overlay colour per hand slot, as OpenCV BGR tuples.
# Drawing code picks HAND_COLORS[hand_idx % len(HAND_COLORS)].
HAND_COLORS = [
    (0, 220, 100),  # hand 0 — green
    (0, 165, 255),  # hand 1 — orange
]

def draw_landmarks(frame, hand_landmarks, hand_idx=0):
    """
    Draw one hand's landmarks and connections onto `frame` (modified in place).

    Landmark dots use the colour of the hand's slot in HAND_COLORS (wrapping
    around when `hand_idx` exceeds the palette); connections are light grey.
    """
    slot_colour = HAND_COLORS[hand_idx % len(HAND_COLORS)]
    connections = mp_hands.HAND_CONNECTIONS
    joint_style = mp_drawing.DrawingSpec(color=slot_colour, thickness=2, circle_radius=4)
    bone_style = mp_drawing.DrawingSpec(color=(220, 220, 220), thickness=1, circle_radius=2)
    mp_drawing.draw_landmarks(frame, hand_landmarks, connections, joint_style, bone_style)


def draw_gesture_label(frame, label, proba, hand_landmarks, hand_idx=0, faded=False):
    """
    Write "<label>  <proba as %>" just above the hand's wrist on `frame`.

    The text is drawn twice — a thicker black copy offset by one pixel, then
    the coloured copy — so it stays readable on any background.

    The anchor is clamped so the text stays inside the frame. The horizontal
    limit is derived from the rendered text width (cv2.getTextSize) rather
    than a fixed pixel budget, so long labels do not run off the right edge.

    Args:
        frame:          BGR image, modified in place.
        label:          gesture name to display.
        proba:          confidence in [0, 1], rendered as a whole percentage.
        hand_landmarks: object whose `landmark[0]` is the wrist with
                        normalised `x` / `y` coordinates.
        hand_idx:       selects the colour from HAND_COLORS (wraps around).
        faded:          draw at half brightness when True.
    """
    h, w  = frame.shape[:2]
    wrist = hand_landmarks.landmark[0]
    text  = f"{label}  {proba:.0%}"
    font  = cv2.FONT_HERSHEY_SIMPLEX

    # Measure with the outline thickness, the widest stroke that gets drawn.
    (text_w, _text_h), _baseline = cv2.getTextSize(text, font, FONT_SCALE, THICKNESS + 2)

    margin = 5
    # max(...) keeps lower <= upper even when the frame is smaller than the text.
    max_x = max(margin, w - text_w - margin)
    max_y = max(30, h - 10)
    cx    = int(np.clip(wrist.x * w,      margin, max_x))
    cy    = int(np.clip(wrist.y * h - 25, 30,     max_y))

    color = HAND_COLORS[hand_idx % len(HAND_COLORS)]
    if faded:
        color = tuple(int(c * 0.5) for c in color)

    cv2.putText(frame, text, (cx+1, cy+1),
                font, FONT_SCALE, (0,0,0), THICKNESS+2, cv2.LINE_AA)
    cv2.putText(frame, text, (cx, cy),
                font, FONT_SCALE, color,  THICKNESS,   cv2.LINE_AA)


def draw_hud(frame, frame_num, total_frames, fps, n_hands):
    """
    Stamp a two-line status readout in the top-left corner of `frame`:
    frame counter with processing speed, and the number of hands detected.

    Each line is rendered as a thick black pass followed by a thin white
    pass at the same position, giving outlined white text.
    """
    font = cv2.FONT_HERSHEY_SIMPLEX
    left, first_baseline, line_gap = 10, 30, 28
    passes = (((0, 0, 0), 3), ((255, 255, 255), 1))   # (colour, stroke): outline, then fill

    captions = (
        f"Frame: {frame_num}/{total_frames}   Speed: {fps:5.1f} fps",
        f"Hands detected: {n_hands}",
    )

    baseline = first_baseline
    for caption in captions:
        for colour, stroke in passes:
            cv2.putText(frame, caption, (left, baseline),
                        font, 0.65, colour, stroke, cv2.LINE_AA)
        baseline += line_gap


def draw_progress_bar(frame, frame_num, total_frames):
    """
    Paint an 8-pixel progress strip along the bottom edge of `frame`:
    a dark grey track across the full width with a green fill whose width
    is proportional to frame_num / total_frames (total treated as >= 1).
    """
    height, width = frame.shape[:2]
    strip_top = height - 8
    filled_to = int(width * frame_num / max(total_frames, 1))

    # Track first, fill second, so the fill is drawn on top.
    for right_edge, colour in ((width, (50, 50, 50)), (filled_to, (0, 200, 100))):
        cv2.rectangle(frame, (0, strip_top), (right_edge, height), colour, -1)



def process_frame(frame, hands_model, prev_n_hands):
    """
    Detect hands in one BGR frame and annotate it in place.

    For every detected hand the landmarks are drawn; when a classifier is
    loaded (USE_MODEL) the hand is also classified, the prediction is passed
    through the module-level `smoother`, and the smoothed label is drawn if
    its probability reaches MIN_PROBA_SHOW.

    Every smoother slot that has no detection in this frame is aged by one
    step on every call, and cleared once its persistence budget is used up.
    This keeps stale predictions from a hand that left the picture from
    influencing the majority vote when a hand shows up in that slot again.

    Args:
        frame:        BGR image (as read by cv2.VideoCapture); modified in place.
        hands_model:  object with `process(rgb)` returning a result that has
                      `multi_hand_landmarks` (here an `mp_hands.Hands`).
        prev_n_hands: number of hands in the previous frame. Accepted for
                      backward compatibility; no longer consulted, because
                      missing slots are derived from the smoother itself.

    Returns:
        (frame, n_hands): the annotated frame and the number of hands found.
    """
    rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    # Marked read-only around the detector call, restored afterwards.
    rgb.flags.writeable = False
    results = hands_model.process(rgb)
    rgb.flags.writeable = True

    detected = results.multi_hand_landmarks or []
    n_hands  = len(detected)

    for hand_idx, hand_landmarks in enumerate(detected):
        draw_landmarks(frame, hand_landmarks, hand_idx)

        if not USE_MODEL:
            continue

        features = preprocess_hand(hand_landmarks)
        if features is None:
            continue

        raw_label, raw_proba = predict_raw(features)
        stable_label, stable_proba = smoother.update(
            hand_idx, raw_label, raw_proba
        )
        if stable_proba >= MIN_PROBA_SHOW:
            draw_gesture_label(
                frame, stable_label, stable_proba,
                hand_landmarks, hand_idx, faded=False
            )

    if USE_MODEL:
        # Age every slot without a detection this frame. No label is drawn for
        # a missing hand (there are no landmarks to anchor it to); the point
        # is to expire the slot's history after `persist` missing frames.
        for hand_idx in range(n_hands, len(smoother.history)):
            if smoother.last_label[hand_idx] is None:
                continue                      # nothing stored for this slot
            p_label, _ = smoother.get_persisted(hand_idx)
            if p_label is None:
                smoother.reset_hand(hand_idx)

    return frame, n_hands



def process_video(input_path, output_path):
    """
    Annotate every frame of `input_path` and write the result to `output_path`.

    Each frame goes through process_frame() (landmarks + gesture label), then
    gets the HUD and progress bar, and is written with the "mp4v" codec at the
    input's frame rate and size. Progress is printed on a single console line
    every 10 frames.

    Robustness notes:
      * The capture and the writer are released in a `finally` block, so they
        are not leaked when opening the writer or processing a frame fails.
      * Some containers report no usable frame count; percentage / ETA are
        then skipped instead of dividing by zero.
      * A missing, zero, negative or NaN frame rate falls back to 30 fps.
      * The shared `smoother` is cleared first so a previous run in the same
        kernel cannot influence this video's labels.

    Args:
        input_path:  path of the video to read.
        output_path: path of the annotated video to create.

    Returns:
        `output_path`.

    Raises:
        FileNotFoundError: `input_path` does not exist.
        RuntimeError:      the input cannot be opened or the writer cannot be created.
    """
    if not os.path.exists(input_path):
        raise FileNotFoundError(
            f"Input video not found: '{input_path}'\n"
            "Upload your video and set INPUT_VIDEO correctly."
        )

    cap    = cv2.VideoCapture(input_path)
    writer = None
    try:
        if not cap.isOpened():
            raise RuntimeError(f"Cannot open video: '{input_path}'")

        total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        fps          = cap.get(cv2.CAP_PROP_FPS)
        if not fps > 0:                     # catches 0, negatives and NaN
            fps = 30.0
        fw           = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
        fh           = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

        frames_known  = total_frames > 0
        frames_text   = str(total_frames) if frames_known else "unknown"
        duration_text = f"{total_frames/fps:.1f} seconds" if frames_known else "unknown"

        print(f"── Input video ──────────────────────────────────────")
        print(f"  File       : {input_path}")
        print(f"  Size       : {fw} × {fh}")
        print(f"  FPS        : {fps:.2f}")
        print(f"  Frames     : {frames_text}")
        print(f"  Duration   : {duration_text}")
        print(f"  Output     : {output_path}")
        print(f"  Smoothing  : majority vote over last {SMOOTH_WINDOW} frames")
        print(f"  Min conf   : {MIN_PROBA_SHOW:.0%} to show label")
        print(f"─────────────────────────────────────────────────────\n")
        print("Processing ...\n")

        fourcc = cv2.VideoWriter_fourcc(*"mp4v")
        writer = cv2.VideoWriter(output_path, fourcc, fps, (fw, fh))
        if not writer.isOpened():
            raise RuntimeError(f"VideoWriter failed for '{output_path}'")

        # Start from a clean slate: the smoother is module-level state.
        for slot in range(len(smoother.history)):
            smoother.reset_hand(slot)

        frame_num    = 0
        hands_total  = 0
        prev_n_hands = 0
        t_start      = time.time()

        with mp_hands.Hands(
            static_image_mode        = False,
            max_num_hands            = MAX_HANDS,
            min_detection_confidence = MIN_DETECT_CONF,
            min_tracking_confidence  = MIN_TRACK_CONF,
        ) as hands:

            while True:
                ret, frame = cap.read()
                if not ret:
                    break

                frame_num  += 1
                frame, n_hands = process_frame(frame, hands, prev_n_hands)
                hands_total   += n_hands
                prev_n_hands   = n_hands

                elapsed = time.time() - t_start
                fps_now = frame_num / max(elapsed, 1e-9)

                draw_hud(frame, frame_num, total_frames, fps_now, n_hands)
                draw_progress_bar(frame, frame_num, total_frames)
                writer.write(frame)

                if frame_num % 10 == 0 or frame_num == total_frames:
                    if frames_known:
                        pct      = frame_num / total_frames * 100
                        eta_s    = max(total_frames - frame_num, 0) / max(fps_now, 1e-9)
                        # Clamp: the reported frame count can be an underestimate.
                        bar_done = min(int(pct / 5), 20)
                        bar      = "█" * bar_done + "░" * (20 - bar_done)
                        print(
                            f"  [{bar}]  {pct:5.1f}%   "
                            f"frame {frame_num}/{total_frames}   "
                            f"ETA {eta_s:.0f}s",
                            end="\r", flush=True
                        )
                    else:
                        print(
                            f"  frame {frame_num}   {fps_now:5.1f} fps",
                            end="\r", flush=True
                        )
    finally:
        cap.release()
        if writer is not None:
            writer.release()

    elapsed_total = time.time() - t_start
    print(f"\n\n── Done ─────────────────────────────────────────────")
    print(f"  Frames processed : {frame_num}")
    print(f"  Total hands seen : {hands_total}")
    print(f"  Time taken       : {elapsed_total:.1f} seconds")
    print(f"  Output saved to  : '{output_path}'")
    print(f"─────────────────────────────────────────────────────\n")
    return output_path



def run_colab():
    """
    Interactive Colab flow: ask for a video upload, annotate it with
    process_video(), then trigger a browser download of the result.

    Only the first uploaded file is processed. Returns None; if nothing
    was uploaded a notice is printed and the function stops early.
    """
    # Imported here because google.colab only exists inside Colab.
    from google.colab import files

    print("Step 1: Upload your video file ...")
    uploaded = files.upload()
    if not uploaded:
        print("[!] No file uploaded.")
        return

    # The upload mapping is keyed by filename; take the first key.
    input_path = next(iter(uploaded))
    print(f"\n[OK] Received: '{input_path}'\n")

    annotated = process_video(input_path, OUTPUT_VIDEO)

    print("Step 3: Downloading annotated video ...")
    files.download(annotated)
    print(f"[OK] '{annotated}' downloaded.")


# Entry point: local file-to-file run unless COLAB is set, in which case the
# upload → process → download flow is used instead.
if __name__ == "__main__":
    if not COLAB:
        process_video(INPUT_VIDEO, OUTPUT_VIDEO)
    else:
        run_colab()


── Loading model files ──────────────────────────────
  [OK] SVM model            loaded from '/content/svm_winner.pkl'
  [OK] Gesture prediction ENABLED  (raw string labels, no LabelEncoder, no scaler)
─────────────────────────────────────────────────────

Step 1: Upload your video file ...


Saving video2.mp4 to video2.mp4

[OK] Received: 'video2.mp4'

── Input video ──────────────────────────────────────
  File       : video2.mp4
  Size       : 1920 × 1080
  FPS        : 14.67
  Frames     : 779
  Duration   : 53.1 seconds
  Output     : output_annotated.mp4
  Smoothing  : majority vote over last 10 frames
  Min conf   : 30% to show label
─────────────────────────────────────────────────────

Processing ...





  [░░░░░░░░░░░░░░░░░░░░]    1.3%   frame 10/779   ETA 102s







  [█░░░░░░░░░░░░░░░░░░░]    5.1%   frame 40/779   ETA 73s



  [█░░░░░░░░░░░░░░░░░░░]    6.4%   frame 50/779   ETA 73s



  [█░░░░░░░░░░░░░░░░░░░]    7.7%   frame 60/779   ETA 73s



  [█░░░░░░░░░░░░░░░░░░░]    9.0%   frame 70/779   ETA 73s



  [██░░░░░░░░░░░░░░░░░░]   10.3%   frame 80/779   ETA 72s



  [██░░░░░░░░░░░░░░░░░░]   11.6%   frame 90/779   ETA 74s



  [██░░░░░░░░░░░░░░░░░░]   12.8%   frame 100/779   ETA 77s



  [██░░░░░░░░░░░░░░░░░░]   14.1%   frame 110/779   ETA 78s



  [███░░░░░░░░░░░░░░░░░]   15.4%   frame 120/779   ETA 77s



  [███░░░░░░░░░░░░░░░░░]   16.7%   frame 130/779   ETA 75s



  [███░░░░░░░░░░░░░░░░░]   18.0%   frame 140/779   ETA 74s



  [███░░░░░░░░░░░░░░░░░]   19.3%   frame 150/779   ETA 72s



  [████░░░░░░░░░░░░░░░░]   20.5%   frame 160/779   ETA 71s



  [████░░░░░░░░░░░░░░░░]   21.8%   frame 170/779   ETA 70s



  [████░░░░░░░░░░░░░░░░]   23.1%   frame 180/779   ETA 68s



  [████░░░░░░░░░░░░░░░░]   24.4%   frame 190/779   ETA 67s



  [█████░░░░░░░░░░░░░░░]   25.7%   frame 200/779   ETA 66s



  [█████░░░░░░░░░░░░░░░]   27.0%   frame 210/779   ETA 66s



  [█████░░░░░░░░░░░░░░░]   28.2%   frame 220/779   ETA 66s



  [█████░░░░░░░░░░░░░░░]   29.5%   frame 230/779   ETA 65s



  [██████░░░░░░░░░░░░░░]   30.8%   frame 240/779   ETA 64s



  [██████░░░░░░░░░░░░░░]   32.1%   frame 250/779   ETA 62s



  [██████░░░░░░░░░░░░░░]   33.4%   frame 260/779   ETA 61s



  [██████░░░░░░░░░░░░░░]   34.7%   frame 270/779   ETA 60s



  [███████░░░░░░░░░░░░░]   35.9%   frame 280/779   ETA 58s



  [███████░░░░░░░░░░░░░]   37.2%   frame 290/779   ETA 57s



  [███████░░░░░░░░░░░░░]   38.5%   frame 300/779   ETA 56s



  [███████░░░░░░░░░░░░░]   39.8%   frame 310/779   ETA 54s



  [████████░░░░░░░░░░░░]   41.1%   frame 320/779   ETA 54s



  [████████░░░░░░░░░░░░]   42.4%   frame 330/779   ETA 53s



  [████████░░░░░░░░░░░░]   43.6%   frame 340/779   ETA 52s



  [████████░░░░░░░░░░░░]   44.9%   frame 350/779   ETA 51s



  [█████████░░░░░░░░░░░]   46.2%   frame 360/779   ETA 50s



  [█████████░░░░░░░░░░░]   47.5%   frame 370/779   ETA 48s



  [█████████░░░░░░░░░░░]   48.8%   frame 380/779   ETA 47s



  [██████████░░░░░░░░░░]   50.1%   frame 390/779   ETA 46s



  [██████████░░░░░░░░░░]   51.3%   frame 400/779   ETA 45s



  [██████████░░░░░░░░░░]   52.6%   frame 410/779   ETA 43s



  [██████████░░░░░░░░░░]   53.9%   frame 420/779   ETA 42s



  [███████████░░░░░░░░░]   55.2%   frame 430/779   ETA 41s



  [███████████░░░░░░░░░]   56.5%   frame 440/779   ETA 40s



  [███████████░░░░░░░░░]   57.8%   frame 450/779   ETA 39s



  [███████████░░░░░░░░░]   59.1%   frame 460/779   ETA 38s



  [████████████░░░░░░░░]   60.3%   frame 470/779   ETA 37s



  [████████████░░░░░░░░]   61.6%   frame 480/779   ETA 36s



  [████████████░░░░░░░░]   62.9%   frame 490/779   ETA 34s



  [████████████░░░░░░░░]   64.2%   frame 500/779   ETA 33s



  [█████████████░░░░░░░]   65.5%   frame 510/779   ETA 32s



  [█████████████░░░░░░░]   66.8%   frame 520/779   ETA 31s



  [█████████████░░░░░░░]   68.0%   frame 530/779   ETA 29s



  [█████████████░░░░░░░]   69.3%   frame 540/779   ETA 28s



  [██████████████░░░░░░]   70.6%   frame 550/779   ETA 27s



  [██████████████░░░░░░]   71.9%   frame 560/779   ETA 26s



  [██████████████░░░░░░]   73.2%   frame 570/779   ETA 25s



  [██████████████░░░░░░]   74.5%   frame 580/779   ETA 24s



  [███████████████░░░░░]   75.7%   frame 590/779   ETA 23s



  [███████████████░░░░░]   77.0%   frame 600/779   ETA 21s



  [███████████████░░░░░]   78.3%   frame 610/779   ETA 20s



  [███████████████░░░░░]   79.6%   frame 620/779   ETA 19s



  [████████████████░░░░]   80.9%   frame 630/779   ETA 18s



  [████████████████░░░░]   82.2%   frame 640/779   ETA 17s



  [████████████████░░░░]   83.4%   frame 650/779   ETA 15s



  [████████████████░░░░]   84.7%   frame 660/779   ETA 14s



  [█████████████████░░░]   86.0%   frame 670/779   ETA 13s



  [█████████████████░░░]   87.3%   frame 680/779   ETA 12s



  [█████████████████░░░]   88.6%   frame 690/779   ETA 11s



  [█████████████████░░░]   89.9%   frame 700/779   ETA 9s



  [██████████████████░░]   91.1%   frame 710/779   ETA 8s



  [██████████████████░░]   92.4%   frame 720/779   ETA 7s



  [██████████████████░░]   93.7%   frame 730/779   ETA 6s



  [██████████████████░░]   95.0%   frame 740/779   ETA 5s



  [███████████████████░]   96.3%   frame 750/779   ETA 3s



  [███████████████████░]   97.6%   frame 760/779   ETA 2s



  [███████████████████░]   98.8%   frame 770/779   ETA 1s



  [████████████████████]  100.0%   frame 779/779   ETA 0s

── Done ─────────────────────────────────────────────
  Frames processed : 779
  Total hands seen : 765
  Time taken       : 93.6 seconds
  Output saved to  : 'output_annotated.mp4'
─────────────────────────────────────────────────────

Step 3: Downloading annotated video ...




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

[OK] 'output_annotated.mp4' downloaded.
