In [1]:
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2
import tensorflow as tf

from ultralytics import YOLO
import numpy as np
import cv2
import time
import pandas as pd
import csv
import os
import pickle
import warnings

# --- Configuration ---
FIXED_SIZE = 640
SMALL_SIZE = 150, 150
NUM_FRAMES = 15
model_yolo = YOLO("../../model/yolo/yolo12n.pt") 

with open('../../model/trained/lstm_s/lstm_model_v0.pkl', 'rb') as f:
    artifacts = pickle.load(f)

scaler = artifacts['scaler']
label_encoder = artifacts['label_encoder']
keras_model_path = artifacts['model_filename'] # e.g., 'single_lstm_weights.keras'
NUM_CLASSES = artifacts['num_classes']

try:
    model_pred = tf.keras.models.load_model(f'../../model/trained/lstm_s/{keras_model_path}')
    print("LSTM Model (Keras) loaded successfully.")
except Exception as e:
    print(f"Error loading Keras model: {e}")
    model_pred = None

mp_drawing = mp.solutions.drawing_utils # Drawing helpers
mp_holistic = mp.solutions.holistic # Mediapipe Solutions

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

2025-10-25 11:32:00.890291: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
I0000 00:00:1761363271.997584   25609 gpu_device.cc:2020] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 3539 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4050 Laptop GPU, pci bus id: 0000:01:00.0, compute capability: 8.9


LSTM Model (Keras) loaded successfully.


In [3]:
def cropped_frame(frame):
    results = model_yolo(frame, classes=[0], verbose=False)
    boxes = results[0].boxes
    plotted_frame = results[0].plot() 

    try:
        if len(boxes) > 0:
            # Get the bounding box coordinates for the first detected object
            x1, y1, x2, y2 = boxes.xyxy[0].cpu().numpy().astype(int)

            cropped_frame = frame[y1:y2, x1:x2]
            crop_h, crop_w = cropped_frame.shape[:2]

            # We want to fit the largest dimension (width or height) to the FIXED_SIZE
            scale = FIXED_SIZE / max(crop_w, crop_h)
            new_w = int(crop_w * scale)
            new_h = int(crop_h * scale)

            # Resize the cropped frame to the new dimensions
            resized_img = cv2.resize(cropped_frame, (new_w, new_h), interpolation=cv2.INTER_AREA)

            # Background frame, that are not filled with boxes
            final_frame = np.full((FIXED_SIZE, FIXED_SIZE, 3), 255, dtype=np.uint8)
            
            # dw and dh are the space left over after placing the image
            dw = FIXED_SIZE - new_w
            dh = FIXED_SIZE - new_h

            # Calculate the starting position (top-left corner) for centering
            top = dh // 2
            bottom = top + new_h
            left = dw // 2
            right = left + new_w

            final_frame[top:bottom, left:right] = resized_img

            return final_frame

        else:
            print("No objects detected in the image.")
    except Exception as e:
        print(f"Error processing frame: {e}")

In [5]:
# cap = cv2.VideoCapture('http://192.168.100.197:5000/video')
cap = cv2.VideoCapture(f'../../assets/test/test_celinguk_2.mp4')

frames = []
detected = []

undetectable_image = None
final_layout = None 

saved = 0

if not cap.isOpened():
    print("Error opening video file")
else:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            start_time = time.time()
            ret, frame = cap.read()
            if ret:
                frame_cropped = cropped_frame(frame)
                
                # FIX 1: Skip if initial cropping failed (frame_cropped is None)
                if frame_cropped is None:
                    print("Skipping frame: cropped_frame returned None.")
                    continue

                # Prepare the current frame for side display (resized) and an empty list for detection
                current_small_frame = cv2.resize(frame_cropped, (SMALL_SIZE))
                current_detection_list = []
                detection_successful = False

                try:
                    # Attempt Mediapipe detection
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_BGR2RGB)
                    frames_mp.flags.writeable = False
                    
                    results = holistic.process(frames_mp)
                    frames_mp.flags.writeable = True
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2BGR) 

                    # Extract landmarks if available
                    if results.pose_landmarks:
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.NOSE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_WRIST])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_WRIST])
                        detection_successful = True
                    else:
                        raise ValueError("No pose landmarks detected.")
                        
                except Exception as e:
                    # Detection failed (Mediapipe error or no landmarks found)
                    error_text = f"No detection: {e}"
                    cv2.putText(frame_cropped, error_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    
                if detection_successful:
                    # This block is now ONLY executed if detection_successful is True
                    if len(frames) < NUM_FRAMES:
                        frames.append(current_small_frame)
                        detected.append(current_detection_list)
                    else:
                        frames.pop(0)
                        detected.pop(0)
                        frames.append(current_small_frame)
                        detected.append(current_detection_list)
                else:
                    print(f"Skipping frame {time.time() - start_time}ms: No pose landmarks detected.")
                    pass
                
                # --- LAYOUT AND DISPLAY LOGIC ---
                if len(frames) == NUM_FRAMES:
                    stacked1_frames = np.vstack((frames[0], frames[1], frames[2]))
                    stacked2_frames = np.vstack((frames[3], frames[4], frames[5]))
                    stacked3_frames = np.vstack((frames[6], frames[7], frames[8]))
                    stacked4_frames = np.vstack((frames[9], frames[10], frames[11]))
                    stacked5_frames = np.vstack((frames[12], frames[13], frames[14]))

                    stacked1_frames = cv2.resize(stacked1_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked2_frames = cv2.resize(stacked2_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked3_frames = cv2.resize(stacked3_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked4_frames = cv2.resize(stacked4_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked5_frames = cv2.resize(stacked5_frames, (SMALL_SIZE[0], FIXED_SIZE))

                    # All components are guaranteed to be valid images here
                    final_layout = np.hstack((frame_cropped, stacked1_frames, stacked2_frames, stacked3_frames, stacked4_frames, stacked5_frames))
                else:
                    final_layout = frame_cropped
                
                if final_layout is not None and final_layout.size > 0:
                    flat_detected = [landmark for sublist in detected for landmark in sublist]
                    # print(len(flat_detected)) # Should always be 5 * NUM_FRAMES when full

                    if len(flat_detected) > 0:
                        landmark_list = landmark_pb2.NormalizedLandmarkList()
                        landmark_list.landmark.extend(flat_detected)
                    
                    try:
                        motion_row = []
                        
                        for lv in landmark_list.landmark:
                            motion_row.extend([lv.x, lv.y, lv.z, lv.visibility])
                        
                        X_data = np.array(motion_row).reshape(1, -1) # Shape (1, 300)
                        X_scaled = scaler.transform(X_data)
                        X_reshaped = X_scaled.reshape(1, 1, X_scaled.shape[1])

                        try:
                            # 3. Predict the probabilities (for classification)
                            y_pred_probs = model_pred.predict(X_reshaped, verbose=0)
                            # 4. Get the predicted class index
                            y_pred_index = np.argmax(y_pred_probs, axis=1)[0]
                            # 5. Decode the index to the class name
                            motion_class = label_encoder.inverse_transform([y_pred_index])[0]

                            warnings.filterwarnings("ignore", category=UserWarning, module="sklearn")
                            cv2.putText(final_layout, f'Class: {motion_class}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                            
                        except Exception as e:
                            error_text = f"Prediction Error: {e}"
                            cv2.putText(final_layout, error_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)

                        # motion_row = list(np.array(motion_row))
                        # motion_detected = pd.DataFrame([motion_row])
                        # motion_class = model_pred.predict(motion_detected)[0]

                        cv2.imshow('Video', final_layout)

                        key = cv2.waitKey(1) & 0xFF

                        if key == ord('q'):
                            break
                    
                            
                    except Exception as e:
                        print(f"Error displaying frame: {e}")
                
            else:
                break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1761363712.117042   25609 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1761363712.130887   33521 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: llvmpipe (LLVM 20.1.2, 256 bits)
W0000 00:00:1761363712.490685   33511 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761363712.667690   33512 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761363712.684346   33513 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761363712.686697   33512 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W

Error displaying frame: X has 20 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 40 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 60 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 80 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 100 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 120 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 140 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 160 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 180 features, but StandardScaler is expecting 300 features as input.
Error displaying frame: X has 200 features, but StandardScaler is expecting 300 features as input.
Error displayi