In [1]:
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2

from ultralytics import YOLO
import numpy as np
import cv2
import time
import csv
import os

# --- Configuration ---
FIXED_SIZE = 640
SMALL_SIZE = 150, 150
NUM_FRAMES = 15
model = YOLO("../../model/yolo/yolo12n.pt") 

mp_drawing = mp.solutions.drawing_utils # Drawing helpers
mp_holistic = mp.solutions.holistic # Mediapipe Solutions

In [22]:
def cropped_frame(frame):
    results = model(frame, classes=[0], verbose=False)
    boxes = results[0].boxes
    plotted_frame = results[0].plot() 

    try:
        if len(boxes) > 0:
            # Get the bounding box coordinates for the first detected object
            x1, y1, x2, y2 = boxes.xyxy[0].cpu().numpy().astype(int)

            cropped_frame = frame[y1:y2, x1:x2]
            crop_h, crop_w = cropped_frame.shape[:2]

            # We want to fit the largest dimension (width or height) to the FIXED_SIZE
            scale = FIXED_SIZE / max(crop_w, crop_h)
            new_w = int(crop_w * scale)
            new_h = int(crop_h * scale)

            # Resize the cropped frame to the new dimensions
            resized_img = cv2.resize(cropped_frame, (new_w, new_h), interpolation=cv2.INTER_AREA)

            # Background frame, that are not filled with boxes
            final_frame = np.full((FIXED_SIZE, FIXED_SIZE, 3), 255, dtype=np.uint8)
            
            # dw and dh are the space left over after placing the image
            dw = FIXED_SIZE - new_w
            dh = FIXED_SIZE - new_h

            # Calculate the starting position (top-left corner) for centering
            top = dh // 2
            bottom = top + new_h
            left = dw // 2
            right = left + new_w

            final_frame[top:bottom, left:right] = resized_img

            return final_frame

        else:
            print("No objects detected in the image.")
    except Exception as e:
        print(f"Error processing frame: {e}")

In [29]:
cap = cv2.VideoCapture('../../assets/dump/cctv.mp4')

frames = []
detected = []
undetectable_image = None
final_layout = None 

if not cap.isOpened():
    print("Error opening video file")
else:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            start_time = time.time()
            ret, frame = cap.read()
            if ret:
                frame_cropped = cropped_frame(frame)
                
                # FIX 1: Skip if initial cropping failed (frame_cropped is None)
                if frame_cropped is None:
                    print("Skipping frame: cropped_frame returned None.")
                    continue

                # Prepare the current frame for side display (resized) and an empty list for detection
                current_small_frame = cv2.resize(frame_cropped, (SMALL_SIZE))
                current_detection_list = []
                detection_successful = False

                try:
                    # Attempt Mediapipe detection
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_BGR2RGB)
                    frames_mp.flags.writeable = False
                    
                    results = holistic.process(frames_mp)
                    frames_mp.flags.writeable = True
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2BGR) 

                    # Extract landmarks if available
                    if results.pose_landmarks:
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.NOSE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_WRIST])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_WRIST])
                        detection_successful = True
                    else:
                        raise ValueError("No pose landmarks detected.")
                        
                except Exception as e:
                    # Detection failed (Mediapipe error or no landmarks found)
                    error_text = f"No detection: {e}"
                    cv2.putText(frame_cropped, error_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    
                    
                    placeholder_landmark = landmark_pb2.NormalizedLandmark(x=0.0, y=0.0, z=0.0, visibility=0.0)
                    current_detection_list = [placeholder_landmark] * 5 
                    detection_successful = True # Treat as 'successful' failure to maintain list length

                # FIX 3: Always manage the lists (frames and detected) here, regardless of detection success
                if len(frames) < NUM_FRAMES:
                    frames.append(current_small_frame)
                    detected.append(current_detection_list)
                else:
                    frames.pop(0)
                    detected.pop(0)
                    frames.append(current_small_frame)
                    detected.append(current_detection_list)
                
                # --- LAYOUT AND DISPLAY LOGIC ---
                if len(frames) == NUM_FRAMES:
                    stacked1_frames = np.vstack((frames[0], frames[1], frames[2]))
                    stacked2_frames = np.vstack((frames[3], frames[4], frames[5]))
                    stacked3_frames = np.vstack((frames[6], frames[7], frames[8]))
                    stacked4_frames = np.vstack((frames[9], frames[10], frames[11]))
                    stacked5_frames = np.vstack((frames[12], frames[13], frames[14]))

                    stacked1_frames = cv2.resize(stacked1_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked2_frames = cv2.resize(stacked2_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked3_frames = cv2.resize(stacked3_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked4_frames = cv2.resize(stacked4_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked5_frames = cv2.resize(stacked5_frames, (SMALL_SIZE[0], FIXED_SIZE))

                    # All components are guaranteed to be valid images here
                    final_layout = np.hstack((frame_cropped, stacked1_frames, stacked2_frames, stacked3_frames, stacked4_frames, stacked5_frames))
                else:
                    final_layout = frame_cropped
                
                
                if final_layout is not None and final_layout.size > 0:
                    flat_detected = [landmark for sublist in detected for landmark in sublist]
                    print(len(flat_detected)) # Should always be 5 * NUM_FRAMES when full

                    if len(flat_detected) > 0:
                        landmark_list = landmark_pb2.NormalizedLandmarkList()
                        landmark_list.landmark.extend(flat_detected)
                    
                    try:
                        cv2.imshow('Video', final_layout)

                        if cv2.waitKey(1) & 0xFF == ord('q'):
                            break
                    except Exception as e:
                        print(f"Error displaying frame: {e}")
                
            else:
                break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1761051551.596179   15900 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1761051551.604448   85001 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: llvmpipe (LLVM 20.1.2, 256 bits)
W0000 00:00:1761051551.916549   84991 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051552.156447   84989 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051552.170983   84990 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051552.177296   84996 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W

5
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
10
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
15
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping frame: cropped_frame returned None.
No objects detected in the image.
Skipping f

In [5]:
used_coords = len(landmark_list.landmark)
used_coords
frames = 1

landmarks = ['class']
for val in range(1, used_coords+1):
    landmarks += ['x{}'.format(val), 'y{}'.format(val), 'z{}'.format(val), 'v{}'.format(val)]
    # if val % 5 == 0:
    #     landmarks += ['face_d{}'.format(frames), 'face_s{}'.format(frames), 'hand_s{}'.format(frames)]
    frames += 1


landmarks

['class',
 'x1',
 'y1',
 'z1',
 'v1',
 'x2',
 'y2',
 'z2',
 'v2',
 'x3',
 'y3',
 'z3',
 'v3',
 'x4',
 'y4',
 'z4',
 'v4',
 'x5',
 'y5',
 'z5',
 'v5',
 'x6',
 'y6',
 'z6',
 'v6',
 'x7',
 'y7',
 'z7',
 'v7',
 'x8',
 'y8',
 'z8',
 'v8',
 'x9',
 'y9',
 'z9',
 'v9',
 'x10',
 'y10',
 'z10',
 'v10',
 'x11',
 'y11',
 'z11',
 'v11',
 'x12',
 'y12',
 'z12',
 'v12',
 'x13',
 'y13',
 'z13',
 'v13',
 'x14',
 'y14',
 'z14',
 'v14',
 'x15',
 'y15',
 'z15',
 'v15',
 'x16',
 'y16',
 'z16',
 'v16',
 'x17',
 'y17',
 'z17',
 'v17',
 'x18',
 'y18',
 'z18',
 'v18',
 'x19',
 'y19',
 'z19',
 'v19',
 'x20',
 'y20',
 'z20',
 'v20',
 'x21',
 'y21',
 'z21',
 'v21',
 'x22',
 'y22',
 'z22',
 'v22',
 'x23',
 'y23',
 'z23',
 'v23',
 'x24',
 'y24',
 'z24',
 'v24',
 'x25',
 'y25',
 'z25',
 'v25',
 'x26',
 'y26',
 'z26',
 'v26',
 'x27',
 'y27',
 'z27',
 'v27',
 'x28',
 'y28',
 'z28',
 'v28',
 'x29',
 'y29',
 'z29',
 'v29',
 'x30',
 'y30',
 'z30',
 'v30',
 'x31',
 'y31',
 'z31',
 'v31',
 'x32',
 'y32',
 'z32',
 'v32',
 '

# Create Template Dataset

In [18]:
file_csv = '../../dataset/csv/dataset_v1.csv'
os.makedirs(os.path.dirname(file_csv), exist_ok=True)

with open(file_csv, mode='w', newline='') as f:
    csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    csv_writer.writerow(landmarks)

# Capture Dataset

In [19]:
# Setup

class_name = 'celinguk'

In [32]:
cap = cv2.VideoCapture('http://192.168.100.197:5000/video')

frames = []
detected = []

undetectable_image = None
final_layout = None 

saved = 0

if not cap.isOpened():
    print("Error opening video file")
else:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            start_time = time.time()
            ret, frame = cap.read()
            if ret:
                frame_cropped = cropped_frame(frame)
                
                # FIX 1: Skip if initial cropping failed (frame_cropped is None)
                if frame_cropped is None:
                    print("Skipping frame: cropped_frame returned None.")
                    continue

                # Prepare the current frame for side display (resized) and an empty list for detection
                current_small_frame = cv2.resize(frame_cropped, (SMALL_SIZE))
                current_detection_list = []
                detection_successful = False

                try:
                    # Attempt Mediapipe detection
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_BGR2RGB)
                    frames_mp.flags.writeable = False
                    
                    results = holistic.process(frames_mp)
                    frames_mp.flags.writeable = True
                    frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_RGB2BGR) 

                    # Extract landmarks if available
                    if results.pose_landmarks:
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.NOSE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_EYE])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_WRIST])
                        current_detection_list.append(results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_WRIST])
                        detection_successful = True
                    else:
                        raise ValueError("No pose landmarks detected.")
                        
                except Exception as e:
                    # Detection failed (Mediapipe error or no landmarks found)
                    error_text = f"No detection: {e}"
                    cv2.putText(frame_cropped, error_text, (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 2)
                    
                if detection_successful:
                    # This block is now ONLY executed if detection_successful is True
                    if len(frames) < NUM_FRAMES:
                        frames.append(current_small_frame)
                        detected.append(current_detection_list)
                    else:
                        frames.pop(0)
                        detected.pop(0)
                        frames.append(current_small_frame)
                        detected.append(current_detection_list)
                else:
                    print(f"Skipping frame {time.time() - start_time}ms: No pose landmarks detected.")
                    pass
                
                # --- LAYOUT AND DISPLAY LOGIC ---
                if len(frames) == NUM_FRAMES:
                    stacked1_frames = np.vstack((frames[0], frames[1], frames[2]))
                    stacked2_frames = np.vstack((frames[3], frames[4], frames[5]))
                    stacked3_frames = np.vstack((frames[6], frames[7], frames[8]))
                    stacked4_frames = np.vstack((frames[9], frames[10], frames[11]))
                    stacked5_frames = np.vstack((frames[12], frames[13], frames[14]))

                    stacked1_frames = cv2.resize(stacked1_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked2_frames = cv2.resize(stacked2_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked3_frames = cv2.resize(stacked3_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked4_frames = cv2.resize(stacked4_frames, (SMALL_SIZE[0], FIXED_SIZE))
                    stacked5_frames = cv2.resize(stacked5_frames, (SMALL_SIZE[0], FIXED_SIZE))

                    # All components are guaranteed to be valid images here
                    final_layout = np.hstack((frame_cropped, stacked1_frames, stacked2_frames, stacked3_frames, stacked4_frames, stacked5_frames))
                else:
                    final_layout = frame_cropped
                
                if final_layout is not None and final_layout.size > 0:
                    flat_detected = [landmark for sublist in detected for landmark in sublist]
                    print(len(flat_detected)) # Should always be 5 * NUM_FRAMES when full

                    if len(flat_detected) > 0:
                        landmark_list = landmark_pb2.NormalizedLandmarkList()
                        landmark_list.landmark.extend(flat_detected)
                    
                    try:
                        cv2.imshow('Video', final_layout)

                        key = cv2.waitKey(1) & 0xFF

                        if key == ord('q'):
                            break
                        elif key == ord('s'):
                            # Save to CSV
                            with open(file_csv, mode='a', newline='') as f:
                                csv_writer = csv.writer(f, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                                row = [class_name]
                                for landmark in landmark_list.landmark:
                                    row.extend([landmark.x, landmark.y, landmark.z, landmark.visibility])
                                csv_writer.writerow(row)
                            saved += 1
                            print(f"Saved frame data to CSV. Total saved: {saved}")
                            
                    except Exception as e:
                        print(f"Error displaying frame: {e}")
                
            else:
                break

cap.release()
cv2.destroyAllWindows()

I0000 00:00:1761051703.650990   15900 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1761051703.659810   85559 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: llvmpipe (LLVM 20.1.2, 256 bits)
W0000 00:00:1761051704.014860   85549 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051704.245553   85548 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051704.260401   85553 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761051704.264916   85552 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W

5
10
15
20
25
30
35
40
45
50
55
Saved frame data to CSV. Total saved: 1
60
65
70
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
Saved frame data to CSV. Total saved: 2
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
75
