# Frame Cropping with YOLO, Pose Estimation with MediaPipe

In [1]:
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2

from ultralytics import YOLO
import numpy as np
import cv2

# Short aliases for the MediaPipe solution modules.
mp_holistic = mp.solutions.holistic  # holistic (pose/face/hands) solution
mp_drawing = mp.solutions.drawing_utils  # drawing helpers

# --- Configuration ---
# Side length, in pixels, of the square canvas that cropped_frame() returns.
FIXED_SIZE = 500
# NOTE(review): not referenced in the visible cells; presumably a
# (width, height) thumbnail size used elsewhere -- confirm.
SMALL_SIZE = (150, 150)
# YOLO detector loaded once and reused for every frame.
model = YOLO("../../model/yolo/yolo12n.pt")

AttributeError: 'MessageFactory' object has no attribute 'GetPrototype'

2025-10-27 11:29:55.924470: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
def cropped_frame(frame):
    """Crop the first detected box out of *frame* and letterbox it.

    Runs the module-level YOLO ``model`` restricted to class 0 (reported as
    "person" in the run output), crops the first returned bounding box,
    scales the crop so that its longer side equals ``FIXED_SIZE`` and
    centres it on a white ``FIXED_SIZE`` x ``FIXED_SIZE`` canvas.

    Args:
        frame: Image array indexed as ``[row, column, channel]``.
            Assumed to be a 3-channel uint8 image such as the one returned
            by ``cv2.VideoCapture.read()`` -- the canvas is created with
            three channels.

    Returns:
        A ``(FIXED_SIZE, FIXED_SIZE, 3)`` uint8 array, or ``None`` when
        nothing was detected or the crop could not be processed (a message
        is printed in both cases).
    """
    results = model(frame, classes=[0])
    boxes = results[0].boxes

    try:
        if len(boxes) == 0:
            print("No objects detected in the image.")
            return None

        # Bounding box (x1, y1, x2, y2) of the first detection.
        x1, y1, x2, y2 = boxes.xyxy[0].cpu().numpy().astype(int)

        # Clamp to the frame: a negative start index would otherwise wrap
        # around in NumPy slicing and silently crop the wrong region.
        frame_h, frame_w = frame.shape[:2]
        x1, y1 = max(x1, 0), max(y1, 0)
        x2, y2 = min(x2, frame_w), min(y2, frame_h)

        crop = frame[y1:y2, x1:x2]
        crop_h, crop_w = crop.shape[:2]
        if crop_h == 0 or crop_w == 0:
            raise ValueError("detected box has zero area")

        # Fit the larger dimension to FIXED_SIZE, keeping the aspect ratio.
        # The shorter side is kept at >= 1 px so that cv2.resize does not
        # fail on extremely thin boxes.
        scale = FIXED_SIZE / max(crop_w, crop_h)
        new_w = max(1, int(crop_w * scale))
        new_h = max(1, int(crop_h * scale))
        resized_img = cv2.resize(
            crop, (new_w, new_h), interpolation=cv2.INTER_AREA
        )

        # White background; the area not covered by the crop stays white.
        final_frame = np.full((FIXED_SIZE, FIXED_SIZE, 3), 255, dtype=np.uint8)

        # Top-left corner that centres the resized crop on the canvas.
        top = (FIXED_SIZE - new_h) // 2
        left = (FIXED_SIZE - new_w) // 2
        final_frame[top:top + new_h, left:left + new_w] = resized_img

        return final_frame
    except Exception as e:
        print(f"Error processing frame: {e}")
        return None

In [7]:
# Play the video frame by frame: crop each frame to the detected person,
# run MediaPipe Holistic on the crop and mark a few pose landmarks.
cap = cv2.VideoCapture('../../assets/dataset/celinguk.mp4')

# Pose landmarks that are drawn on every frame.
pose_landmark_ids = (
    mp_holistic.PoseLandmark.NOSE,
    mp_holistic.PoseLandmark.LEFT_WRIST,
    mp_holistic.PoseLandmark.RIGHT_WRIST,
    mp_holistic.PoseLandmark.LEFT_ELBOW,
    mp_holistic.PoseLandmark.RIGHT_ELBOW,
)

try:
    if not cap.isOpened():
        print("Error opening video file")
    else:
        with mp_holistic.Holistic(min_detection_confidence=0.65, min_tracking_confidence=0.65) as holistic:
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break

                frame_cropped = cropped_frame(frame)

                # cropped_frame() returns None (and prints why) when there
                # is nothing to crop; skip pose estimation for that frame.
                if frame_cropped is not None:
                    try:
                        # MediaPipe expects RGB input; OpenCV frames are BGR.
                        frames_mp = cv2.cvtColor(frame_cropped, cv2.COLOR_BGR2RGB)
                        # Mark read-only while processing, as in the
                        # MediaPipe examples.
                        frames_mp.flags.writeable = False

                        results = holistic.process(frames_mp)

                        frames_mp.flags.writeable = True
                        # Convert the RGB image back to BGR for display.
                        # Converting the BGR crop here instead would swap
                        # the red and blue channels on screen.
                        frames_mp = cv2.cvtColor(frames_mp, cv2.COLOR_RGB2BGR)

                        # pose_landmarks is None when no pose was found;
                        # the frame is still shown, just without markers.
                        if results.pose_landmarks is not None:
                            height, width = frames_mp.shape[:2]
                            for landmark_id in pose_landmark_ids:
                                landmark = results.pose_landmarks.landmark[landmark_id]
                                # Landmark coordinates are normalised; scale
                                # them to pixel positions.
                                x, y = int(landmark.x * width), int(landmark.y * height)
                                cv2.circle(frames_mp, (x, y), 5, (255, 0, 0), -1)

                        cv2.imshow('Video', frames_mp)
                    except Exception as e:
                        print(f"Error displaying frame: {e}")

                # Press 'q' to stop playback.
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    break
finally:
    # Always free the capture and close the window, even if the loop is
    # interrupted.
    cap.release()
    cv2.destroyAllWindows()


0: 384x640 1 person, 29.2ms
Speed: 3.5ms preprocess, 29.2ms inference, 5.9ms postprocess per image at shape (1, 3, 384, 640)


I0000 00:00:1761536090.570908   11905 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1761536090.578907   12365 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: llvmpipe (LLVM 20.1.2, 256 bits)
W0000 00:00:1761536090.906809   12355 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761536091.062238   12356 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761536091.074941   12353 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1761536091.075606   12360 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W


0: 384x640 1 person, 30.4ms
Speed: 3.3ms preprocess, 30.4ms inference, 4.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 25.5ms
Speed: 3.2ms preprocess, 25.5ms inference, 3.2ms postprocess per image at shape (1, 3, 384, 640)


W0000 00:00:1761536091.108131   12363 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.



0: 384x640 1 person, 25.6ms
Speed: 2.7ms preprocess, 25.6ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 25.6ms
Speed: 2.6ms preprocess, 25.6ms inference, 3.1ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 25.4ms
Speed: 2.8ms preprocess, 25.4ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 25.5ms
Speed: 2.8ms preprocess, 25.5ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 24.2ms
Speed: 2.7ms preprocess, 24.2ms inference, 7.7ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 24.1ms
Speed: 3.0ms preprocess, 24.1ms inference, 2.8ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 24.2ms
Speed: 2.4ms preprocess, 24.2ms inference, 2.9ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 person, 24.1ms
Speed: 2.6ms preprocess, 24.1ms inference, 2.8ms postprocess per image at shape (1, 3, 38