# Cropped Frame by Yolo, Pose Estimation by Mediapipe

In [9]:
import mediapipe as mp
from mediapipe.framework.formats import landmark_pb2

from ultralytics import YOLO
import numpy as np
import cv2

mp_drawing = mp.solutions.drawing_utils # Drawing helpers
mp_holistic = mp.solutions.holistic # Mediapipe Solutions

# --- Configuration ---
FIXED_SIZE = 640
model = YOLO("../../model/yolo/yolo12n.pt") 

In [6]:
def cropped_frame(frame):
    results = model(frame, classes=[0])
    boxes = results[0].boxes
    plotted_frame = results[0].plot() 

    try:
        if len(boxes) > 0:
            # Get the bounding box coordinates for the first detected object
            x1, y1, x2, y2 = boxes.xyxy[0].cpu().numpy().astype(int)

            cropped_frame = frame[y1:y2, x1:x2]
            crop_h, crop_w = cropped_frame.shape[:2]

            # We want to fit the largest dimension (width or height) to the FIXED_SIZE
            scale = FIXED_SIZE / max(crop_w, crop_h)
            new_w = int(crop_w * scale)
            new_h = int(crop_h * scale)

            # Resize the cropped frame to the new dimensions
            resized_img = cv2.resize(cropped_frame, (new_w, new_h), interpolation=cv2.INTER_AREA)

            # Background frame, that are not filled with boxes
            final_frame = np.full((FIXED_SIZE, FIXED_SIZE, 3), 255, dtype=np.uint8)
            
            # dw and dh are the space left over after placing the image
            dw = FIXED_SIZE - new_w
            dh = FIXED_SIZE - new_h

            # Calculate the starting position (top-left corner) for centering
            top = dh // 2
            bottom = top + new_h
            left = dw // 2
            right = left + new_w

            final_frame[top:bottom, left:right] = resized_img

            return final_frame

        else:
            print("No objects detected in the image.")
    except Exception as e:
        print(f"Error processing frame: {e}")

In [15]:
cap = cv2.VideoCapture('../../assets/dump/celinguk.mp4')

if not cap.isOpened():
    print("Error opening video file")
else:
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            ret, frame = cap.read()
            if ret:
                frame_cropped = cropped_frame(frame)

                try:
                    # Start Code of mediapipe here
                    frames = cv2.cvtColor(frame_cropped, cv2. COLOR_BGR2RGB)
                    frames.flags.writeable = False
                    
                    results = holistic.process(frames)
                    frames.flags.writeable = True
                    frames = cv2.cvtColor(frame_cropped, cv2. COLOR_RGB2BGR)

                    # Get specific landmarks
                    nose = results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.NOSE]
                    wrist_l = results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_WRIST]
                    elbow_l = results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.LEFT_ELBOW]
                    wrist_r = results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_WRIST]
                    elbow_r = results.pose_landmarks.landmark[mp.solutions.holistic.PoseLandmark.RIGHT_ELBOW]

                    show_landmark_list = landmark_pb2.NormalizedLandmarkList()
                    show_landmark_list.landmark.extend([nose, wrist_l, wrist_r, elbow_l, elbow_r])

                    # Draw landmarks
                    for landmark in show_landmark_list.landmark:
                        x, y = int(landmark.x * frames.shape[1]), int(landmark.y * frames.shape[0])
                        cv2.circle(frames, (x, y), 5, (255, 0, 0), -1)

                    cv2.imshow('Video', frames)
                except Exception as e:
                    print(f"Error displaying frame: {e}")
                
                
                if cv2.waitKey(5) & 0xFF == ord('q'):
                    break
            else:
                break

cap.release()
cv2.destroyAllWindows()




I0000 00:00:1760531617.687201     924 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1760531617.708447   64979 gl_context.cc:369] GL version: 3.2 (OpenGL ES 3.2 Mesa 25.0.7-0ubuntu0.24.04.2), renderer: llvmpipe (LLVM 20.1.2, 256 bits)


0: 320x640 (no detections), 328.0ms
Speed: 9.9ms preprocess, 328.0ms inference, 6.7ms postprocess per image at shape (1, 3, 320, 640)
No objects detected in the image.
Error displaying frame: OpenCV(4.11.0) /io/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


0: 320x640 (no detections), 69.1ms
Speed: 5.9ms preprocess, 69.1ms inference, 5.1ms postprocess per image at shape (1, 3, 320, 640)
No objects detected in the image.
Error displaying frame: OpenCV(4.11.0) /io/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


0: 320x640 (no detections), 55.0ms
Speed: 4.3ms preprocess, 55.0ms inference, 5.0ms postprocess per image at shape (1, 3, 320, 640)
No objects detected in the image.
Error displaying frame: OpenCV(4.11.0) /io/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


0: 320x640 (no detections), 50.4ms


W0000 00:00:1760531618.587337   64970 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760531619.142547   64969 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760531619.177230   64975 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760531619.182932   64971 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760531619.200046   64970 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1760531619.232605   64967 inference_feedback_manager.cc:114] Feedback manager 


0: 320x640 1 person, 63.5ms
Speed: 7.2ms preprocess, 63.5ms inference, 9.9ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 45.8ms
Speed: 2.3ms preprocess, 45.8ms inference, 6.2ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 34.0ms
Speed: 3.1ms preprocess, 34.0ms inference, 4.8ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 1 person, 36.0ms
Speed: 10.7ms preprocess, 36.0ms inference, 5.8ms postprocess per image at shape (1, 3, 320, 640)

0: 320x640 (no detections), 58.2ms
Speed: 3.1ms preprocess, 58.2ms inference, 2.8ms postprocess per image at shape (1, 3, 320, 640)
No objects detected in the image.
Error displaying frame: OpenCV(4.11.0) /io/opencv/modules/imgproc/src/color.cpp:199: error: (-215:Assertion failed) !_src.empty() in function 'cvtColor'


0: 320x640 (no detections), 30.9ms
Speed: 2.4ms preprocess, 30.9ms inference, 2.5ms postprocess per image at shape (1, 3, 320, 640)
No objects detected in the image.
Error d