In [12]:
import cv2
import mediapipe as mp
import numpy as np
import glob
import torch
import urllib.request
from PIL import Image, ImageOps
import torchvision.transforms as transforms

In [20]:
# Model selection and set-up.

# Flip to True to use the slower, more accurate model.
use_large_model = False

# Pick the hub entry point, the matching transform attribute and the
# status message in one place so the three always agree.
if use_large_model:
    midas_variant = "DPT_Large"
    transform_attr = "dpt_transform"
    status_message = "Using large (slow) model."
else:
    midas_variant = "MiDaS_small"
    transform_attr = "small_transform"
    status_message = "Using small (fast) model."

midas = torch.hub.load("intel-isl/MiDaS", midas_variant)

# Use CUDA when torch reports it as available, otherwise stay on the CPU,
# then put the network in evaluation mode.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
midas.to(device)
midas.eval()

# The "transforms" hub entry exposes the input transform for each variant.
midas_transforms = torch.hub.load("intel-isl/MiDaS", "transforms")
transform = getattr(midas_transforms, transform_attr)
print(status_message)

Using cache found in /home/hassaan/.cache/torch/hub/intel-isl_MiDaS_master


Loading weights:  None


Using cache found in /home/hassaan/.cache/torch/hub/rwightman_gen-efficientnet-pytorch_master


Using small (fast) model.


Using cache found in /home/hassaan/.cache/torch/hub/intel-isl_MiDaS_master


In [14]:
# MediaPipe aliases: the pose solution module and the drawing utilities
# that the pose-estimation function below relies on.
mp_pose = mp.solutions.pose
mp_drawing = mp.solutions.drawing_utils

In [30]:
def depthProcessing(frame):
    """Estimate a depth map for ``frame`` and return it as an 8-bit image.

    Uses the notebook-level ``transform``, ``midas`` and ``device`` objects
    created in the model set-up cell.

    Args:
        frame: NumPy image array. Its first two dimensions give the size the
            prediction is resized to. (The capture loop in this notebook
            passes an RGB webcam frame.)

    Returns:
        A ``uint8`` NumPy array with the prediction resized to
        ``frame.shape[:2]`` and scaled so that its largest value maps to 255.
        If the prediction has no positive, finite maximum, an all-zero array
        of the same shape is returned instead of dividing by zero.
    """
    input_batch = transform(frame).to(device)

    with torch.no_grad():
        prediction = midas(input_batch)

    # Bring the prediction back to the resolution of the input frame.
    prediction = torch.nn.functional.interpolate(
        prediction.unsqueeze(1),
        size=frame.shape[:2],
        mode="bicubic",
        align_corners=False,
    ).squeeze()

    output = prediction.cpu().numpy()

    # Kept from the original implementation: re-enable writes on the caller's
    # array (the capture loop marks it read-only before calling this function).
    frame.flags.writeable = True

    peak = np.max(output)
    if not np.isfinite(peak) or peak <= 0:
        # Nothing meaningful to scale by; avoid NaN/inf reaching the uint8 cast.
        return np.zeros(output.shape, dtype="uint8")

    # Bicubic interpolation can undershoot below zero; clip before the cast so
    # negative values become 0 instead of wrapping around in uint8.
    scaled = np.clip(output * 255 / peak, 0, 255)
    return scaled.astype("uint8")

In [31]:
def poseDetection(frame, pose, mp_drawing):
    """Run pose estimation on a single-channel image and draw the landmarks.

    Args:
        frame: Single-channel image (the capture loop passes the depth map).
        pose: Object whose ``process`` method is called on the RGB image.
        mp_drawing: Drawing helper providing ``DrawingSpec`` and
            ``draw_landmarks``.

    Returns:
        The three-channel BGR image that the landmarks were drawn onto.
    """
    # Expand the single channel to three channels (RGB order) for the pose model.
    rgb_image = cv2.cvtColor(frame, cv2.COLOR_GRAY2RGB)

    # Pose estimation runs on the depth image, not on the camera frame.
    detection = pose.process(rgb_image)

    # Switch to OpenCV's default BGR channel order for drawing and display.
    canvas = cv2.cvtColor(rgb_image, cv2.COLOR_RGB2BGR)

    landmark_spec = mp_drawing.DrawingSpec(
        color=(245, 117, 66), thickness=2, circle_radius=2
    )
    connection_spec = mp_drawing.DrawingSpec(
        color=(245, 66, 230), thickness=2, circle_radius=2
    )

    # Overlay the detected landmarks and their connections on the canvas.
    mp_drawing.draw_landmarks(
        canvas,
        detection.pose_landmarks,
        mp_pose.POSE_CONNECTIONS,
        landmark_spec,
        connection_spec,
    )

    return canvas

In [34]:
# Live loop: read webcam frames, estimate depth, run pose detection on the
# depth image and display the annotated result until "q" is pressed.
cap = cv2.VideoCapture(0)
try:
    with mp_pose.Pose(min_detection_confidence=0.5, min_tracking_confidence=0.5) as pose:
        while cap.isOpened():
            ret, frame = cap.read()

            # No frame could be read: stop; cleanup happens in `finally`.
            if not ret:
                break

            # Convert OpenCV's BGR frame to RGB and mark it read-only before
            # processing to guard against unintended writes.
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame.flags.writeable = False

            # depth estimation
            frame = depthProcessing(frame)

            # pose estimation on the depth image, with landmarks drawn on it
            frame = poseDetection(frame, pose, mp_drawing)

            cv2.imshow("output : ", frame)

            # Poll the keyboard for 10 ms; quit on "q".
            if cv2.waitKey(10) & 0xFF == ord("q"):
                break
except Exception as e:
    # Best-effort, as before: report the error and fall through to cleanup.
    print(e)
finally:
    # Single cleanup path. Unlike the previous per-branch cleanup, this also
    # runs on KeyboardInterrupt (kernel interrupt), which `except Exception`
    # does not catch, so the camera and the window are always released.
    cap.release()
    cv2.destroyAllWindows()

I0000 00:00:1720681635.022733  108005 gl_context_egl.cc:85] Successfully initialized EGL. Major : 1 Minor: 5
I0000 00:00:1720681635.024477  139821 gl_context.cc:357] GL version: 3.2 (OpenGL ES 3.2 Mesa 23.2.1-1ubuntu3.1~22.04.2), renderer: Mesa Intel(R) HD Graphics 4400 (HSW GT2)
W0000 00:00:1720681635.129300  139818 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
W0000 00:00:1720681635.144240  139816 inference_feedback_manager.cc:114] Feedback manager requires a model with a single signature inference. Disabling support for feedback tensors.
