# 0 Install and Import

In [None]:
import mediapipe as mp
import cv2
import torch
import numpy as np

## For single person:

Code adapted from https://github.com/nicknochnack/Full-Body-Estimation-using-Media-Pipe-Holistic

In [None]:
# Live single-person demo: read frames from the default webcam, run MediaPipe
# Holistic on each one and show the frame with face/hand/pose landmarks drawn.
# Press "q" in the preview window to stop.

# Bind the MediaPipe helpers here so this cell runs on its own instead of
# relying on a later cell having been executed first.
mp_drawing = mp.solutions.drawing_utils
mp_holistic = mp.solutions.holistic

cap = cv2.VideoCapture(0)
try:
    # Initiate holistic model
    with mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5) as holistic:
        while cap.isOpened():
            success, image = cap.read()
            if not success:
                break

            # MediaPipe uses RGB while cv2 delivers BGR, so run detection on a
            # converted copy and keep `image` in BGR for drawing and display.
            rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
            results = holistic.process(rgb_image)

            # The four landmark sets on `results`:
            # face_landmarks, pose_landmarks, left_hand_landmarks, right_hand_landmarks.
            # Each entry: (landmarks, connections, landmark spec, connection spec).
            # Colors are in BGR order because they are drawn on the cv2 frame.
            overlays = (
                (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
                 mp_drawing.DrawingSpec(color=(0, 0, 255), thickness=1, circle_radius=1),
                 mp_drawing.DrawingSpec(color=(0, 0, 245), thickness=1, circle_radius=1)),
                (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                 mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2, circle_radius=4),
                 mp_drawing.DrawingSpec(color=(245, 0, 0), thickness=2, circle_radius=2)),
                (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
                 mp_drawing.DrawingSpec(color=(255, 0, 0), thickness=2, circle_radius=4),
                 mp_drawing.DrawingSpec(color=(245, 0, 0), thickness=2, circle_radius=2)),
                (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
                 mp_drawing.DrawingSpec(color=(0, 255, 0), thickness=2, circle_radius=4),
                 mp_drawing.DrawingSpec(color=(0, 245, 0), thickness=2, circle_radius=2)),
            )
            for landmarks, connections, landmark_spec, connection_spec in overlays:
                mp_drawing.draw_landmarks(image, landmarks, connections,
                                          landmark_spec, connection_spec)

            cv2.imshow('Result from Holistic Model', image)

            if cv2.waitKey(1) & 0xFF == ord('q'):
                break
finally:
    # Always free the camera and close the preview window, even after an error.
    cap.release()
    cv2.destroyAllWindows()

## For two people:

Idea from https://shawntng.medium.com/multi-person-pose-estimation-with-mediapipe-52e6a60839dd

Idea:
- Use YOLO to detect each person in the frame and crop the frame to that person's bounding box
- Run MediaPipe Holistic (a single-person model) on each crop separately


In [9]:
# Load the pretrained YOLOv5 "yolov5s" model through torch.hub
yolov5 = torch.hub.load('ultralytics/yolov5', 'yolov5s')

# Load the video (change `input_file` to process a different clip)
input_file = "Multiple5.mp4"
video = cv2.VideoCapture(input_file)
if not video.isOpened():
    # Fail loudly here; otherwise the frame loop ends immediately and an empty
    # output file is written without any hint of what went wrong.
    raise OSError(f"Could not open input video: {input_file}")

# Get the video's width, height, and frames per second (fps)
width = int(video.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))
# Keep the frame rate as a float: truncating e.g. 29.97 to 29 changes the
# playback speed of the written file.
fps = video.get(cv2.CAP_PROP_FPS)
if not fps > 0:
    # The backend did not report a usable rate (0 or NaN); a writer created
    # with such a rate is unusable, so fall back to an assumed 30 fps.
    fps = 30.0

# Create a VideoWriter object to save the video
output_file = 'output_video.mp4'  # Specify the output video file name
video_writer = cv2.VideoWriter(output_file, cv2.VideoWriter_fourcc(*'mp4v'), fps, (width, height))

# MediaPipe drawing helpers and the Holistic model shared by the cells below
mp_drawing = mp.solutions.drawing_utils
mp_drawing_styles = mp.solutions.drawing_styles

mp_holistic = mp.solutions.holistic
holistic = mp_holistic.Holistic(min_detection_confidence=0.5, min_tracking_confidence=0.5)
  

# def process_image(image):
#     results = pose.process(image)
#     mp_drawing.draw_landmarks(
#     image,
#     results.pose_landmarks,
#     mp_pose.POSE_CONNECTIONS,
#     landmark_drawing_spec=mp_drawing_styles.get_default_pose_landmarks_style())
#     return image


def process_image_holistic(image):
    """Run MediaPipe Holistic on one person crop and draw the landmarks on it.

    Args:
        image: BGR image array as produced by OpenCV (for example a slice of
            a video frame). It is drawn on in place.

    Returns:
        The same ``image`` object that was passed in, with face, hand and
        pose landmarks drawn on it. ``None`` or an empty array is returned
        unchanged without running the model.
    """
    # Nothing to detect on a missing or zero-area crop.
    if image is None or image.size == 0:
        return image

    # MediaPipe expects RGB while OpenCV frames are BGR: detect on a converted
    # copy, but draw on the original so the caller's frame is updated.
    rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    results = holistic.process(rgb_image)

    spec = mp_drawing.DrawingSpec
    # (landmarks, connections, landmark spec, connection spec); colors are BGR.
    overlays = (
        # Face mesh
        (results.face_landmarks, mp_holistic.FACEMESH_TESSELATION,
         spec(color=(0, 0, 255), thickness=1, circle_radius=1),
         spec(color=(0, 0, 245), thickness=1, circle_radius=1)),
        # Right hand
        (results.right_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(255, 0, 0), thickness=2, circle_radius=4),
         spec(color=(245, 0, 0), thickness=2, circle_radius=2)),
        # Left hand
        (results.left_hand_landmarks, mp_holistic.HAND_CONNECTIONS,
         spec(color=(255, 0, 0), thickness=2, circle_radius=4),
         spec(color=(245, 0, 0), thickness=2, circle_radius=2)),
        # Body pose
        (results.pose_landmarks, mp_holistic.POSE_CONNECTIONS,
         spec(color=(0, 255, 0), thickness=2, circle_radius=4),
         spec(color=(0, 245, 0), thickness=2, circle_radius=2)),
    )
    for landmarks, connections, landmark_spec, connection_spec in overlays:
        mp_drawing.draw_landmarks(image, landmarks, connections,
                                  landmark_spec, connection_spec)

    return image


# Process each frame of the video: detect people with YOLOv5, draw MediaPipe
# landmarks on each person's crop, then label and box every person. The
# annotated frame is shown and appended to the output video. Press "q" to stop.
PERSON_CLASS_ID = 0      # class id this loop treats as "person"
MIN_PERSON_SCORE = 0.7   # detections at or below this confidence are ignored

try:
    while True:
        # Read the next frame
        success, frame = video.read()
        if not success:
            break

        frame_w = frame.shape[1]

        # Perform object detection on the frame. YOLOv5 expects RGB for numpy
        # input, while OpenCV frames are BGR.
        results = yolov5(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        detections = results.xyxy[0]

        # First pass: collect person boxes and draw landmarks on each crop.
        # Boxes and labels are drawn only afterwards so they never end up
        # inside a crop that is still to be fed to MediaPipe.
        boxes = []
        for xmin, ymin, xmax, ymax, score, class_id in detections.tolist():
            if class_id != PERSON_CLASS_ID or score <= MIN_PERSON_SCORE:
                continue

            # Clamp at 0 so a negative coordinate cannot wrap around when slicing.
            left, top = max(int(xmin), 0), max(int(ymin), 0)
            right, bottom = int(xmax), int(ymax)
            boxes.append((left, top, right, bottom))

            ### mediapipe
            # `person` is a view into `frame`, so drawing on it updates the frame.
            person = frame[top:bottom, left:right]
            if person.size == 0:
                continue
            try:
                process_image_holistic(person)
            except Exception as exc:
                # Best effort: a failure on one crop must not abort the whole
                # video, but it should be visible instead of silently dropped.
                print(f"Skipping landmarks for one detection: {exc}")

        # Second pass: label and box each person. Without a tracker, the id is
        # derived from which half of the frame the box centre falls in.
        for left, top, right, bottom in boxes:
            centroid_x = left + (right - left) // 2
            person_id = 0 if centroid_x > frame_w // 2 else 1

            cv2.putText(frame, f'Person {person_id}', (left, top - 15),
                        cv2.FONT_HERSHEY_PLAIN, 2, (255, 0, 0), 2)
            cv2.rectangle(frame, (left, top), (right, bottom), (0, 255, 0), 3)

        # Display the frame
        cv2.imshow("Video", frame)
        video_writer.write(frame)
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and writer and close the window even if a frame
    # failed, so the output file is closed and the input is not left open.
    video.release()
    video_writer.release()

    cv2.destroyAllWindows()

Using cache found in /Users/julie3399/.cache/torch/hub/ultralytics_yolov5_master
YOLOv5 🚀 2023-8-28 Python-3.11.4 torch-2.0.1 CPU

Fusing layers... 
YOLOv5s summary: 213 layers, 7225885 parameters, 0 gradients
Adding AutoShape... 
