In [None]:
#clone the HRnet model repo
!git clone https://github.com/HRNet/HRNet-Human-Pose-Estimation.git

In [1]:
import os
import torch
import sys
from torch.hub import load_state_dict_from_url
import cv2
import torchvision.transforms as transforms
from PIL import Image
import numpy as np
import matplotlib.pyplot as plt

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
os.getcwd()
import numpy as np
print(np.__version__)

1.19.5


In [3]:
os.chdir("HRNet-Human-Pose-Estimation/lib")


In [4]:
from easydict import EasyDict as edict
# Add to Python path
sys.path.append('./HRNet-Human-Pose-Estimation/lib')
# Import from the correct location
from models.pose_hrnet import get_pose_net

hrnet_w32_config = edict({
    "MODEL": {
        "NAME": "pose_hrnet",
        "INIT_WEIGHTS": True,
        "NUM_JOINTS": 17,  # COCO keypoints
        "PRETRAINED": "",  # Leave empty when loading your own weights
        "IMAGE_SIZE": [256, 192],  # Height, Width
        "HEATMAP_SIZE": [64, 48],  # 1/4 of input size
        "SIGMA": 2,
        "EXTRA": {
            "FINAL_CONV_KERNEL": 1,
            "PRETRAINED_LAYERS": ["*"],
            "STAGE1": {
                "NUM_MODULES": 1,
                "NUM_BRANCHES": 1,
                "BLOCK": "BOTTLENECK",
                "NUM_BLOCKS": [4],
                "NUM_CHANNELS": [64],
                "FUSE_METHOD": "SUM"
            },
            "STAGE2": {
                "NUM_MODULES": 1,
                "NUM_BRANCHES": 2,
                "BLOCK": "BASIC",
                "NUM_BLOCKS": [4, 4],
                "NUM_CHANNELS": [32, 64],
                "FUSE_METHOD": "SUM"
            },
            "STAGE3": {
                "NUM_MODULES": 4,
                "NUM_BRANCHES": 3,
                "BLOCK": "BASIC",
                "NUM_BLOCKS": [4, 4, 4],
                "NUM_CHANNELS": [32, 64, 128],
                "FUSE_METHOD": "SUM"
            },
            "STAGE4": {
                "NUM_MODULES": 3,
                "NUM_BRANCHES": 4,
                "BLOCK": "BASIC",
                "NUM_BLOCKS": [4, 4, 4, 4],
                "NUM_CHANNELS": [32, 64, 128, 256],
                "FUSE_METHOD": "SUM"
            }
        }
    }
})

In [5]:
# 1. Create model with correct config
hr_model = get_pose_net(hrnet_w32_config, is_train=False)

# 2. Load pretrained weights
weights_path = "pose_hrnet_w32_256x192.pth"
state_dict = torch.load(weights_path, map_location='cpu')

# 3. Handle potential key mismatches (if weights were saved with DataParallel)
new_state_dict = {}
for k, v in state_dict.items():
    if k.startswith('module.'):
        k = k[7:]  # Remove 'module.' prefix
    new_state_dict[k] = v

# 4. Load weights
hr_model.load_state_dict(new_state_dict, strict=True)
hr_model.eval()

print("Successfully loaded HRNet-W32 with matching configuration!")

Successfully loaded HRNet-W32 with matching configuration!


In [6]:
# creat class for processing the friends video using the pretrained HRnetw32 model.

class MultipersonPoseEstimator:
    def __init__(self, model, device):
        self.model = model
        self.device = device
       

    def process_video(self, video_path, output_path,show_results=True):

      """
      Process video file for multi-person pose estimation

      Args:
          video_path: Path to input video
          output_path: Path to save output video
          show_results: Whether to display results in real-time
      """
      cap = cv2.VideoCapture(video_path)
      if not cap.isOpened():
          raise IOError("Cannot open video file")

      # Get video properties
      width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
      height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
      fps = cap.get(cv2.CAP_PROP_FPS)
      total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))

      # Define video writer
      fourcc = cv2.VideoWriter_fourcc(*'mp4v')
      out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))
        
      # transform frame to standard weight for Hrnet256x192, to a 
      # tensor and make sure it is RGB.
      transform=transforms.Compose([
          transforms.ToTensor(), #float 32
          transforms.Resize((256, 192))])
      
      frame_count = 0

              
      while cap.isOpened():
          ret, frame = cap.read() #BGR
          
          print(frame.dtype)
        
          if not ret:
              break
          
           
           # Convert to RGB if needed
          if len(frame.shape) == 2:
            frame = cv2.cvtColor(frame, cv2.COLOR_GRAY2BGR)

          
          frame_tr = transform(frame)
          frame_tr = frame_tr.unsqueeze(0) # 1x3x256x192
          
            
          print(torch.max(frame_tr[0,0,:,:]))
          
         
          with torch.no_grad():
            output_keypoints = self.model(frame_tr)
          
          # Display if requested
          if show_results:
            # Post - process the output
            heatmaps = output_keypoints.cpu().numpy()[0]
            keypoints = []
            
            for heatmap in heatmaps:

              h, w = np.unravel_index(np.argmax(heatmap), heatmap.shape)
              h=h/(256//4) *256
              w=w/(192//4) *192
              keypoints.append((w, h)) #pose points in the original frame
              
            
            
            frame_tr=frame_tr.numpy() 
            
            frame_tr=np.squeeze(frame_tr, axis=0) 
                
            temp_frame_tr=np.transpose(frame_tr, (1,2,0)) 
            
            uint8_frame=(temp_frame_tr*255).astype(np.uint8)
            
            uint8_frame_copy=np.ascontiguousarray(uint8_frame)
            
            POSE_PAIRS = [
                  (5, 6), (5, 7), (6, 8),   # Arms
                 (11, 12), (11, 13), (12, 14),  # Legs
                 (0, 1), (1, 2), (2, 3), (3, 4),  # Head to arms
                 (1, 5), (2, 6), (13, 15), (14, 16)  # Body connections
                         ]
            for pair in POSE_PAIRS:
                
                partA = pair[0]
                partB = pair[1]

                if keypoints[partA] is not None and keypoints[partB] is not None:
                    print(keypoints[partA][0])
                    cv2.line(uint8_frame_copy, (int(keypoints[partA][0]),int(keypoints[partA][1])), 
                             (int(keypoints[partB][0]),int(keypoints[partB][1])), color=(0, 255, 255), thickness=2)
                    cv2.circle(uint8_frame_copy,(int(keypoints[partA][0]),int(keypoints[partA][1])),5,(0,255,0),-1)
                    cv2.circle(uint8_frame_copy,(int(keypoints[partB][0]),int(keypoints[partB][1])),5,(0,255,0),-1)
            cv2.imshow("processed frame",uint8_frame_copy)
            cv2.waitKey(0)

            

            #for keypoint in keypoints:

              #x, y = int(keypoint[0]), int(keypoint[1])
        
              #cv2.circle(uint8_frame_copy, (x,y), 5, (0, 255, 0),-1)
            
              #cv2.imshow("processed frame",uint8_frame_copy)
              #cv2.waitKey(1)

            out.write(uint8_frame_copy)
            frame_count += 1
            print(f"Processed frame {frame_count}/{total_frames}")

      
      # Release resources
      cap.release()
      cv2.destroyAllWindows()
      out.release()

In [None]:
import matplotlib.pyplot as plt
pose_estimator = MultipersonPoseEstimator(
      model=hr_model,
      device="cuda" if torch.cuda.is_available() else "cpu",
  )
   
# Process video
video_path = 'one_person_video.mp4'
output_video_path = "pose_output_pose.mp4"


pose_estimator.process_video(video_path,output_video_path) 

uint8
tensor(0.9771)
116.0
116.0
80.0
104.0
104.0
84.0
88.0
92.0
88.0
108.0
92.0
88.0
108.0
84.0
Processed frame 1/4001
uint8
tensor(0.9741)
116.0
116.0
76.0
108.0
108.0
84.0
88.0
92.0
88.0
104.0
92.0
88.0
108.0
84.0
Processed frame 2/4001
uint8
tensor(0.9717)
116.0
116.0
76.0
108.0
108.0
76.0
88.0
92.0
84.0
104.0
92.0
84.0
108.0
80.0
Processed frame 3/4001
uint8
tensor(0.9750)
116.0
116.0
76.0
104.0
104.0
84.0
88.0
92.0
84.0
104.0
92.0
84.0
108.0
84.0
Processed frame 4/4001
uint8
tensor(0.9827)
116.0
116.0
76.0
108.0
108.0
84.0
88.0
92.0
84.0
104.0
92.0
84.0
108.0
84.0
Processed frame 5/4001
uint8
tensor(0.9888)
116.0
116.0
76.0
108.0
108.0
84.0
88.0
92.0
84.0
104.0
92.0
84.0
108.0
84.0
Processed frame 6/4001
uint8
tensor(0.9862)
116.0
116.0
76.0
108.0
108.0
76.0
88.0
92.0
84.0
104.0
92.0
84.0
108.0
84.0
Processed frame 7/4001
uint8
tensor(0.9779)
112.0
112.0
68.0
108.0
108.0
76.0
84.0
92.0
84.0
104.0
92.0
84.0
112.0
76.0
Processed frame 8/4001
uint8
tensor(0.9785)
112.0
112.0
76.0
10