In [1]:
git clone https://github.com/open-mmlab/mmcv.git
cd mmcv
git checkout v1.3.9
MMCV_WITH_OPS=1 pip install -e .
cd ..
git clone https://github.com/ViTAE-Transformer/ViTPose.git
cd ViTPose
pip install -v -e .
pip install timm==0.4.9 einops

SyntaxError: invalid syntax (3022667317.py, line 1)

In [None]:
import cv2
import numpy as np
import mediapipe as mp
import matplotlib.pyplot as plt
from IPython.display import clear_output, display
import ipywidgets as widgets
from pathlib import Path
import torch
import torchvision.transforms as transforms
from collections import OrderedDict

class VitPoseWrapper:
    """Wrapper class for VitPose model initialization and inference.

    Builds the model once at construction time and exposes helpers that turn
    an image into a normalized input tensor and turn the model's heatmaps
    back into pixel coordinates of the original image.

    Attributes:
        device: ``torch.device`` the model and inputs live on (CUDA when
            available, otherwise CPU).
        weights_path: Path of the checkpoint loaded by ``load_model``.
        model: The loaded network, moved to ``device`` and set to eval mode.
        transform: torchvision pipeline applied to the resized RGB image
            (``ToTensor`` followed by per-channel ``Normalize``).
    """

    # Network input resolution as (height, width). Shared by the backbone
    # config and by the resize in ``preprocess_image`` so the two cannot
    # drift apart.
    INPUT_SIZE = (256, 192)

    # Checkpoint used when no explicit ``weights_path`` is supplied.
    DEFAULT_WEIGHTS_PATH = './ViTPose/weights/vitpose_large_coco_aic_mpii.pth'

    def __init__(self, weights_path=None):
        """Select the device, load the model and build the input transform.

        Args:
            weights_path: Optional path to the checkpoint file. Defaults to
                ``DEFAULT_WEIGHTS_PATH``.
        """
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.weights_path = weights_path or self.DEFAULT_WEIGHTS_PATH
        self.model = self.load_model()
        self.transform = transforms.Compose([
            transforms.ToTensor(),
            # Per-channel mean/std (the commonly used ImageNet statistics).
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])

    def load_model(self, weights_path=None):
        """Load VitPose model with pretrained weights.

        Args:
            weights_path: Optional checkpoint path overriding the one chosen
                at construction time.

        Returns:
            The model with weights loaded, moved to ``self.device`` and
            switched to eval mode.
        """
        # Import here to avoid dependency issues if not using VitPose.
        # NOTE(review): confirm this module path exists in the installed
        # ViTPose package.
        from vitpose.models.model import ViTPose

        if weights_path is None:
            weights_path = getattr(self, 'weights_path', self.DEFAULT_WEIGHTS_PATH)

        # Model configuration.
        # NOTE(review): the default checkpoint file name says "large" while
        # embed_dim=768 / depth=12 / num_heads=12 are used here - confirm the
        # config matches the checkpoint being loaded.
        model_cfg = {
            'backbone': {
                'type': 'ViT',
                'img_size': list(self.INPUT_SIZE),
                'patch_size': 16,
                'embed_dim': 768,
                'depth': 12,
                'num_heads': 12,
                'ratio': 1,
                'use_checkpoint': False,
                'mlp_ratio': 4,
                'qkv_bias': True,
                'drop_path_rate': 0.3,
            }
        }

        # Initialize model
        model = ViTPose(model_cfg)

        # Load pretrained weights. torch.load unpickles the file, so only
        # load checkpoints that come from a trusted source.
        state_dict = torch.load(weights_path, map_location=self.device)

        # Some checkpoints nest the weights under a 'state_dict' key.
        if 'state_dict' in state_dict:
            state_dict = state_dict['state_dict']

        # Strip a leading 'module.' from parameter names so the keys match
        # the bare model.
        prefix = 'module.'
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            if k.startswith(prefix):
                k = k[len(prefix):]
            new_state_dict[k] = v

        model.load_state_dict(new_state_dict)
        model.to(self.device)
        model.eval()

        return model

    def preprocess_image(self, frame):
        """Preprocess image for VitPose model.

        Args:
            frame: Image array in BGR channel order (it is converted with
                ``cv2.COLOR_BGR2RGB``).

        Returns:
            The transformed image as a tensor with a leading batch dimension
            of 1, on ``self.device``.
        """
        # Convert BGR to RGB
        frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)

        # Resize to model input size; cv2.resize takes (width, height).
        input_height, input_width = self.INPUT_SIZE
        frame_resized = cv2.resize(frame_rgb, (input_width, input_height))

        # Apply normalization and convert to tensor
        frame_tensor = self.transform(frame_resized)
        frame_tensor = frame_tensor.unsqueeze(0).to(self.device)

        return frame_tensor

    def postprocess_keypoints(self, heatmaps, original_shape):
        """Convert heatmaps to keypoint coordinates.

        Only the first item of the batch is processed. Each keypoint is
        placed at the location of the maximum value of its heatmap and then
        rescaled to the original image size.

        Args:
            heatmaps: 4-D tensor indexed as (batch, keypoint, row, column).
            original_shape: Shape of the original image; only the first two
                entries (height, width) are used.

        Returns:
            Tuple ``(keypoints, confidences)`` of numpy arrays: integer
            ``(x, y)`` pixel coordinates per keypoint, and the heatmap value
            at each keypoint's maximum.
        """
        # Get dimensions
        height, width = original_shape[:2]
        heatmap_height, heatmap_width = heatmaps.shape[2:]

        keypoints = []
        confidences = []

        for heatmap in heatmaps[0]:
            # argmax without a dim works on the flattened heatmap, so split
            # the flat index back into (row, column).
            flat_id = torch.argmax(heatmap).item()
            y, x = divmod(flat_id, heatmap_width)

            # Get confidence score
            confidence = heatmap[y, x].item()

            # Convert to original image coordinates
            x_coord = int((x / heatmap_width) * width)
            y_coord = int((y / heatmap_height) * height)

            keypoints.append((x_coord, y_coord))
            confidences.append(confidence)

        return np.array(keypoints), np.array(confidences)


In [None]:
class PoseComparisonDebugger:
    """Debugger that runs several pose estimators on the same video.

    Construction opens the video, creates a MediaPipe Pose estimator and a
    ``VitPoseWrapper``, calls the loaders for the 4DHumans and WHAM
    placeholders, and prepares a 1x4 matplotlib figure for the comparison.
    """

    def __init__(self, video_path):
        """Initialize pose estimation models and video source.

        Args:
            video_path: Path of the video, passed to ``cv2.VideoCapture``.
        """
        self.video_path = video_path
        self.cap = cv2.VideoCapture(video_path)

        # Initialize MediaPipe
        self.mp_pose = mp.solutions.pose
        self.mp_drawing = mp.solutions.drawing_utils
        self.mediapipe_pose = self.mp_pose.Pose(
            static_image_mode=False,
            model_complexity=2,
            min_detection_confidence=0.5
        )

        # Initialize VitPose
        self.vitpose = VitPoseWrapper()

        # Initialize 4DHumans (placeholder)
        self.humans4d_model = self.load_4dhumans_model()

        # Initialize WHAM (placeholder)
        self.wham_model = self.load_wham_model()

        # Setup display: one row with four panels.
        self.fig, self.axes = plt.subplots(1, 4, figsize=(20, 5))
        self.fig.suptitle('Pose Estimation Model Comparison')

        # Skeleton edges as pairs of keypoint indices.
        # NOTE(review): the group labels below assume the COCO 17-keypoint
        # ordering (0 nose, 1-4 eyes/ears, 5-10 arms, 11-16 legs) - confirm
        # against the checkpoint in use.
        self.vitpose_connections = [
            (15, 13), (13, 11), (16, 14), (14, 12),  # legs
            (11, 12), (5, 11), (6, 12),  # hips and torso sides
            (5, 6), (5, 7), (6, 8), (7, 9), (8, 10),  # shoulders and arms
            (1, 2), (0, 1), (0, 2), (1, 3), (2, 4), (3, 5), (4, 6)  # face and ear-to-shoulder
        ]

    def process_frame_vitpose(self, frame, min_confidence=0.3):
        """Process frame with VitPose and draw the detected skeleton.

        Args:
            frame: Image array to analyse; it is not modified.
            min_confidence: A keypoint is drawn only when its confidence is
                strictly greater than this value; an edge is drawn only when
                both of its endpoints qualify.

        Returns:
            A copy of ``frame`` with edges drawn as lines in (0, 255, 0) and
            keypoints drawn as filled circles in (255, 0, 0).
        """
        # Preprocess frame
        frame_tensor = self.vitpose.preprocess_image(frame)

        # Get model prediction; no gradients are needed for inference.
        with torch.no_grad():
            heatmaps = self.vitpose.model(frame_tensor)

        # Post-process to get keypoints
        keypoints, confidences = self.vitpose.postprocess_keypoints(heatmaps, frame.shape)

        # Draw on a copy so the caller's frame stays untouched.
        frame_out = frame.copy()

        # Draw connections. Edges that reference a keypoint index the model
        # did not return are skipped rather than raising IndexError.
        num_keypoints = len(confidences)
        for start, end in self.vitpose_connections:
            if start >= num_keypoints or end >= num_keypoints:
                continue
            if confidences[start] > min_confidence and confidences[end] > min_confidence:
                pt1 = tuple(map(int, keypoints[start]))
                pt2 = tuple(map(int, keypoints[end]))
                cv2.line(frame_out, pt1, pt2, (0, 255, 0), 2)

        # Draw keypoints
        for i, (x, y) in enumerate(keypoints):
            if confidences[i] > min_confidence:
                cv2.circle(frame_out, (int(x), int(y)), 4, (255, 0, 0), -1)

        return frame_out

    # ... [rest of the PoseComparisonDebugger class remains the same]

# Example usage
def debug_pose_estimation(video_path):
    """
    Create and run the pose estimation debugger

    The video capture opened by the debugger is released when the run
    finishes, including when ``run()`` raises.

    Args:
        video_path (str): Path to the video file to analyze
    """
    debugger = PoseComparisonDebugger(video_path)
    try:
        debugger.run()
    finally:
        # Give the video handle back even if run() fails part-way.
        debugger.cap.release()
