In [1]:
import torch
# Pick the compute device: use the GPU when CUDA is usable, else the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

cuda


In [3]:
import os
import json
from collections import defaultdict
import cv2
import mediapipe as mp
from ultralytics import YOLO



In [None]:
# Extract pose-landmark coordinates for training a GRU model and save them
# as JSON.

# Load the fine-tuned YOLO detector weights.
model = YOLO('D:\\project\\prjvenv\\runs\\detect\\human_fall_s30\\weights\\best.pt')

# MediaPipe Pose estimator, shared by every frame and every video.
# NOTE(review): static_image_mode=False lets MediaPipe reuse state between
# consecutive calls; confirm that is intended when the inputs are
# per-detection crops coming from different videos.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(
    static_image_mode=False,
    min_detection_confidence=0.3,
)

# MediaPipe Pose landmark indices that are kept in the output.
DESIRED_LANDMARKS = [
    0,       # nose
    11, 12,  # left / right shoulder
    15, 16,  # left / right wrist
    23, 24,  # left / right hip
    25, 26,  # left / right knee
    27, 28,  # left / right ankle
]

def adjust_bbox(bbox, scale_factor, frame_shape):
    """Scale a bounding box about its centre and clamp it to the frame.

    Args:
        bbox: ``[x1, y1, x2, y2]`` corner coordinates in pixels.
        scale_factor: Multiplier applied to both the width and the height
            of the box (e.g. ``1.2`` enlarges it by 20 %).
        frame_shape: Shape of the frame, indexed as ``(height, width, ...)``.

    Returns:
        ``[x1, y1, x2, y2]`` as ints. Every coordinate is clamped to
        ``[0, width]`` / ``[0, height]``, so the result is always safe to use
        as slice bounds (a box lying completely outside the frame collapses
        to an empty box instead of producing negative or reversed bounds).
    """
    x1, y1, x2, y2 = bbox
    frame_height, frame_width = frame_shape[0], frame_shape[1]

    center_x = (x1 + x2) / 2
    center_y = (y1 + y2) / 2
    half_width = (x2 - x1) * scale_factor / 2
    half_height = (y2 - y1) * scale_factor / 2

    def clamp(value, upper):
        # Truncate first, then clamp on BOTH sides: a negative bound would
        # wrap around when used as an array slice index.
        return max(0, min(int(value), upper))

    return [
        clamp(center_x - half_width, frame_width),
        clamp(center_y - half_height, frame_height),
        clamp(center_x + half_width, frame_width),
        clamp(center_y + half_height, frame_height),
    ]

# Per-video processing: YOLO detection followed by MediaPipe pose estimation.
def _extract_crop_landmarks(frame, adjusted_bbox):
    """Return the selected pose landmarks for one detection, or ``None``.

    Args:
        frame: Full video frame as returned by ``cap.read()``.
        adjusted_bbox: ``[x1, y1, x2, y2]`` crop bounds inside ``frame``.

    Returns:
        A dict mapping ``"landmark_<idx>"`` to ``{"x", "y", "z"}`` for every
        index in ``DESIRED_LANDMARKS``, or ``None`` when the crop is empty or
        MediaPipe finds no pose in it. ``x``/``y`` are rescaled from the crop
        to fractions of the full frame; ``z`` is stored exactly as MediaPipe
        reports it.
    """
    left, top, right, bottom = adjusted_bbox
    person_image = frame[top:bottom, left:right]

    # A degenerate box gives an empty crop, which cv2.cvtColor rejects.
    if person_image.size == 0:
        return None

    results_pose = pose.process(cv2.cvtColor(person_image, cv2.COLOR_BGR2RGB))
    if not results_pose.pose_landmarks:
        return None

    crop_height, crop_width = person_image.shape[0], person_image.shape[1]
    frame_height, frame_width = frame.shape[0], frame.shape[1]

    frame_landmarks = {}
    for idx, landmark in enumerate(results_pose.pose_landmarks.landmark):
        if idx in DESIRED_LANDMARKS:
            # Landmarks are relative to the crop; convert them to
            # coordinates relative to the whole frame.
            frame_landmarks[f"landmark_{idx}"] = {
                "x": (left + landmark.x * crop_width) / frame_width,
                "y": (top + landmark.y * crop_height) / frame_height,
                "z": landmark.z
            }
    return frame_landmarks


def process_video(video_path):
    """Detect people in every frame of a video and collect their pose landmarks.

    Each frame is passed through the YOLO ``model``; every detection is
    enlarged by 20 %, cropped and fed to the MediaPipe ``pose`` estimator.

    Args:
        video_path: Path of the video file to read.

    Returns:
        ``None`` if the video cannot be opened. Otherwise a mapping from
        ``"frame_<n>"`` to a dict holding the selected landmarks plus the
        detection's ``"class"`` name and ``"confidence"``. Frames in which
        no pose was found have no entry. When several detections in the same
        frame yield a pose, the one with the highest detection confidence is
        kept (previously whichever was iterated last silently overwrote the
        others).
    """
    cap = cv2.VideoCapture(video_path)
    if not cap.isOpened():
        print(f"열기 실패: {video_path}")
        cap.release()
        return None

    # Kept as a defaultdict so the returned type is unchanged for callers;
    # the default factory itself is never relied upon here.
    pose_data = defaultdict(list)
    frame_count = 0

    try:
        while cap.isOpened():
            success, frame = cap.read()
            if not success:
                break

            best_confidence = None

            # Detect objects with YOLO.
            for r in model(frame):
                for box in r.boxes:
                    # Original box corners, then the box enlarged by 20 %.
                    x1, y1, x2, y2 = map(int, box.xyxy[0])
                    adjusted_bbox = adjust_bbox(
                        [x1, y1, x2, y2], scale_factor=1.2, frame_shape=frame.shape
                    )

                    frame_landmarks = _extract_crop_landmarks(frame, adjusted_bbox)
                    if frame_landmarks is None:
                        continue

                    confidence = box.conf.item()
                    # One entry per frame: keep the most confident detection.
                    if best_confidence is not None and confidence <= best_confidence:
                        continue
                    best_confidence = confidence

                    frame_landmarks["class"] = model.names[int(box.cls)]
                    frame_landmarks["confidence"] = confidence
                    pose_data[f"frame_{frame_count}"] = frame_landmarks

            frame_count += 1
            if frame_count % 100 == 0:
                print(f"Processed {frame_count} frames")
    finally:
        # Release the capture even if detection or pose estimation raises.
        cap.release()

    return pose_data

# Process every video under the dataset directory and collect the results.
video_directory = 'D:\\human_fall\\video\\Training\\N'
all_pose_data = []

for root, dirs, files in os.walk(video_directory):
    for file in files:
        # Compare the extension case-insensitively so that files named
        # e.g. "clip.MP4" are not silently skipped.
        if not file.lower().endswith(".mp4"):
            continue

        video_path = os.path.join(root, file)
        print(f"Processing video: {video_path}")
        video_pose_data = process_video(video_path)

        # Skip videos that could not be opened (None) or produced no poses
        # (empty mapping).
        if video_pose_data:
            all_pose_data.append({
                "video_path": video_path,
                "pose_data": video_pose_data
            })

# Save everything to a single JSON file. The encoding is given explicitly so
# the output does not depend on the platform's default locale encoding.
with open('all_pose_data.json', 'w', encoding='utf-8') as f:
    json.dump(all_pose_data, f, indent=4)

print('저장 완료')

Processing video: D:\human_fall\video\Training\N\00002_H_A_N_C1.mp4

0: 384x640 1 Non_Fall, 70.6ms
Speed: 6.6ms preprocess, 70.6ms inference, 117.4ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 5.5ms
Speed: 2.0ms preprocess, 5.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 4.5ms
Speed: 0.0ms preprocess, 4.5ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 3.0ms
Speed: 2.0ms preprocess, 3.0ms inference, 3.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 7.0ms
Speed: 1.0ms preprocess, 7.0ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)





0: 384x640 1 Non_Fall, 3.0ms
Speed: 2.1ms preprocess, 3.0ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 4.5ms
Speed: 1.0ms preprocess, 4.5ms inference, 0.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 4.0ms
Speed: 1.0ms preprocess, 4.0ms inference, 2.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 5.0ms
Speed: 1.5ms preprocess, 5.0ms inference, 3.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 6.5ms
Speed: 1.0ms preprocess, 6.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 3.4ms
Speed: 1.0ms preprocess, 3.4ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 3.5ms
Speed: 1.2ms preprocess, 3.5ms inference, 1.0ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 Non_Fall, 8.3ms
Speed: 2.0ms preprocess, 8.3ms inference, 3.0ms postprocess per image at shape (1, 3, 384