## macOS에서 아이폰을 웹캠으로 활용한 실시간 낙상 감지 코드

In [4]:
import cv2
import torch
import numpy as np
import torch.nn as nn
import mediapipe as mp
import time
from ultralytics import YOLO


In [5]:
# Use the MPS backend when PyTorch reports it as available; otherwise fall
# back to the CPU. The chosen backend name is echoed in upper case.
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(device.type.upper())

MPS


In [None]:
# Load the YOLOv8 model. The weights path is an absolute path on the author's
# machine; change it when running this notebook elsewhere.
yolo_model = YOLO('/Users/kimdeok-hwi/deeplearning/project/Project_humanFall/yolov8s.pt')

# Initialize MediaPipe Pose. `pose` is reused for every frame of the capture
# loop. NOTE(review): static_image_mode=False is MediaPipe's video-stream
# setting and 0.5 is the minimum detection confidence -- see the MediaPipe
# Pose documentation for the exact semantics.
mp_pose = mp.solutions.pose
pose = mp_pose.Pose(static_image_mode=False, min_detection_confidence=0.5)

# Indices of the 11 pose landmarks fed to the classifier. Per the MediaPipe
# Pose landmark map these are: nose (0), shoulders (11, 12), wrists (15, 16),
# hips (23, 24), knees (25, 26) and ankles (27, 28).
LANDMARKS = [0, 11, 12, 15, 16, 23, 24, 25, 26, 27, 28]


def process_landmarks(landmarks):
    """Return the x, y coordinates of the selected landmarks as a flat vector.

    Args:
        landmarks: NumPy array indexed as ``[landmark, coordinate]``; rows are
            picked with ``LANDMARKS`` and only the first two columns (x, y)
            are kept.

    Returns:
        A new 1-D array ``[x0, y0, x1, y1, ...]`` with two values per entry
        of ``LANDMARKS`` (22 values in total).
    """
    # One fancy-indexing step selects the rows and drops every column past y.
    return landmarks[LANDMARKS, :2].flatten()

# GRU classifier definition (the trained weights are loaded further down).
class GRUModel(torch.nn.Module):
    """Stacked GRU followed by a linear layer applied to the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Width of each GRU layer.
        num_layers: Number of stacked GRU layers.
        output_size: Number of output classes (logits).
        dropout: Dropout probability, used both between GRU layers (by
            ``nn.GRU``) and on the last hidden output before the linear layer.
    """

    def __init__(self, input_size, hidden_size=64, num_layers=2, output_size=3, dropout=0.5):
        super().__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        # Attribute names `gru` and `fc` are part of the checkpoint's
        # state_dict keys and must not be renamed.
        self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Compute class logits for a batch of sequences.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_size)``.

        Returns:
            Tensor of shape ``(batch, output_size)`` with unnormalized scores.
        """
        # Allocate the initial hidden state directly with x's dtype and
        # device, so the model also works after .double()/.half() and avoids
        # a CPU allocation followed by a device copy on every call.
        h0 = x.new_zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.gru(x, h0)
        # Only the output of the final time step is classified.
        out = self.dropout(out[:, -1, :])
        out = self.fc(out)
        return out

# Features per time step: 22 landmark coordinates (11 landmarks x (x, y)) plus
# 4 bounding-box features (width, height, ratio, confidence) -- this must
# match the vector assembled in detect_fall.
input_size = 26
hidden_size = 64
num_layers = 2
output_size = 3
dropout = 0.5    

# Build the network and restore the trained weights from a hard-coded absolute
# path. The weights are mapped to the CPU and the model is never moved to
# `device`, so inference runs on the CPU.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a
# trusted source (newer PyTorch versions offer weights_only=True -- confirm
# against the installed version).
gru_model = GRUModel(input_size, hidden_size, num_layers, output_size, dropout)
gru_model.load_state_dict(torch.load('/Users/kimdeok-hwi/deeplearning/project/Project_humanFall/best_fall_detection_gru.pt', map_location=torch.device('cpu')))
# Inference mode: disables dropout.
gru_model.eval()

# Open capture device 0 (the iPhone camera is expected to be exposed there).
cap = cv2.VideoCapture(0)

# Video recording setup: the output uses the capture's own frame size, a fixed
# 24 fps, and the 'avc1' codec (the one used on macOS).
frame_size = (
    int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)),
    int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT)),
)
fourcc = cv2.VideoWriter_fourcc(*'avc1')
out = cv2.VideoWriter('output_with_iphone.mp4', fourcc, 24.0, frame_size)

# Fall detection function
def detect_fall(landmarks, bbox_width, bbox_height, bbox_ratio, confidence):
    """Classify one pose observation with the GRU model.

    The feature vector is the 22 selected landmark coordinates followed by
    the four box features, 26 values in total. It is fed to ``gru_model`` as
    a batch of one sequence containing a single time step.

    Args:
        landmarks: Landmark array accepted by ``process_landmarks``.
        bbox_width: Width of the person's bounding box.
        bbox_height: Height of the person's bounding box.
        bbox_ratio: ``bbox_width / bbox_height``.
        confidence: Detector confidence for the bounding box.

    Returns:
        Predicted class index: 0 = no fall, 1 = fall risk, 2 = complete fall.
    """
    box_features = [bbox_width, bbox_height, bbox_ratio, confidence]
    input_data = np.concatenate([process_landmarks(landmarks), box_features])
    print('input_data shape:', input_data.shape)  # debug output

    # Shape (1, 1, n_features): batch of one, sequence length one.
    input_tensor = torch.tensor(input_data, dtype=torch.float32)[None, None, :]

    print(input_data.shape)

    with torch.no_grad():
        logits = gru_model(input_tensor)

    return logits.argmax(dim=1).item()

# State for the FPS estimate.
prev_time = 0
fps = 0

# Class id of the boxes that are treated as people (the loop's purpose is
# person detection; 0 is "person" in the COCO label set).
PERSON_CLASS_ID = 0

# Overlay style (BGR color, caption) for each class returned by detect_fall.
STATUS_STYLES = {
    0: ((0, 255, 0), "NORMAL"),    # green: no fall
    1: ((0, 255, 255), "DANGER"),  # yellow: fall risk
}
FALL_STYLE = ((0, 0, 255), "FALL")  # red: complete fall (any other class id)

try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # FPS estimate from the wall-clock time between consecutive frames.
        # Guard against a zero interval so a coarse clock cannot crash the loop.
        current_time = time.time()
        elapsed = current_time - prev_time
        fps = 1 / elapsed if elapsed > 0 else 0.0
        prev_time = current_time

        # Detect objects with YOLOv8 and keep only the person boxes.
        results = yolo_model(frame)

        person_detections = []
        for result in results:
            boxes = result.boxes.xyxy.cpu().numpy().astype(int)
            confidences = result.boxes.conf.cpu().numpy()  # confidence scores
            class_ids = result.boxes.cls.cpu().numpy()  # class ids
            person_detections.extend(
                (box, conf)
                for box, conf, cls in zip(boxes, confidences, class_ids)
                if cls == PERSON_CLASS_ID
            )

        # Extract pose landmarks with MediaPipe once per frame, and only when
        # at least one person was found. This runs before anything is drawn,
        # so overlays from earlier boxes never leak into the pose input.
        # NOTE: the pose is estimated on the whole frame, so every person box
        # in the frame is classified with the same landmarks.
        landmarks = None
        if person_detections:
            rgb_frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            results_pose = pose.process(rgb_frame)
            if results_pose.pose_landmarks:
                landmarks = np.array(
                    [[lm.x, lm.y, lm.z] for lm in results_pose.pose_landmarks.landmark]
                )

        if landmarks is not None:
            for box, confidence in person_detections:
                x1, y1, x2, y2 = box

                # Bounding-box features.
                bbox_width = x2 - x1
                bbox_height = y2 - y1
                if bbox_height <= 0:
                    # Degenerate box: the ratio would be inf/nan.
                    continue
                bbox_ratio = bbox_width / bbox_height

                # Fall classification.
                is_fall = detect_fall(landmarks, bbox_width, bbox_height, bbox_ratio, confidence)

                # Draw the result.
                color, label = STATUS_STYLES.get(is_fall, FALL_STYLE)
                cv2.rectangle(frame, (x1, y1), (x2, y2), color, 2)
                cv2.putText(frame, label, (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, color, 2)

        # FPS overlay.
        cv2.putText(frame, f"FPS: {fps:.2f}", (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

        # Save the annotated frame.
        out.write(frame)

        cv2.imshow('Fall Detection', frame)

        # Press 'q' to stop.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always free the camera and finalize the video file, even when the loop
    # is interrupted or raises.
    cap.release()
    out.release()

# Wait for a key press, then close the window; the trailing waitKey(1) gives
# the GUI event loop a chance to actually process the close request.
cv2.waitKey()
cv2.destroyAllWindows()
cv2.waitKey(1)




0: 384x640 1 keyboard, 65.2ms
Speed: 1.6ms preprocess, 65.2ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)
input_data shape: (26,)
(26,)

0: 384x640 1 laptop, 1 mouse, 1 keyboard, 72.6ms
Speed: 1.7ms preprocess, 72.6ms inference, 0.7ms postprocess per image at shape (1, 3, 384, 640)
input_data shape: (26,)
(26,)
input_data shape: (26,)
(26,)
input_data shape: (26,)
(26,)

0: 384x640 1 laptop, 67.4ms
Speed: 1.5ms preprocess, 67.4ms inference, 0.4ms postprocess per image at shape (1, 3, 384, 640)
input_data shape: (26,)
(26,)

0: 384x640 1 laptop, 1 cell phone, 84.8ms
Speed: 1.5ms preprocess, 84.8ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)
input_data shape: (26,)
(26,)

0: 384x640 1 cell phone, 66.9ms
Speed: 1.5ms preprocess, 66.9ms inference, 0.5ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640 1 laptop, 1 keyboard, 72.7ms
Speed: 2.2ms preprocess, 72.7ms inference, 0.6ms postprocess per image at shape (1, 3, 384, 640)

0: 384x640

-1