# Hand Gesture Transformer – Inference notebook

훈련된 checkpoint를 불러와 실제 이미지를 MediaPipe로 전처리한 뒤 제스처를 분류합니다.


In [None]:
import cv2
import mediapipe as mp
import numpy as np
import torch
from collections import deque
from config import GESTURE

In [None]:
# ───────── Model loading ─────────
def load_model(path="./checkpoint/best.pth", device="cpu"):
    """Load a fully pickled model from ``path`` and put it in eval mode.

    Args:
        path: Checkpoint file to read with ``torch.load``.
        device: Device string; converted with ``torch.device`` and used as
            ``map_location`` so stored tensors are remapped onto it.

    Returns:
        The object stored in the checkpoint, after ``eval()`` was called on it.
    """
    # SECURITY: loading a whole-model checkpoint unpickles arbitrary Python
    # objects and can execute code. Only load checkpoints you trust.
    # weights_only=False is spelled out because PyTorch 2.6 switched the
    # default to True, which refuses to unpickle a full model object.
    model = torch.load(
        path,
        map_location=torch.device(device),
        weights_only=False,
    )
    model.eval()
    return model

# ───────── MediaPipe initialisation ─────────
def init_mediapipe():
    """Build the MediaPipe Hands solution limited to a single hand.

    Returns:
        A ``(mp_hands, hands)`` tuple: the ``mp.solutions.hands`` module and
        the ``Hands`` instance created from it.
    """
    hands_module = mp.solutions.hands
    # Track at most one hand; both confidence thresholds are set to 0.6.
    options = {
        "max_num_hands": 1,
        "min_detection_confidence": 0.6,
        "min_tracking_confidence": 0.6,
    }
    detector = hands_module.Hands(**options)
    return hands_module, detector

# ───────── Frame → 21×3 landmark array ─────────
def extract_joint_vector(frame, hands, mp_hands):
    """Run hand detection on one frame and return its landmark coordinates.

    Args:
        frame: Image passed to ``cv2.flip`` and converted from BGR to RGB.
        hands: Detector whose ``process`` method is called on the RGB image.
        mp_hands: Unused here; kept so existing callers keep working.

    Returns:
        A float32 array with one ``[x, y, z]`` row per landmark of the first
        detected hand (21 rows per the section header), or ``None`` when no
        hand was detected.
    """
    mirrored = cv2.flip(frame, 1)  # flip code 1 = horizontal mirror
    rgb = cv2.cvtColor(mirrored, cv2.COLOR_BGR2RGB)
    detections = hands.process(rgb).multi_hand_landmarks
    if not detections:
        return None

    first_hand = detections[0]  # only the first detected hand is used
    coords = [[pt.x, pt.y, pt.z] for pt in first_hand.landmark]
    return np.array(coords, dtype=np.float32)

# ───────── Frame sequence (30 by default) → inference ─────────
def predict(model, seq, device="cpu", seq_len=30):
    """Classify the most recent ``seq_len`` frames of ``seq``.

    Args:
        model: Callable taking a batched tensor and returning logits with the
            classes on dimension 1.
        seq: Sized iterable of equally shaped NumPy arrays, oldest first.
            Lists and ``collections.deque`` are both accepted.
        device: Device the input tensor is moved to before the forward pass.
        seq_len: Number of trailing frames fed to the model.

    Returns:
        ``(confidence, class_index)`` as Python scalars, or ``(None, None)``
        when fewer than ``seq_len`` frames are available.

    Raises:
        ValueError: If ``seq_len`` is smaller than 1.
    """
    if seq_len < 1:
        raise ValueError("seq_len must be a positive integer")
    if len(seq) < seq_len:
        return None, None

    # deque does not support slicing, so copy to a list before taking the tail.
    window = list(seq)[-seq_len:]
    # Stack the frames and add a leading batch axis:
    # (seq_len, 21, 3) -> (1, seq_len, 21, 3) for (21, 3) frames.
    x = torch.from_numpy(np.stack(window, axis=0)).unsqueeze(0)
    with torch.no_grad():
        probs = torch.softmax(model(x.to(device)), dim=1)
        conf, idx = torch.max(probs, dim=1)
    return conf.item(), idx.item()

In [None]:
# Webcam demo: grab frames, extract hand landmarks, and print the predicted
# gesture whenever the model is at least 80% confident.
model_path = "./checkpoint/best.pth"

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"device={device}")
# load_model's parameter is called `path`; passing `model_path=` raised TypeError.
model = load_model(path=model_path, device=device)
mp_hands, hands = init_mediapipe()
cap = cv2.VideoCapture(0)

seq = deque(maxlen=30)  # keeps only the 30 most recent landmark frames
try:
    while cap.isOpened():
        ret, frame = cap.read()
        if ret:
            joint = extract_joint_vector(frame, hands, mp_hands)
            if joint is not None:
                seq.append(joint)
                conf, idx = predict(model, list(seq), device)
                # predict returns (None, None) until 30 frames are buffered.
                if conf is not None and conf >= 0.8:
                    gesture = GESTURE[idx]
                    print(f"{gesture} ({conf:.2f})")

            cv2.imshow("Webcam", frame)

        # Poll the keyboard even when the read failed, so 'q' can still end
        # the loop instead of spinning forever on a dead camera.
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    cap.release()
    hands.close()  # release the MediaPipe graph resources
    cv2.destroyAllWindows()
