# metadata.csv 생성하기

## "raw_data" 내부 영상 파일(mp4, mov) metadata 저장

In [1]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import os, subprocess, json
import pandas as pd
from pathlib import Path
from tqdm import tqdm

# ---------------- Paths ----------------
# Root directory that is searched recursively for source videos.
RAW_DATA_ROOT = Path("../data/new_data/raw_data")
# Location of the metadata CSV produced by this notebook.
CSV_PATH = Path("../data/new_data/video_metadata.csv")
# File suffixes that identify a video file.
VIDEO_EXTS = [".mp4", ".MP4", ".mov", ".MOV"]

# ---------------- ffprobe metadata extraction ----------------
def _to_number(value, cast):
    """Apply ``cast`` (``int`` or ``float``) to ``value``; return None if it cannot be parsed."""
    try:
        return cast(value)
    except (TypeError, ValueError):
        return None


def _parse_frame_rate(rate):
    """Convert an ffprobe ``"num/den"`` rate string to a float, or None when unusable."""
    try:
        num, den = map(int, str(rate).split("/"))
    except ValueError:
        # Missing value, no slash, or non-integer parts.
        return None
    if den == 0:
        return None
    fps = num / den
    # A rate of 0 is reported as None, matching the other "unknown" cases.
    return float(fps) if fps else None


def get_video_metadata(video_path: Path):
    """Probe the first video stream of ``video_path`` with ffprobe.

    Args:
        video_path: Path of the video file handed to ``ffprobe``.

    Returns:
        A dict with the keys ``width``, ``height``, ``codec``, ``fps``,
        ``duration_sec`` and ``num_frames``.

        * If ffprobe cannot be run, exits with an error, prints invalid JSON,
          or reports no video stream, every value is None.
        * Otherwise each field is parsed on its own: a field that cannot be
          parsed becomes None without discarding the others.
        * Fields absent from the ffprobe output keep their historical
          defaults: 0 for ``width``/``height``/``num_frames``, 0.0 for
          ``duration_sec`` and ``""`` for ``codec``.
    """
    keys = ("width", "height", "codec", "fps", "duration_sec", "num_frames")
    empty = dict.fromkeys(keys)

    cmd = [
        "ffprobe", "-v", "error",
        "-select_streams", "v:0",
        "-show_entries", "stream=width,height,codec_name,r_frame_rate,duration,nb_frames",
        "-of", "json", str(video_path)
    ]
    try:
        result = subprocess.run(cmd, capture_output=True, text=True, check=True)
        stream = json.loads(result.stdout)["streams"][0]
    except (OSError, subprocess.SubprocessError, ValueError, LookupError, TypeError):
        # OSError: ffprobe binary missing; SubprocessError: non-zero exit;
        # ValueError: invalid JSON; LookupError/TypeError: no usable "streams" entry.
        return empty
    if not isinstance(stream, dict):
        return empty

    # ffprobe emits nb_frames as a string; str() keeps this safe for other types.
    nb_frames = str(stream.get("nb_frames", "0"))

    return {
        "width": _to_number(stream.get("width", 0), int),
        "height": _to_number(stream.get("height", 0), int),
        "codec": stream.get("codec_name", ""),
        "fps": _parse_frame_rate(stream.get("r_frame_rate")),
        "duration_sec": _to_number(stream.get("duration", 0), float),
        "num_frames": _to_number(nb_frames, int) if nb_frames.isdigit() else None,
    }

# ---------------- Build metadata.csv ----------------
def build_metadata_csv():
    """Scan RAW_DATA_ROOT for videos and write one metadata row per file to CSV_PATH.

    For every video the row stores its name and sub-directory, the ffprobe
    metadata returned by ``get_video_metadata``, the matching frame and
    keypoint directories, and the number of ``*.jpg`` files currently in the
    frame directory. ``frames_verified`` and ``sapiens_done`` are initialised
    to 0.

    Returns:
        The DataFrame that was written to CSV_PATH. It always carries the
        full column set, even when no video was found.
    """
    columns = [
        "file_name", "video_path", "subdir",
        "width", "height", "codec", "fps", "duration_sec", "num_frames",
        "frames_dir", "n_extracted_frames", "keypoints_dir",
        "frames_verified", "sapiens_done", "exists",
    ]

    # Match suffixes case-insensitively so variants such as ".Mp4" are not skipped,
    # ignore directories, and sort so the row order is reproducible between runs.
    allowed_exts = {ext.lower() for ext in VIDEO_EXTS}
    video_files = sorted(
        f for f in RAW_DATA_ROOT.rglob("*")
        if f.is_file() and f.suffix.lower() in allowed_exts
    )

    # Sibling output trees live next to the raw-data root.
    data_root = RAW_DATA_ROOT.parent
    frames_root = data_root / "frames_output"
    keypoints_root = data_root / "keypoints_json"

    records = []
    for video_path in tqdm(video_files, desc="Building metadata", unit="video"):
        file_name = video_path.name
        stem = video_path.stem
        subdir = str(video_path.relative_to(RAW_DATA_ROOT).parent)

        # Paths stored in the CSV
        video_rel = RAW_DATA_ROOT / subdir / file_name
        frame_rel = frames_root / subdir / f"{stem}_frames"
        keypoints_rel = keypoints_root / subdir / f"{stem}_JSON"

        # Count the frames that have already been extracted for this video
        n_extracted_frames = len(list(frame_rel.glob("*.jpg"))) if frame_rel.exists() else 0

        # ffprobe metadata
        meta = get_video_metadata(video_path)

        records.append({
            "file_name": file_name,
            "video_path": str(video_rel),
            "subdir": subdir,
            **meta,
            "frames_dir": str(frame_rel),
            "n_extracted_frames": n_extracted_frames,
            "keypoints_dir": str(keypoints_rel),
            "frames_verified": 0,
            "sapiens_done": 0,
            "exists": int(video_path.exists()),
        })

    # Passing explicit columns keeps the header even when no video was found,
    # so the CSV stays readable by the later cells.
    df = pd.DataFrame(records, columns=columns)
    CSV_PATH.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(CSV_PATH, index=False, encoding="utf-8-sig")
    print(f"[INFO] metadata.csv 생성 완료 → {CSV_PATH}")
    return df

if __name__ == "__main__":
    build_metadata_csv()


Building metadata: 100% 1019/1019 [00:57<00:00, 17.69video/s]


[INFO] metadata.csv 생성 완료 → ../data/new_data/video_metadata.csv


## Frame 추출 확인
num_frames와 n_extracted_frames의 개수가 정확하게 일치하는지 확인하고, 일치하지 않을 경우 frames_verified에 0, 일치할 경우 1을 저장

In [2]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
from pathlib import Path

CSV_PATH = Path("../data/new_data/video_metadata.csv")

def verify_frames(debug_n=10, csv_path=None):
    """Recompute ``frames_verified`` for every row of the metadata CSV.

    A row is verified (1) only when ``num_frames`` and ``n_extracted_frames``
    are equal and greater than zero. Rows whose counts are missing or not
    numeric get 0. Requiring a positive count stops a video with an unknown
    frame count (stored as 0) and no extracted frames from passing as "0 == 0".

    Args:
        debug_n: Number of leading rows for which a debug line is printed.
        csv_path: CSV file to read and overwrite; defaults to CSV_PATH.

    Returns:
        The updated DataFrame, which has also been written back to the CSV.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    csv_path = CSV_PATH if csv_path is None else csv_path

    # Load the CSV
    df = pd.read_csv(csv_path, encoding="utf-8-sig")

    # Exact-match check, with debug output for the first rows
    updated_flags = []
    for position, (_, row) in enumerate(df.iterrows()):
        try:
            num_frames = int(row["num_frames"])
            n_extracted = int(row["n_extracted_frames"])
        except (TypeError, ValueError, OverflowError):
            # NaN, infinite or non-numeric counts cannot be verified.
            result = 0
        else:
            result = 1 if num_frames > 0 and num_frames == n_extracted else 0

        updated_flags.append(result)

        # Use the row position, not the index label, to limit debug output.
        if position < debug_n:
            print(f"[DEBUG] {row['file_name']:<30} "
                  f"num_frames={row['num_frames']} "
                  f"n_extracted={row['n_extracted_frames']} "
                  f"→ frames_verified={result}")

    # Apply the update
    df["frames_verified"] = updated_flags

    # Save (flags are stored as 0/1)
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print("[INFO] frames_verified 업데이트 완료")
    print(df["frames_verified"].value_counts())

    return df

if __name__ == "__main__":
    verify_frames(debug_n=20)  # print debug lines for the first 20 rows


[DEBUG] Biceps Curl1.MP4               num_frames=361 n_extracted=361 → frames_verified=1
[DEBUG] Biceps Curl2.MOV               num_frames=303 n_extracted=337 → frames_verified=0
[DEBUG] Biceps Curl3.MP4               num_frames=2432 n_extracted=2432 → frames_verified=1
[DEBUG] Hip Extension1.MP4             num_frames=351 n_extracted=351 → frames_verified=1
[DEBUG] Hip Extension2.MOV             num_frames=301 n_extracted=353 → frames_verified=0
[DEBUG] Hip Extension3.MP4             num_frames=320 n_extracted=336 → frames_verified=0
[DEBUG] Hip Flexion Extension1.MP4     num_frames=341 n_extracted=341 → frames_verified=1
[DEBUG] Hip Flexion Extension2.MOV     num_frames=382 n_extracted=348 → frames_verified=0
[DEBUG] Hip Flexion Extension3.MP4     num_frames=332 n_extracted=358 → frames_verified=0
[DEBUG] Hip Flexion Extension_bobathtable1.MOV num_frames=668 n_extracted=647 → frames_verified=0
[DEBUG] Hip Flexion Extension_bobathtable2.MP4 num_frames=379 n_extracted=379 → frames_ver

## sapiens done확인
num_frames 대비 JSON 파일 개수의 ratio가 0.98 미만이거나 1을 초과할 경우(또는 JSON 디렉토리가 없을 경우) sapiens_done에 0, 0.98 이상 1 이하일 경우 1을 저장

In [3]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd
from pathlib import Path

CSV_PATH = "../data/new_data/video_metadata.csv"

def update_sapiens_done(threshold=0.98, debug_n=20, csv_path=None):
    """Recompute ``sapiens_done`` for every row of the metadata CSV.

    For each row the ``*.json`` files in ``keypoints_dir`` are counted and
    divided by ``num_frames``. The flag is 1 when that ratio lies in
    ``[threshold, 1.0]``. It is 0 when the directory is missing, when
    ``num_frames`` is missing or not positive, or when the ratio falls
    outside the range.

    Args:
        threshold: Minimum JSON-to-frame ratio accepted as done.
        debug_n: Number of leading rows for which a debug line is collected.
        csv_path: CSV file to read and overwrite; defaults to CSV_PATH.

    Returns:
        The updated DataFrame, which has also been written back to the CSV.

    Raises:
        KeyError: If a required column is missing from the CSV.
    """
    csv_path = CSV_PATH if csv_path is None else csv_path
    df = pd.read_csv(csv_path, encoding="utf-8-sig")

    updated_flags = []
    debug_logs = []

    for position, (_, row) in enumerate(df.iterrows()):
        # Use the row position, not the index label, to limit debug output.
        verbose = position < debug_n
        try:
            # keypoints_dir already points at the per-video "..._JSON" folder.
            json_dir = Path(row["keypoints_dir"])

            if not json_dir.exists():
                updated_flags.append(0)
                if verbose:
                    debug_logs.append(f"[DEBUG] {row['file_name']:<30} JSON 디렉토리 없음 → sapiens_done=0")
                continue

            # Number of JSON files
            n_json = len(list(json_dir.glob("*.json")))
            num_frames = 0 if pd.isna(row["num_frames"]) else int(row["num_frames"])

            # Ratio of JSON files to frames; an unknown frame count yields 0.
            ratio = n_json / num_frames if num_frames > 0 else 0
            result = 1 if threshold <= ratio <= 1.0 else 0
        except (OSError, TypeError, ValueError, OverflowError) as e:
            # Bad path value, unreadable directory or non-numeric frame count.
            updated_flags.append(0)
            if verbose:
                debug_logs.append(f"[DEBUG] {row['file_name']:<30} ERROR → {e}")
            continue

        updated_flags.append(result)

        if verbose:
            debug_logs.append(
                f"[DEBUG] {row['file_name']:<30} num_frames={num_frames:<5} "
                f"n_json={n_json:<5} ratio={ratio:6.2f} → sapiens_done={result}"
            )

    # Apply the update
    df["sapiens_done"] = updated_flags
    df.to_csv(csv_path, index=False, encoding="utf-8-sig")

    # Summary
    print("\n".join(debug_logs))
    print("\n[INFO] sapiens_done 업데이트 완료")
    print(df["sapiens_done"].value_counts())

    return df

if __name__ == "__main__":
    update_sapiens_done(threshold=0.98, debug_n=30)


[DEBUG] Biceps Curl1.MP4               num_frames=361   n_json=361   ratio=  1.00 → sapiens_done=1
[DEBUG] Biceps Curl2.MOV               num_frames=303   n_json=337   ratio=  1.11 → sapiens_done=0
[DEBUG] Biceps Curl3.MP4               num_frames=2432  n_json=2432  ratio=  1.00 → sapiens_done=1
[DEBUG] Hip Extension1.MP4             num_frames=351   n_json=351   ratio=  1.00 → sapiens_done=1
[DEBUG] Hip Extension2.MOV             num_frames=301   n_json=353   ratio=  1.17 → sapiens_done=0
[DEBUG] Hip Extension3.MP4             num_frames=320   n_json=336   ratio=  1.05 → sapiens_done=0
[DEBUG] Hip Flexion Extension1.MP4     num_frames=341   n_json=341   ratio=  1.00 → sapiens_done=1
[DEBUG] Hip Flexion Extension2.MOV     num_frames=382   n_json=348   ratio=  0.91 → sapiens_done=0
[DEBUG] Hip Flexion Extension3.MP4     num_frames=332   n_json=358   ratio=  1.08 → sapiens_done=0
[DEBUG] Hip Flexion Extension_bobathtable1.MOV num_frames=668   n_json=14    ratio=  0.02 → sapiens_done=0
[D

# EDA

In [4]:
import pandas as pd
from IPython.display import display  # Jupyter 환경일 때 표로 보기 좋게 출력

CSV_PATH = "../data/new_data/video_metadata.csv"

def check_last_rows(n=5):
    """Load the metadata CSV, print its row count and show its final rows.

    Args:
        n: Number of trailing rows to show.

    Returns:
        The DataFrame slice holding the last ``n`` rows.
    """
    frame = pd.read_csv(CSV_PATH, encoding="utf-8-sig")
    row_count = len(frame)

    print("[INFO] CSV 로드 완료")
    print(f"총 행 개수: {row_count}\n")

    # Render the trailing rows through IPython's display helper.
    tail_rows = frame.tail(n)
    display(tail_rows)
    return tail_rows

if __name__ == "__main__":
    check_last_rows(5)


[INFO] CSV 로드 완료
총 행 개수: 1019



Unnamed: 0,file_name,video_path,subdir,width,height,codec,fps,duration_sec,num_frames,frames_dir,n_extracted_frames,keypoints_dir,frames_verified,sapiens_done,exists
1014,경사진 침대에서 체스트 프레스(탄력밴드).MP4,../data/new_data/raw_data/sample_data/ICU_samp...,sample_data/ICU_sample_video,3840,2160,hevc,29.97003,10.2102,306,../data/new_data/frames_output/sample_data/ICU...,306,../data/new_data/keypoints_json/sample_data/IC...,1,0,1
1015,경사진 침대에서 체스트 프레스.MP4,../data/new_data/raw_data/sample_data/ICU_samp...,sample_data/ICU_sample_video,3840,2160,hevc,29.97003,9.1091,273,../data/new_data/frames_output/sample_data/ICU...,273,../data/new_data/keypoints_json/sample_data/IC...,1,0,1
1016,경사진 침대에서 팔 수평 벌리기(탄력밴드).MP4,../data/new_data/raw_data/sample_data/ICU_samp...,sample_data/ICU_sample_video,3840,2160,hevc,29.97003,11.745067,352,../data/new_data/frames_output/sample_data/ICU...,352,../data/new_data/keypoints_json/sample_data/IC...,1,0,1
1017,경사진 침대에서 팔 위로 뻗어 옆으로 벌리기(탄력밴드).MP4,../data/new_data/raw_data/sample_data/ICU_samp...,sample_data/ICU_sample_video,3840,2160,hevc,29.97003,10.977633,329,../data/new_data/frames_output/sample_data/ICU...,329,../data/new_data/keypoints_json/sample_data/IC...,1,1,1
1018,경사진 침대에서 팔꿈치 펴 팔 앞으로 모으기.MP4,../data/new_data/raw_data/sample_data/ICU_samp...,sample_data/ICU_sample_video,3840,2160,hevc,29.97003,12.946267,388,../data/new_data/frames_output/sample_data/ICU...,388,../data/new_data/keypoints_json/sample_data/IC...,1,0,1


In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import pandas as pd

CSV_PATH = "../data/new_data/video_metadata.csv"

def check_metadata(n=10):
    """Print summary statistics of the verification flags in the metadata CSV.

    Prints the row count, the value distribution of ``frames_verified`` and
    ``sapiens_done``, their cross-tabulation, and a fixed-seed random sample
    of up to ``n`` rows.

    Args:
        n: Maximum number of rows in the random sample.
    """
    frame = pd.read_csv(CSV_PATH, encoding="utf-8-sig")
    total_rows = len(frame)

    print("[INFO] CSV 로드 완료")
    print(f"총 행 개수: {total_rows}\n")

    # Per-flag distributions, each followed by a blank line.
    flag_titles = (
        ("frames_verified", "[EDA] frames_verified 분포 (0=검증실패, 1=검증성공)"),
        ("sapiens_done", "[EDA] sapiens_done 분포 (0=미완료, 1=완료)"),
    )
    for column, title in flag_titles:
        print(title)
        print(frame[column].value_counts(dropna=False))
        print()

    # Cross-tabulation of the two flags.
    print("[EDA] frames_verified vs sapiens_done 교차표")
    cross_table = pd.crosstab(frame["frames_verified"], frame["sapiens_done"])
    print(cross_table)
    print()

    # Random sample; the fixed seed keeps it identical between runs.
    print(f"[SAMPLE] 무작위 {n}개 행 확인")
    sample_size = min(n, total_rows)
    shown_columns = ["file_name", "frames_verified", "sapiens_done", "n_extracted_frames", "num_frames"]
    sampled = frame.sample(sample_size, random_state=42)
    print(sampled[shown_columns])

if __name__ == "__main__":
    check_metadata(n=10)


[INFO] CSV 로드 완료
총 행 개수: 1019

[EDA] frames_verified 분포 (0=검증실패, 1=검증성공)
frames_verified
1    961
0     58
Name: count, dtype: int64

[EDA] sapiens_done 분포 (0=미완료, 1=완료)
sapiens_done
0    851
1    168
Name: count, dtype: int64

[EDA] frames_verified vs sapiens_done 교차표
sapiens_done       0    1
frames_verified          
0                 54    4
1                797  164

[SAMPLE] 무작위 10개 행 확인
                                  file_name  frames_verified  sapiens_done  \
522                        Chest Press1.MOV                1             0   
453              One leg bridge_static1.MOV                1             0   
439                      Hip Extension2.MP4                1             0   
31                       Ankle Pumping2.MP4                1             0   
615                      Bridge_static1.MOV                1             0   
584  Hip Flexion Extension_bobathtable3.MP4                1             0   
442                   Hip knee flexion2.MP4              