In [5]:
import json
import numpy as np

def normalize_keypoints_from_json(input_json_path, output_json_path, mode='all'):
    
    with open(input_json_path, 'r') as f:
        data = json.load(f)
    people = data['landmarks']

    def to_np(key):
        arr = people.get(key, [])
        arr = np.array(arr)
        if arr.size == 0:
            return np.zeros((0, 3))
        return arr.reshape(-1, 3)

    pose = to_np('pose_keypoints_3d')
    hand_l = to_np('hand_left_keypoints_3d')
    hand_r = to_np('hand_right_keypoints_3d')
    face = to_np('face_keypoints_3d')

    # 정규화 함수 정의
    def normalize_all_parts(pose, hand_l, hand_r, face):
        all_pts = [x for x in [pose, hand_l, hand_r, face] if x.size > 0]
        if not all_pts:
            return pose, hand_l, hand_r, face
        all_pts = np.concatenate(all_pts, axis=0)
        min_xy = all_pts[:, :3].min(axis=0)
        max_xy = all_pts[:, :3].max(axis=0)
        scale = (max_xy - min_xy) + 1e-6
        def apply_norm(x):
            if x.size == 0:
                return x
            x_norm = x.copy()
            x_norm[:, :3] = (x[:, :3] - min_xy) / scale
            return x_norm
        return (
            apply_norm(pose),
            apply_norm(hand_l),
            apply_norm(hand_r),
            apply_norm(face)
        )

    def normalize_by_part(pose, hand_l, hand_r, face):
        def normalize(part, anchor_idx):
            if part.size == 0 or anchor_idx >= len(part):
                return part
            anchor = part[anchor_idx, :3]
            rel = part[:, :3] - anchor  # anchor 기준 상대 좌표
            scale = np.linalg.norm(rel, axis=1).max() + 1e-6
            part_norm = part.copy()
            part_norm[:, :3] = rel / scale
            return part_norm
        # 하체 제외 (pose index: 9~14 제거)
        upper_pose_indices = [i for i in range(len(pose)) if i not in [9,10,11,12,13,14]]
        pose_upper = pose[upper_pose_indices]
        pose_norm = normalize(pose_upper, anchor_idx=1)       # 목을 anchor로 (index 1: Neck)
        left_norm = normalize(hand_l, anchor_idx=0)           # 왼손: 손목 기준
        right_norm = normalize(hand_r, anchor_idx=0)          # 오른손: 손목 기준
        face_norm = normalize(face, anchor_idx=30)            # 얼굴: 코 기준
        return pose_norm, left_norm, right_norm, face_norm

    # 정규화 적용
    if mode == 'all':
        pose_n, hand_l_n, hand_r_n, face_n = normalize_all_parts(pose, hand_l, hand_r, face)
    elif mode == 'part':
        pose_n, hand_l_n, hand_r_n, face_n = normalize_by_part(pose, hand_l, hand_r, face)
    else:
        raise ValueError("mode은 'all' 또는 'part'만 가능합니다.")

    # 다시 json 형태로 변환
    def to_list(x):
        return x.flatten().tolist() if x.size > 0 else []
    people['pose_keypoints_3d'] = to_list(pose_n)
    people['hand_left_keypoints_3d'] = to_list(hand_l_n)
    people['hand_right_keypoints_3d'] = to_list(hand_r_n)
    people['face_keypoints_3d'] = to_list(face_n)

    with open(output_json_path, 'w') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
        
input_json_path = 'C:/Users/dpwl1/Downloads/FIRE/1_1/NIA_SL_G2_FIRE001529_1_TW104.json'
output_json_path = 'C:/Users/dpwl1/Downloads/FIRE_normalized/1_1/NIA_SL_G2_FIRE001529_1_TW104.json'

normalize_keypoints_from_json(input_json_path, output_json_path, mode='part')


### Training dataset Normalization

In [1]:
import os
import json
import numpy as np
from glob import glob

def normalize_group_by_keyword(input_root, output_root, keywords, mode='all'):
    os.makedirs(output_root, exist_ok=True)

    def to_np(key, people):
        arr = people.get(key, [])
        arr = np.array(arr)
        if arr.size == 0:
            return np.zeros((0, 3))
        return arr.reshape(-1, 3)

    def normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale):
        def apply_norm(x):
            if x.size == 0:
                return x
            x_norm = x.copy()
            x_norm[:, :3] = (x[:, :3] - min_xy) / scale
            return x_norm
        return (
            apply_norm(pose),
            apply_norm(hand_l),
            apply_norm(hand_r),
            apply_norm(face)
        )

    def normalize_by_part(pose, hand_l, hand_r, face):
        def normalize(part, anchor_idx):
            if part.size == 0 or anchor_idx >= len(part):
                return part
            anchor = part[anchor_idx, :3]
            rel = part[:, :3] - anchor
            scale = np.linalg.norm(rel, axis=1).max() + 1e-6
            part_norm = part.copy()
            part_norm[:, :3] = rel / scale
            return part_norm

        upper_pose_indices = [i for i in range(len(pose)) if i not in [9,10,11,12,13,14]]
        pose_upper = pose[upper_pose_indices]
        pose_norm = normalize(pose_upper, anchor_idx=1)     # 목
        hand_l_norm = normalize(hand_l, anchor_idx=0)       # 왼손목
        hand_r_norm = normalize(hand_r, anchor_idx=0)       # 오른손목
        face_norm = normalize(face, anchor_idx=30)          # 코
        return pose_norm, hand_l_norm, hand_r_norm, face_norm

    def to_list(x):
        return x.flatten().tolist() if x.size > 0 else []

    for keyword in keywords:
        print(f'Processing keyword: {keyword}')
        files = glob(os.path.join(input_root, '**', f'*{keyword}*.json'), recursive=True)
        if not files:
            print(f'No files found for keyword: {keyword}')
            continue

        # mode='all'인 경우 전체 정규화 기준 계산
        if mode == 'all':
            all_keypoints = []
            for file_path in files:
                with open(file_path, 'r') as f:
                    data = json.load(f)
                people = data['landmarks']
                pose = to_np('pose_keypoints_3d', people)
                hand_l = to_np('hand_left_keypoints_3d', people)
                hand_r = to_np('hand_right_keypoints_3d', people)
                face = to_np('face_keypoints_3d', people)

                parts = [x for x in [pose, hand_l, hand_r, face] if x.size > 0]
                if parts:
                    all_keypoints.append(np.concatenate(parts, axis=0))
            if not all_keypoints:
                continue
            all_points = np.concatenate(all_keypoints, axis=0)
            min_xy = all_points[:, :3].min(axis=0)
            max_xy = all_points[:, :3].max(axis=0)
            scale = (max_xy - min_xy) + 1e-6

        # 각 파일에 정규화 적용 및 저장
        for file_path in files:
            with open(file_path, 'r') as f:
                data = json.load(f)
            people = data['landmarks']
            pose = to_np('pose_keypoints_3d', people)
            hand_l = to_np('hand_left_keypoints_3d', people)
            hand_r = to_np('hand_right_keypoints_3d', people)
            face = to_np('face_keypoints_3d', people)

            if mode == 'all':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale)
            elif mode == 'part':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_by_part(pose, hand_l, hand_r, face)
            else:
                raise ValueError("mode는 'all' 또는 'part' 중 하나여야 합니다.")

            people['pose_keypoints_3d'] = to_list(pose_n)
            people['hand_left_keypoints_3d'] = to_list(hand_l_n)
            people['hand_right_keypoints_3d'] = to_list(hand_r_n)
            people['face_keypoints_3d'] = to_list(face_n)

            rel_path = os.path.relpath(file_path, input_root)
            output_file_path = os.path.join(output_root, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            with open(output_file_path, 'w') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

        print(f'Normalized {len(files)} files for keyword \"{keyword}\" in mode \"{mode}\".')

In [2]:
# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_1'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['TW03', 'TW87', 'TW104']  # 원하는 키워드 목록

normalize_group_by_keyword(input_root, output_root, keywords, mode='part')

Processing keyword: TW03
Normalized 354 files for keyword "TW03" in mode "part".
Processing keyword: TW87
Normalized 270 files for keyword "TW87" in mode "part".
Processing keyword: TW104
Normalized 93 files for keyword "TW104" in mode "part".


In [3]:
# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_2'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['TW03', 'TW114', 'TW04', 'TW30']  # 원하는 키워드 목록

normalize_group_by_keyword(input_root, output_root, keywords, mode='part')

Processing keyword: TW03
Normalized 438 files for keyword "TW03" in mode "part".
Processing keyword: TW114
Normalized 242 files for keyword "TW114" in mode "part".
Processing keyword: TW04
Normalized 78 files for keyword "TW04" in mode "part".
Processing keyword: TW30
Normalized 158 files for keyword "TW30" in mode "part".


In [4]:
# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_3'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['KU02']  # 원하는 키워드 목록

normalize_group_by_keyword(input_root, output_root, keywords, mode='part')

Processing keyword: KU02
Normalized 1962 files for keyword "KU02" in mode "part".


In [5]:
# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/untact_morpheme/1_1'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['CROWD04', 'CROWD1067644', 'CROWD1075', 'CROWD128', 'CROWD87', 'CROWD112']  # 원하는 키워드 목록

normalize_group_by_keyword(input_root, output_root, keywords, mode='part')

Processing keyword: CROWD04
Normalized 643 files for keyword "CROWD04" in mode "part".
Processing keyword: CROWD1067644
Normalized 943 files for keyword "CROWD1067644" in mode "part".
Processing keyword: CROWD1075
Normalized 724 files for keyword "CROWD1075" in mode "part".
Processing keyword: CROWD128
Normalized 29 files for keyword "CROWD128" in mode "part".
Processing keyword: CROWD87
Normalized 21 files for keyword "CROWD87" in mode "part".
Processing keyword: CROWD112
Normalized 281 files for keyword "CROWD112" in mode "part".


### Validation dataset Normalization

In [6]:
import os
import json
import numpy as np
from glob import glob

def to_np(key, people):
    arr = people.get(key, [])
    arr = np.array(arr)
    if arr.size == 0:
        return np.zeros((0, 3))
    return arr.reshape(-1, 3)

def to_list(x):
    return x.flatten().tolist() if x.size > 0 else []

def normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale):
    def apply_norm(x):
        if x.size == 0:
            return x
        x_norm = x.copy()
        x_norm[:, :3] = (x[:, :3] - min_xy) / scale
        return x_norm
    return (
        apply_norm(pose),
        apply_norm(hand_l),
        apply_norm(hand_r),
        apply_norm(face)
    )

def normalize_by_part(pose, hand_l, hand_r, face):
    def normalize(part, anchor_idx):
        if part.size == 0 or anchor_idx >= len(part):
            return part
        anchor = part[anchor_idx, :3]
        rel = part[:, :3] - anchor
        scale = np.linalg.norm(rel, axis=1).max() + 1e-6
        part_norm = part.copy()
        part_norm[:, :3] = rel / scale
        return part_norm

    upper_pose_indices = [i for i in range(len(pose)) if i not in [9,10,11,12,13,14]]
    pose_upper = pose[upper_pose_indices]
    pose_norm = normalize(pose_upper, anchor_idx=1)
    hand_l_norm = normalize(hand_l, anchor_idx=0)
    hand_r_norm = normalize(hand_r, anchor_idx=0)
    face_norm = normalize(face, anchor_idx=30)
    return pose_norm, hand_l_norm, hand_r_norm, face_norm

def compute_training_stats(input_root, keywords):
    """각 keyword별 (min, scale) 계산"""
    stats = {}

    for keyword in keywords:
        files = glob(os.path.join(input_root, '**', f'*{keyword}*.json'), recursive=True)
        all_keypoints = []
        for file_path in files:
            with open(file_path, 'r') as f:
                data = json.load(f)
            people = data['landmarks']
            pose = to_np('pose_keypoints_3d', people)
            hand_l = to_np('hand_left_keypoints_3d', people)
            hand_r = to_np('hand_right_keypoints_3d', people)
            face = to_np('face_keypoints_3d', people)

            parts = [x for x in [pose, hand_l, hand_r, face] if x.size > 0]
            if parts:
                all_keypoints.append(np.concatenate(parts, axis=0))

        if all_keypoints:
            all_points = np.concatenate(all_keypoints, axis=0)
            min_xy = all_points[:, :3].min(axis=0)
            max_xy = all_points[:, :3].max(axis=0)
            scale = (max_xy - min_xy) + 1e-6
            stats[keyword] = (min_xy, scale)
        else:
            print(f"[Warning] No training keypoints found for keyword: {keyword}")

    return stats

def normalize_validation_data(input_root, output_root, keywords, stats, mode='all'):
    os.makedirs(output_root, exist_ok=True)

    files = glob(os.path.join(input_root, '**', '*.json'), recursive=True)

    for file_path in files:
        filename = os.path.basename(file_path)
        matched_keyword = next((k for k in keywords if k in filename), None)

        if matched_keyword is None:
            print(f"[Skip] No matching keyword for file: {filename}")
            continue

        min_xy, scale = stats.get(matched_keyword, (None, None))
        if min_xy is None or scale is None:
            print(f"[Warning] No normalization stats for keyword: {matched_keyword}")
            continue

        with open(file_path, 'r') as f:
            data = json.load(f)
        people = data['landmarks']
        pose = to_np('pose_keypoints_3d', people)
        hand_l = to_np('hand_left_keypoints_3d', people)
        hand_r = to_np('hand_right_keypoints_3d', people)
        face = to_np('face_keypoints_3d', people)

        if mode == 'all':
            pose_n, hand_l_n, hand_r_n, face_n = normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale)
        elif mode == 'part':
            pose_n, hand_l_n, hand_r_n, face_n = normalize_by_part(pose, hand_l, hand_r, face)
        else:
            raise ValueError("mode는 'all' 또는 'part' 중 하나여야 합니다.")

        people['pose_keypoints_3d'] = to_list(pose_n)
        people['hand_left_keypoints_3d'] = to_list(hand_l_n)
        people['hand_right_keypoints_3d'] = to_list(hand_r_n)
        people['face_keypoints_3d'] = to_list(face_n)

        rel_path = os.path.relpath(file_path, input_root)
        output_file_path = os.path.join(output_root, rel_path)
        os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

        with open(output_file_path, 'w') as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f'[OK] Normalized: {rel_path}')


In [7]:
train_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_1'
val_root = 'C:/Users/dpwl1/Downloads/tact_morpheme_val/1_1'
val_out = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['TW03', 'TW87', 'TW104']  # 원하는 키워드 지정

# Step 1. training data 기준 계산
stats = compute_training_stats(train_root, keywords)

# Step 2. validation data 정규화 적용
normalize_validation_data(val_root, val_out, keywords, stats, mode='part')

[OK] Normalized: NIA_SL_G2_FIRE000009_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000012_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000054_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000059_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000086_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000088_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000093_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000110_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000113_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000176_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000189_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000241_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000269_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000309_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000313_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000325_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000326_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000341_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000368_1_TW03.json
[OK] Normalized: NIA_SL_G2_FIRE000382_1_TW03.json


In [8]:
train_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_2'
val_root = 'C:/Users/dpwl1/Downloads/tact_morpheme_val/1_2'
val_out = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['TW03', 'TW114', 'TW04', 'TW30']  # 원하는 키워드 지정

# Step 1. training data 기준 계산
stats = compute_training_stats(train_root, keywords)

# Step 2. validation data 정규화 적용
normalize_validation_data(val_root, val_out, keywords, stats, mode='part')

[OK] Normalized: NIA_SL_G2_FIRE000043_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000043_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000121_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000121_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000212_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000212_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000219_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000219_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000221_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000221_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000247_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000247_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000346_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000346_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000396_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000396_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000460_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000460_2_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE000546_1_TW114.json
[OK] Normalized: NIA_SL_G2_FIRE

In [9]:
train_root = 'C:/Users/dpwl1/Downloads/tact_morpheme/1_3'
val_root = 'C:/Users/dpwl1/Downloads/tact_morpheme_val/1_3'
val_out = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['KU02']  # 원하는 키워드 지정

# Step 1. training data 기준 계산
stats = compute_training_stats(train_root, keywords)

# Step 2. validation data 정규화 적용
normalize_validation_data(val_root, val_out, keywords, stats, mode='part')

[OK] Normalized: NIA_SL_G2_FIRE000133_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000133_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000133_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000152_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000152_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000152_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000183_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000183_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000183_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000248_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000248_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000248_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000273_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000273_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000273_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000292_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000292_2_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000292_3_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000301_1_KU02.json
[OK] Normalized: NIA_SL_G2_FIRE000301_2_KU02.json


In [10]:
train_root = 'C:/Users/dpwl1/Downloads/untact_morpheme/1_1'
val_root = 'C:/Users/dpwl1/Downloads/untact_morpheme_val/1_1'
val_out = 'C:/Users/dpwl1/Downloads/normalized'
keywords = ['CROWD04', 'CROWD1067644', 'CROWD1075', 'CROWD128', 'CROWD87', 'CROWD112']  # 원하는 키워드 지정

# Step 1. training data 기준 계산
stats = compute_training_stats(train_root, keywords)

# Step 2. validation data 정규화 적용
normalize_validation_data(val_root, val_out, keywords, stats, mode='part')



In [21]:
import os
import json
import numpy as np
from glob import glob

def normalize_group_by_keyword_with_split(input_root, output_root, split_json_path, keywords, mode='all'):
    os.makedirs(output_root, exist_ok=True)

    # Load data split information
    with open(split_json_path, 'r', encoding='utf-8') as f:
        data_split = json.load(f)

    # Flatten all paths and group by split
    split_paths = {
        split: [os.path.join(input_root, path) for path in paths]
        for split, paths in data_split.items()
    }

    def to_np(key, people):
        arr = people.get(key, [])
        arr = np.array(arr)
        if arr.size == 0:
            return np.zeros((0, 3))
        return arr.reshape(-1, 3)

    def normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale):
        def apply_norm(x):
            if x.size == 0:
                return x
            x_norm = x.copy()
            x_norm[:, :3] = (x[:, :3] - min_xy) / scale
            return x_norm
        return (
            apply_norm(pose),
            apply_norm(hand_l),
            apply_norm(hand_r),
            apply_norm(face)
        )

    def normalize_by_part(pose, hand_l, hand_r, face):
        def normalize(part, anchor_idx):
            if part.size == 0 or anchor_idx >= len(part):
                return part
            anchor = part[anchor_idx, :3]
            rel = part[:, :3] - anchor
            scale = np.linalg.norm(rel, axis=1).max() + 1e-6
            part_norm = part.copy()
            part_norm[:, :3] = rel / scale
            return part_norm

        upper_pose_indices = [i for i in range(len(pose)) if i not in [9,10,11,12,13,14]]
        pose_upper = pose[upper_pose_indices]
        pose_norm = normalize(pose_upper, anchor_idx=1)     # 목
        hand_l_norm = normalize(hand_l, anchor_idx=0)       # 왼손목
        hand_r_norm = normalize(hand_r, anchor_idx=0)       # 오른손목
        face_norm = normalize(face, anchor_idx=30)          # 코
        return pose_norm, hand_l_norm, hand_r_norm, face_norm

    def to_list(x):
        return x.flatten().tolist() if x.size > 0 else []

    # Process each keyword
    for keyword in keywords:
        print(f'Processing keyword: {keyword}')

        # Step 1: Compute normalization parameters from TRAIN data only
        train_files = [f for f in split_paths['train'] if keyword in os.path.basename(f)]
        if not train_files:
            print(f'No train files found for keyword: {keyword}')
            continue

        if mode == 'all':
            all_keypoints = []
            for file_path in train_files:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                people = data['landmarks']
                pose = to_np('pose_keypoints_3d', people)
                hand_l = to_np('hand_left_keypoints_3d', people)
                hand_r = to_np('hand_right_keypoints_3d', people)
                face = to_np('face_keypoints_3d', people)

                parts = [x for x in [pose, hand_l, hand_r, face] if x.size > 0]
                if parts:
                    all_keypoints.append(np.concatenate(parts, axis=0))
            if not all_keypoints:
                print(f'No valid keypoints in train files for keyword: {keyword}')
                continue
            all_points = np.concatenate(all_keypoints, axis=0)
            min_xy = all_points[:, :3].min(axis=0)
            max_xy = all_points[:, :3].max(axis=0)
            scale = (max_xy - min_xy) + 1e-6

        # Step 2: Apply normalization to all (train + valid + test) files
        all_files = sum(split_paths.values(), [])  # flatten all splits
        target_files = [f for f in all_files if keyword in os.path.basename(f)]
        print(f'Applying normalization to {len(target_files)} files.')

        for file_path in target_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            people = data['landmarks']
            pose = to_np('pose_keypoints_3d', people)
            hand_l = to_np('hand_left_keypoints_3d', people)
            hand_r = to_np('hand_right_keypoints_3d', people)
            face = to_np('face_keypoints_3d', people)

            if mode == 'all':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale)
            elif mode == 'part':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_by_part(pose, hand_l, hand_r, face)
            else:
                raise ValueError("mode는 'all' 또는 'part' 중 하나여야 합니다.")

            people['pose_keypoints_3d'] = to_list(pose_n)
            people['hand_left_keypoints_3d'] = to_list(hand_l_n)
            people['hand_right_keypoints_3d'] = to_list(hand_r_n)
            people['face_keypoints_3d'] = to_list(face_n)

            rel_path = os.path.relpath(file_path, input_root)
            output_file_path = os.path.join(output_root, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            with open(output_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

        print(f'Finished keyword: {keyword}\n')

# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/labeled'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
split_json_path = 'C:/Users/dpwl1/Desktop/대학교/KUBIG/2025-1 Conference/AI 모델 소스코드/NIA-1-69 재난안전정보 수어영상 데이터_AI 모델소스/data/data_split.json'
keywords = ['TW03', 'TW87', 'TW104', 'TW114', 'TW04', 'TW30', 'KU02', 'CROWD04', 'CROWD1067644', 'CROWD1075', 'CROWD128', 'CROWD87', 'CROWD112']

normalize_group_by_keyword_with_split(input_root, output_root, split_json_path, keywords, mode='part')  # 또는 'all'


Processing keyword: TW03
Applying normalization to 996 files.


FileNotFoundError: [Errno 2] No such file or directory: 'C:/Users/dpwl1/Downloads/labeled\\NIA_SL_G2_FIRE000412_1_TW03.json'

In [25]:
import os
import json
import numpy as np
from glob import glob

def normalize_group_by_keyword_with_split(input_root, output_root, split_json_path, keywords, mode='all'):
    os.makedirs(output_root, exist_ok=True)

    # Load data split information
    with open(split_json_path, 'r', encoding='utf-8') as f:
        data_split = json.load(f)

    missing_files = []
    rev_data_split = {k: v.copy() for k, v in data_split.items()}  # 사본을 만들어 수정에 사용

    # Flatten all paths and group by split, checking file existence
    split_paths = {}
    for split, paths in data_split.items():
        full_paths = []
        for path in paths:
            full_path = os.path.join(input_root, path)
            if os.path.exists(full_path):
                full_paths.append(full_path)
            else:
                missing_files.append(full_path)
                if path in rev_data_split[split]:  # 제거
                    rev_data_split[split].remove(path)
        split_paths[split] = full_paths

    def to_np(key, people):
        arr = people.get(key, [])
        arr = np.array(arr)
        if arr.size == 0:
            return np.zeros((0, 3))
        return arr.reshape(-1, 3)

    def normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale):
        def apply_norm(x):
            if x.size == 0:
                return x
            x_norm = x.copy()
            x_norm[:, :3] = (x[:, :3] - min_xy) / scale
            return x_norm
        return (
            apply_norm(pose),
            apply_norm(hand_l),
            apply_norm(hand_r),
            apply_norm(face)
        )

    def normalize_by_part(pose, hand_l, hand_r, face):
        def normalize(part, anchor_idx):
            if part.size == 0 or anchor_idx >= len(part):
                return part
            anchor = part[anchor_idx, :3]
            rel = part[:, :3] - anchor
            scale = np.linalg.norm(rel, axis=1).max() + 1e-6
            part_norm = part.copy()
            part_norm[:, :3] = rel / scale
            return part_norm

        upper_pose_indices = [i for i in range(len(pose)) if i not in [9,10,11,12,13,14]]
        pose_upper = pose[upper_pose_indices]
        pose_norm = normalize(pose_upper, anchor_idx=1)     # 목
        hand_l_norm = normalize(hand_l, anchor_idx=0)       # 왼손목
        hand_r_norm = normalize(hand_r, anchor_idx=0)       # 오른손목
        face_norm = normalize(face, anchor_idx=30)          # 코
        return pose_norm, hand_l_norm, hand_r_norm, face_norm

    def to_list(x):
        return x.flatten().tolist() if x.size > 0 else []

    for keyword in keywords:
        print(f'Processing keyword: {keyword}')

        # Step 1: Compute normalization parameters from TRAIN data only
        train_files = [f for f in split_paths['train'] if keyword in os.path.basename(f)]
        if not train_files:
            print(f'No train files found for keyword: {keyword}')
            continue

        if mode == 'all':
            all_keypoints = []
            for file_path in train_files:
                with open(file_path, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                people = data['landmarks']
                pose = to_np('pose_keypoints_3d', people)
                hand_l = to_np('hand_left_keypoints_3d', people)
                hand_r = to_np('hand_right_keypoints_3d', people)
                face = to_np('face_keypoints_3d', people)

                parts = [x for x in [pose, hand_l, hand_r, face] if x.size > 0]
                if parts:
                    all_keypoints.append(np.concatenate(parts, axis=0))
            if not all_keypoints:
                print(f'No valid keypoints in train files for keyword: {keyword}')
                continue
            all_points = np.concatenate(all_keypoints, axis=0)
            min_xy = all_points[:, :3].min(axis=0)
            max_xy = all_points[:, :3].max(axis=0)
            scale = (max_xy - min_xy) + 1e-6

        # Step 2: Apply normalization to all (train + valid + test) files
        all_files = sum(split_paths.values(), [])  # flatten all splits
        target_files = [f for f in all_files if keyword in os.path.basename(f)]
        print(f'Applying normalization to {len(target_files)} files.')

        for file_path in target_files:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            people = data['landmarks']
            pose = to_np('pose_keypoints_3d', people)
            hand_l = to_np('hand_left_keypoints_3d', people)
            hand_r = to_np('hand_right_keypoints_3d', people)
            face = to_np('face_keypoints_3d', people)

            if mode == 'all':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_all_parts(pose, hand_l, hand_r, face, min_xy, scale)
            elif mode == 'part':
                pose_n, hand_l_n, hand_r_n, face_n = normalize_by_part(pose, hand_l, hand_r, face)
            else:
                raise ValueError("mode는 'all' 또는 'part' 중 하나여야 합니다.")

            people['pose_keypoints_3d'] = to_list(pose_n)
            people['hand_left_keypoints_3d'] = to_list(hand_l_n)
            people['hand_right_keypoints_3d'] = to_list(hand_r_n)
            people['face_keypoints_3d'] = to_list(face_n)

            rel_path = os.path.relpath(file_path, input_root)
            output_file_path = os.path.join(output_root, rel_path)
            os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

            with open(output_file_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, ensure_ascii=False, indent=2)

        print(f'Finished keyword: {keyword}\n')

    # Save revised data split without missing files
    rev_json_path = split_json_path.replace('.json', '_rev.json')
    with open(rev_json_path, 'w', encoding='utf-8') as f:
        json.dump(rev_data_split, f, ensure_ascii=False, indent=2)
    print(f"\n수정된 split 파일 저장 완료: {rev_json_path}")

    # 마지막에 누락된 파일 경로 출력
    if missing_files:
        print("\n누락된 파일들 (split JSON에는 있지만 실제로 존재하지 않음):")
        for path in missing_files:
            print(f'- {path}')
    else:
        print("\n모든 파일이 존재합니다.")


# 사용 예시
input_root = 'C:/Users/dpwl1/Downloads/labeled'
output_root = 'C:/Users/dpwl1/Downloads/normalized'
split_json_path = 'C:/Users/dpwl1/Desktop/대학교/KUBIG/2025-1 Conference/AI 모델 소스코드/NIA-1-69 재난안전정보 수어영상 데이터_AI 모델소스/data/data_split.json'
keywords = ['TW03', 'TW87', 'TW104', 'TW114', 'TW04', 'TW30', 'KU02', 'CROWD04', 'CROWD1067644', 'CROWD1075', 'CROWD128', 'CROWD87', 'CROWD112']

normalize_group_by_keyword_with_split(input_root, output_root, split_json_path, keywords, mode='part')


Processing keyword: TW03
Applying normalization to 896 files.
Finished keyword: TW03

Processing keyword: TW87
Applying normalization to 304 files.
Finished keyword: TW87

Processing keyword: TW104
Applying normalization to 108 files.
Finished keyword: TW104

Processing keyword: TW114
Applying normalization to 134 files.
Finished keyword: TW114

Processing keyword: TW04
Applying normalization to 92 files.
Finished keyword: TW04

Processing keyword: TW30
Applying normalization to 164 files.
Finished keyword: TW30

Processing keyword: KU02
Applying normalization to 2206 files.
Finished keyword: KU02

Processing keyword: CROWD04
No train files found for keyword: CROWD04
Processing keyword: CROWD1067644
No train files found for keyword: CROWD1067644
Processing keyword: CROWD1075
Applying normalization to 817 files.
Finished keyword: CROWD1075

Processing keyword: CROWD128
No train files found for keyword: CROWD128
Processing keyword: CROWD87
No train files found for keyword: CROWD87
Proces