#### [이미지 연령별 분류 코드] <hr>

In [3]:
# import shutil
# from pathlib import Path

# # =========================
# # 설정
# # =========================
# dir_path = r"C:\Users\kdt008\Downloads\archive\utkcropped"
# out_root = r"C:\Users\kdt008\face_imgs"

# MOVE_FILES = False   # True: 이동, False: 복사

# # =========================
# # 연령대 분류 규칙
# # =========================
# # (start, end, folder_name)
# AGE_BUCKETS = [
#     (0, 5,   "infant"),       # 영유아
#     (8, 13,  "child"),       # 아동
#     (15, 19, "teen"),       # 청소년
#     (20, 35, "young_adult"),      # 청년 (20~35세)
#     (45, 60, "middle_aged"),     # 중년
#     (70, 80,"senior"),    # 노년
#     (81, 120,"elderly"),    # 고령
    
# ]

# def get_bucket(age: int) -> str | None:
#     for s, e, name in AGE_BUCKETS:
#         if s <= age <= e:
#             return name
#     return None

# def is_image_file(p: Path) -> bool:
#     return p.suffix.lower() in [".jpg", ".jpeg", ".png", ".bmp", ".webp"]

# # =========================
# # 메인
# # =========================
# src = Path(dir_path)
# dst_root = Path(out_root)
# dst_root.mkdir(parents=True, exist_ok=True)

# total = 0
# ok = 0
# skipped = 0
# bad_name = 0

# for p in src.iterdir():
#     if not p.is_file() or not is_image_file(p):
#         continue

#     total += 1

#     # 파일명 형식: age_...jpg
#     parts = p.stem.split("_")
#     if len(parts) < 2:
#         skipped += 1
#         bad_name += 1
#         continue

#     age_str = parts[0]
#     if not age_str.isdigit():
#         skipped += 1
#         bad_name += 1
#         continue

#     age = int(age_str)
#     if not (0 <= age <= 120):
#         skipped += 1
#         bad_name += 1
#         continue

#     bucket = get_bucket(age)
#     if bucket is None:
#         skipped += 1
#         continue

#     target_dir = dst_root / bucket
#     target_dir.mkdir(parents=True, exist_ok=True)

#     target_path = target_dir / p.name

#     # 같은 파일명 충돌 시 뒤에 __dupN 붙이기
#     if target_path.exists():
#         base = target_path.stem
#         ext = target_path.suffix
#         i = 1
#         while True:
#             candidate = target_dir / f"{base}__dup{i}{ext}"
#             if not candidate.exists():
#                 target_path = candidate
#                 break
#             i += 1

#     try:
#         if MOVE_FILES:
#             shutil.move(str(p), str(target_path))
#         else:
#             shutil.copy2(str(p), str(target_path))
#         ok += 1
#     except Exception as e:
#         skipped += 1
#         print(f"[ERROR] {p.name} -> {e}")

# print("====== DONE ======")
# print(f"총 이미지: {total}")
# print(f"{'이동' if MOVE_FILES else '복사'} 성공: {ok}")
# print(f"스킵: {skipped} (파일명 문제: {bad_name})")
# print(f"결과 폴더: {dst_root}")


총 이미지: 23709
복사 성공: 19708
스킵: 4001 (파일명 문제: 0)
결과 폴더: C:\Users\kdt008\face_imgs


In [5]:
import random
import shutil
from pathlib import Path

# =========================
# Settings
# =========================
dir_path = r"C:\Users\kdt008\Downloads\archive\utkcropped"
out_root = r"C:\Users\kdt008\face_imgs_gender_balanced"

# True: move the source files, False: copy them.
MOVE_FILES = False
# Number of images to keep per age bucket.
TARGET_PER_GROUP = 366
# Seed for the module-level random generator used by the shuffles below.
SEED = 42
random.seed(SEED)

# =========================
# Age-bucket rules
# =========================
# Each entry is (first_age, last_age, folder_name); get_bucket() treats both
# bounds as inclusive. Ages not covered by any range (e.g. 6-7 or 36-44)
# match no bucket.
AGE_BUCKETS: list[tuple[int, int, str]] = [
    (0, 5, "infant"),
    (8, 13, "child"),
    (15, 19, "teen"),
    (20, 35, "young_adult"),
    (45, 60, "middle_aged"),
    (70, 80, "senior"),
    (81, 120, "elderly"),
]

# Lower-case file suffixes accepted as images.
IMG_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".webp",
}

def get_bucket(age: int) -> str | None:
    """Return the folder name of the first age bucket that contains ``age``.

    ``AGE_BUCKETS`` is scanned in order and both bounds of each range are
    inclusive. ``None`` is returned when no range covers the given age.
    """
    matching = (label for low, high, label in AGE_BUCKETS if low <= age <= high)
    return next(matching, None)

def is_image_file(p: Path) -> bool:
    """Tell whether the suffix of ``p``, lower-cased, is listed in ``IMG_EXTS``."""
    extension = p.suffix.lower()
    return extension in IMG_EXTS

def safe_copy_or_move(src: Path, dst: Path, move: bool):
    """Copy or move ``src`` to ``dst`` without overwriting an existing file.

    The destination directory is created when missing. If ``dst`` already
    exists, ``__dup1``, ``__dup2``, ... is appended to the file stem until an
    unused name is found in the same directory.

    Args:
        src: File to transfer.
        dst: Desired destination path.
        move: ``True`` to move the file (``shutil.move``), ``False`` to copy
            it (``shutil.copy2``).
    """
    folder = dst.parent
    folder.mkdir(parents=True, exist_ok=True)

    # Keep the originally requested stem/suffix so every retry is numbered
    # from the same base name.
    stem, suffix = dst.stem, dst.suffix
    attempt = 0
    while dst.exists():
        attempt += 1
        dst = folder / f"{stem}__dup{attempt}{suffix}"

    transfer = shutil.move if move else shutil.copy2
    transfer(str(src), str(dst))

# =========================
# 1) Group every usable image by (bucket, gender)
# =========================
src = Path(dir_path)
dst_root = Path(out_root)
dst_root.mkdir(parents=True, exist_ok=True)

# bucket name -> {gender code -> list of source paths}
bucket_gender_map = {name: {0: [], 1: []} for _, _, name in AGE_BUCKETS}
skipped = 0   # every image file that was not placed in a bucket
bad_name = 0  # subset of `skipped`: file name could not be parsed/validated
total = 0     # image files seen

# Path.iterdir() yields entries in arbitrary order. Sort them so the seeded
# shuffles that consume these lists start from the same order on every run.
for p in sorted(src.iterdir()):
    if not p.is_file() or not is_image_file(p):
        continue
    total += 1

    parts = p.stem.split("_")
    # UTKFace: age_gender_race_...
    if len(parts) < 3:
        skipped += 1
        bad_name += 1
        continue

    age_str, gender_str = parts[0], parts[1]

    if not age_str.isdigit() or not gender_str.isdigit():
        skipped += 1
        bad_name += 1
        continue

    age = int(age_str)
    # Usually 0=male, 1=female (checking the dataset description is recommended).
    gender = int(gender_str)
    if gender not in (0, 1) or not (0 <= age <= 120):
        skipped += 1
        bad_name += 1
        continue

    bucket = get_bucket(age)
    if bucket is None:
        # Valid name, but the age falls outside every configured range.
        skipped += 1
        continue

    bucket_gender_map[bucket][gender].append(p)

print(f"총 이미지 파일: {total}, 스킵: {skipped} (파일명 문제: {bad_name})")

# =========================
# 2) Per bucket, pick up to TARGET_PER_GROUP images, balanced by gender
# =========================
half = TARGET_PER_GROUP // 2  # per-gender quota inside each bucket

for bucket, gmap in bucket_gender_map.items():
    males = gmap[0]
    females = gmap[1]

    random.shuffle(males)
    random.shuffle(females)

    pick_m = min(len(males), half)
    pick_f = min(len(females), half)

    selected = males[:pick_m] + females[:pick_f]
    # Track what is really saved per gender so the summary stays correct
    # even when the top-up below adds images from one side.
    saved_m, saved_f = pick_m, pick_f

    # If the quotas did not reach TARGET_PER_GROUP, top up from whatever is
    # left over, regardless of gender.
    need = TARGET_PER_GROUP - len(selected)
    if need > 0:
        # Tag leftovers with their gender code; the shuffle order depends only
        # on the list length, so tagging does not change which files are drawn.
        rest = [(0, path) for path in males[pick_m:]]
        rest += [(1, path) for path in females[pick_f:]]
        random.shuffle(rest)
        for extra_gender, extra_path in rest[:need]:
            selected.append(extra_path)
            if extra_gender == 0:
                saved_m += 1
            else:
                saved_f += 1

    # If the bucket holds fewer than TARGET_PER_GROUP images in total, save
    # only what is available.
    out_dir = dst_root / bucket
    out_dir.mkdir(parents=True, exist_ok=True)

    for p in selected:
        target_path = out_dir / p.name
        safe_copy_or_move(p, target_path, MOVE_FILES)

    print(
        f"[{bucket}] male={len(males)}, female={len(females)} "
        f"-> saved={len(selected)} (m:{saved_m}, f:{saved_f})"
    )

print("\nDONE ->", dst_root)


총 이미지 파일: 23709, 스킵: 4001 (파일명 문제: 1)
[infant] male=1156, female=1207 -> saved=366 (m:183, f:183)
[child] male=401, female=460 -> saved=366 (m:183, f:183)
[teen] male=408, female=534 -> saved=366 (m:183, f:183)
[young_adult] male=4761, female=5753 -> saved=366 (m:183, f:183)
[middle_aged] male=2566, female=1090 -> saved=366 (m:183, f:183)
[senior] male=519, female=313 -> saved=366 (m:183, f:183)
[elderly] male=183, female=357 -> saved=366 (m:183, f:183)

DONE -> C:\Users\kdt008\face_imgs_gender_balanced
