In [2]:
# Data Manipulation and Numerical Libraries
import numpy as np
import pandas as pd

# Deep Learning Libraries (PyTorch)
import torch
import torch.nn as nn
import torchvision
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision import models

# Data Preprocessing and Utilities
from sklearn.model_selection import train_test_split
from ultralytics import YOLO

# Image Processing and Visualization
import cv2
import matplotlib.pyplot as plt
from PIL import Image

# sys
import json
import os
import random
from glob import glob
from pathlib import Path
from shutil import copy2

import warnings
warnings.filterwarnings('ignore', category=FutureWarning)

# 데이터 전처리

In [4]:
# Path configuration. All locations are relative to the working directory
# the notebook is run from (note the leading "./").

# Inputs: COCO-style JSON annotations and the images they refer to.
ANNOTATIONS_DIR = Path("./data/ai02-level1-project/train_annotations")
IMAGES_DIR = Path("./data/ai02-level1-project/train_images")

# Output root of the converted YOLO dataset and its dataset description file.
OUTPUT_DIR = Path("./data/project_yolo")
DATA_YAML_PATH = OUTPUT_DIR.joinpath("data.yaml")

# Label directories (one per split) under <OUTPUT_DIR>/labels.
OUTPUT_LABEL = OUTPUT_DIR.joinpath("labels")
OUTPUT_LABEL_TRAIN = OUTPUT_LABEL.joinpath("train")
OUTPUT_LABEL_VAL = OUTPUT_LABEL.joinpath("val")

# Image directories (one per split) under <OUTPUT_DIR>/images.
OUTPUT_IMAGE_TRAIN = OUTPUT_DIR.joinpath("images", "train")
OUTPUT_IMAGE_VAL = OUTPUT_DIR.joinpath("images", "val")

In [6]:
# Fraction of the collected items that is assigned to the validation split
VAL_RATIO = 0.2 

In [8]:
# Bounding-box conversion
def convert_bbox_coco_to_yolo(bbox, image_width, image_height):
    """Turn an (x, y, w, h) box into a normalized, center-based box.

    Args:
        bbox: Four numbers ``(x, y, w, h)``; the center is computed as
            ``x + w / 2`` and ``y + h / 2``.
        image_width: Divisor used for the horizontal values.
        image_height: Divisor used for the vertical values.

    Returns:
        ``[x_center, y_center, width, height]``, each divided by the matching
        image dimension and rounded to 6 decimal places.
    """
    left, top, box_w, box_h = bbox
    center_x = left + box_w / 2
    center_y = top + box_h / 2

    # Pair every value with the image dimension it is normalized by.
    scaled = (
        (center_x, image_width),
        (center_y, image_height),
        (box_w, image_width),
        (box_h, image_height),
    )
    return [round(value / extent, 6) for value, extent in scaled]

In [10]:
def process_annotation(json_path):
    """Read one COCO-style annotation file and build its YOLO label lines.

    Only the first entry of the file's "images" list is used. Annotations
    without a non-empty "bbox" are ignored.

    Args:
        json_path: Path of the JSON annotation file (read as UTF-8).

    Returns:
        A tuple ``(image_name, yolo_lines)``. ``image_name`` is the
        "file_name" of the first image entry (``None`` if absent, which is
        only possible when ``yolo_lines`` is empty). Each line has the form
        ``"<class_id> <x_center> <y_center> <width> <height>"`` where
        ``class_id`` is the annotation's "category_id" as stored in the file
        (0 when the key is missing).

    Raises:
        ValueError: If the file contains boxes but the image entry has no
            "file_name" or no positive numeric "width"/"height".
    """
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    # `or [{}]` also covers an explicitly empty "images" list, which would
    # otherwise fail with an IndexError on `[0]`.
    image_info = (data.get("images") or [{}])[0]
    image_name = image_info.get("file_name")
    image_width = image_info.get("width")
    image_height = image_info.get("height")

    boxed_annotations = [ann for ann in data.get("annotations", []) if ann.get("bbox")]

    # Validate the image metadata only when there is something to convert, so
    # files without boxes keep returning an empty label list.
    if boxed_annotations:
        if not image_name:
            raise ValueError("image entry has no file_name")
        for key, value in (("width", image_width), ("height", image_height)):
            if not isinstance(value, (int, float)) or value <= 0:
                raise ValueError(f"image {key} must be a positive number, got {value!r}")

    yolo_lines = []
    for ann in boxed_annotations:
        bbox = convert_bbox_coco_to_yolo(ann["bbox"], image_width, image_height)
        class_id = ann.get("category_id", 0)
        yolo_lines.append(f"{class_id} {' '.join(map(str, bbox))}")

    return image_name, yolo_lines

In [13]:
def prepare_output_dirs():
    """Create the train/val image and label output directories.

    Missing parent directories are created as well; directories that already
    exist are left untouched.
    """
    split_dirs = (
        OUTPUT_IMAGE_TRAIN,
        OUTPUT_IMAGE_VAL,
        OUTPUT_LABEL_TRAIN,
        OUTPUT_LABEL_VAL,
    )
    for directory in split_dirs:
        directory.mkdir(parents=True, exist_ok=True)

In [15]:
# Added
def extract_categories(annotations_dir):
    """Gather the category id -> name mapping from all JSON files in a tree.

    Every ``*.json`` file below ``annotations_dir`` is read and the entries of
    its "categories" list are merged into one mapping; when an id occurs more
    than once, the name read last wins. A file that cannot be read or parsed
    is reported with a printed message and otherwise skipped.

    Args:
        annotations_dir: Root directory that is walked recursively.

    Returns:
        A dict mapping category id to category name, ordered by ascending id.
    """
    id_to_name = {}
    for dirpath, _subdirs, filenames in os.walk(annotations_dir):
        for file in (name for name in filenames if name.endswith(".json")):
            json_path = Path(dirpath) / file
            try:
                with open(json_path, 'r', encoding='utf-8') as handle:
                    payload = json.load(handle)
                for category in payload.get("categories", []):
                    cat_id, cat_name = category["id"], category["name"]
                    id_to_name[cat_id] = cat_name
            except Exception as e:
                # Best effort: keep going with the remaining files.
                print(f"<카테고리 파싱 실패>: {file} → {e}")
    # Rebuild the mapping in ascending id order.
    return {cat_id: id_to_name[cat_id] for cat_id in sorted(id_to_name)}

In [None]:
# Changed
def generate_yaml(category_map, output_dir=None, yaml_path=None):
    """Write the dataset description YAML for the converted dataset.

    The file contains the keys ``path``, ``train``, ``val``, ``nc`` and
    ``names``. ``names`` lists the category names in ascending category-id
    order, so the position of a name in the list is its class index.

    Args:
        category_map: Mapping of category id to category name.
        output_dir: Dataset root written to ``path`` (resolved to an absolute
            path). Defaults to the module-level ``OUTPUT_DIR``.
        yaml_path: Destination file. Defaults to ``DATA_YAML_PATH`` when
            ``output_dir`` is not given, otherwise ``<output_dir>/data.yaml``.
    """
    if yaml_path is None:
        yaml_path = DATA_YAML_PATH if output_dir is None else Path(output_dir) / "data.yaml"
    output_dir = OUTPUT_DIR if output_dir is None else Path(output_dir)

    # Single-quoted YAML scalars escape an embedded quote by doubling it.
    names = [
        "'" + str(category_map[cid]).replace("'", "''") + "'"
        for cid in sorted(category_map)
    ]

    # Build the document line by line: every key must start at column 0.
    # Indented continuation lines (as produced by an indented triple-quoted
    # string) would nest the keys under `path:` and break the YAML.
    lines = [
        f"path: {output_dir.resolve()}",
        "train: images/train",
        "val: images/val",
        f"nc: {len(names)}",
        f"names: [{', '.join(names)}]",
    ]
    with open(yaml_path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")

In [None]:
# Revised to use extract_categories and generate_yaml.
def _remap_class_ids(yolo_labels, class_index):
    """Replace the leading class token of each label line via ``class_index``.

    Raises:
        ValueError: If a line's class token has no entry in ``class_index``.
    """
    remapped = []
    for line in yolo_labels:
        raw_id, _, coords = line.partition(" ")
        if raw_id not in class_index:
            raise ValueError(f"unknown category_id {raw_id}")
        remapped.append(f"{class_index[raw_id]} {coords}")
    return remapped


def run(seed=42):
    """Convert all annotations into a YOLO dataset with a train/val split.

    Steps:
      1. Create the output directories and collect the category map.
      2. Convert every JSON file below ``ANNOTATIONS_DIR``. Raw category ids
         are rewritten to contiguous indices (position in ascending id order),
         which is the same order ``generate_yaml`` uses for ``names``.
      3. Merge the labels of all files that reference the same image, so a
         later file does not overwrite the label file of an earlier one and
         an image cannot end up in both splits.
      4. Shuffle the images, send the first ``VAL_RATIO`` share to the
         validation split, copy images and write label files.
      5. Write ``data.yaml``.

    Files that fail to convert and images missing from ``IMAGES_DIR`` are
    reported with a printed message and skipped.

    Args:
        seed: Seed for the shuffle that decides the split. Pass ``None`` for a
            different split on every call.
    """
    prepare_output_dirs()

    category_map = extract_categories(ANNOTATIONS_DIR)
    # Label lines carry the class id as text, so key the lookup by str(id).
    class_index = {str(cat_id): idx for idx, cat_id in enumerate(sorted(category_map))}

    labels_by_image = {}
    missing_images = set()
    for root, _, files in os.walk(ANNOTATIONS_DIR):
        for file in files:
            if not file.endswith(".json"):
                continue
            json_path = Path(root) / file
            try:
                image_name, yolo_labels = process_annotation(json_path)
                if not yolo_labels:
                    continue
                if not image_name:
                    raise ValueError("annotation has no image file_name")
                yolo_labels = _remap_class_ids(yolo_labels, class_index)
            except Exception as e:
                print(f"<실패>: {json_path.name} → {e}")
                continue

            # Check existence before splitting so the split ratio and the
            # final count only cover images that are actually copied.
            if not (IMAGES_DIR / image_name).exists():
                if image_name not in missing_images:
                    missing_images.add(image_name)
                    print(f"<이미지 없음>: {image_name}")
                continue

            merged = labels_by_image.setdefault(image_name, [])
            for line in yolo_labels:
                if line not in merged:  # drop exact duplicates across files
                    merged.append(line)

    # Sort first: os.walk order depends on the file system, and a seeded
    # shuffle is only reproducible from a fixed starting order.
    all_items = sorted(labels_by_image.items())
    random.Random(seed).shuffle(all_items)
    val_count = int(len(all_items) * VAL_RATIO)

    for idx, (image_name, yolo_labels) in enumerate(all_items):
        if idx < val_count:
            image_dir, label_dir = OUTPUT_IMAGE_VAL, OUTPUT_LABEL_VAL
            other_image_dir, other_label_dir = OUTPUT_IMAGE_TRAIN, OUTPUT_LABEL_TRAIN
        else:
            image_dir, label_dir = OUTPUT_IMAGE_TRAIN, OUTPUT_LABEL_TRAIN
            other_image_dir, other_label_dir = OUTPUT_IMAGE_VAL, OUTPUT_LABEL_VAL

        # Swap whatever extension the image has for ".txt" (not only ".png").
        label_name = Path(image_name).with_suffix(".txt")

        # A previous run may have put this image into the other split; remove
        # that copy so the two splits never overlap.
        for stale in (other_image_dir / image_name, other_label_dir / label_name):
            if stale.exists():
                stale.unlink()

        copy2(IMAGES_DIR / image_name, image_dir / image_name)
        with open(label_dir / label_name, "w", encoding="utf-8") as f:
            f.write("\n".join(yolo_labels))

    generate_yaml(category_map)

    print(f"► 총 {len(all_items)}개 이미지 전처리 완료 (train/val split 포함)")
    print(f"\n► data.yaml 생성 완료 → {DATA_YAML_PATH}")

# Entry point: perform the full conversion when this code is executed directly
# rather than imported.
if __name__ == "__main__":
    run()