In [1]:
# 환경설정
import sys
sys.path.append('../src')

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import yaml
from PIL import Image
import os

In [None]:
# Load configuration
# Read the YAML file as UTF-8 text in one go, then parse it with the safe loader.
config = yaml.safe_load(
    Path('../configs/config.yaml').read_text(encoding='utf-8')
)

print("✅ 환경 설정 완료!")

UnicodeDecodeError: 'cp949' codec can't decode byte 0xed in position 2: illegal multibyte sequence

In [None]:
# Data exploration
# Root of the raw dataset, relative to the notebook's working directory.
data_path = Path('..', 'data', 'raw')

In [None]:
# Inspect the dataset layout: for each split that exists on disk, list every
# class directory together with the number of PNG files it contains.
print("📁 데이터셋 구조:")
for split in ('train', 'validation', 'test'):
    split_path = data_path.joinpath(split)
    if not split_path.exists():
        # Splits that are absent on disk are skipped silently.
        continue
    print(f"\n{split}:")
    for class_dir in split_path.iterdir():
        if not class_dir.is_dir():
            # Stray files next to the class folders are ignored.
            continue
        count = sum(1 for _ in class_dir.glob('*.png'))
        print(f"  {class_dir.name}: {count} images")

In [None]:
# Sample image visualization
def plot_sample_images(data_path, split='train', num_samples=3):
    """Show up to ``num_samples`` PNG images for every class of one split.

    One subplot row is drawn per class directory found under
    ``data_path / split`` and one column per sample image.

    Args:
        data_path: ``pathlib.Path`` of the dataset root.
        split: Name of the split sub-directory to visualise.
        num_samples: Maximum number of images drawn per class.

    Raises:
        ValueError: If ``data_path / split`` contains no class directories.
    """
    split_path = data_path / split
    # iterdir() yields entries in arbitrary order; sort so the rows of the
    # figure are the same on every run.
    classes = sorted(d.name for d in split_path.iterdir() if d.is_dir())
    if not classes:
        # Fail early with a readable message instead of letting
        # plt.subplots(0, ...) raise an opaque ValueError.
        raise ValueError(f"No class directories found in {split_path}")

    # squeeze=False keeps `axes` two-dimensional even when there is a single
    # class or num_samples == 1, so axes[i, j] indexing always works.
    fig, axes = plt.subplots(
        len(classes), num_samples,
        figsize=(15, 2 * len(classes)),
        squeeze=False,
    )

    # Hide every axis up front so slots for classes that have fewer than
    # num_samples images stay blank instead of showing empty ticks/frames.
    for ax in axes.flat:
        ax.axis('off')

    for i, class_name in enumerate(classes):
        class_path = split_path / class_name
        # Sorted for a reproducible choice of samples.
        images = sorted(class_path.glob('*.png'))[:num_samples]

        for j, img_path in enumerate(images):
            # imshow copies the pixel data during the call, so the file handle
            # can be released as soon as the image has been drawn.
            with Image.open(img_path) as img:
                axes[i, j].imshow(img, cmap='gray')
            axes[i, j].set_title(f'{class_name}')

    plt.tight_layout()
    plt.show()

In [None]:
# Display sample images from the training split (three per class).
plot_sample_images(data_path, split='train', num_samples=3)

In [None]:
# Class distribution check
def analyze_class_distribution(data_path):
    """Count PNG images per class and split, plot the counts and return them.

    Args:
        data_path: ``pathlib.Path`` of the dataset root. The sub-directories
            ``train``, ``validation`` and ``test`` are inspected when present.

    Returns:
        pandas.DataFrame indexed by class name (sorted) with one integer
        column per split that exists. When no class directory is found the
        frame is empty and nothing is plotted.
    """
    distribution = {}

    for split in ['train', 'validation', 'test']:
        split_path = data_path / split
        if split_path.exists():
            distribution[split] = {
                class_dir.name: len(list(class_dir.glob('*.png')))
                for class_dir in split_path.iterdir()
                if class_dir.is_dir()
            }

    # A class missing from a split shows up as NaN, which upcasts the column
    # to float; fill with 0 and cast back so counts stay integers.
    # sort_index() makes the row order independent of iterdir() ordering.
    df = pd.DataFrame(distribution).fillna(0).astype(int).sort_index()

    if df.empty:
        # Plotting an empty frame raises inside pandas; hand the empty result
        # back to the caller instead.
        return df

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(15, 6))
    # Distribution per class (summed over all splits)
    df.sum(axis=1).plot(kind='bar', ax=axes[0], title='Total Samples per Class')
    axes[0].tick_params(axis='x', rotation=45)
    # Distribution per split
    df.T.plot(kind='bar', ax=axes[1], title='Samples per Split')
    axes[1].tick_params(axis='x', rotation=0)
    # Anchor the legend outside the axes so it does not cover the bars.
    axes[1].legend(bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()

    return df

In [None]:
# Run the distribution analysis and print the resulting table under a header.
distribution_df = analyze_class_distribution(data_path)
print("\n📊 클래스별 데이터 분포:", distribution_df, sep="\n")