# 환경설치

In [1]:
# 필요한 라이브러리 설치 (코랩에서 실행)
!pip install torch torchvision matplotlib seaborn pandas numpy Pillow scikit-learn

# 기본 라이브러리 import
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.datasets as datasets

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from PIL import Image
import io
import requests
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import time
import os

# 시드 설정 (재현 가능한 결과를 위해)
torch.manual_seed(42)
np.random.seed(42)

print(" 모든 라이브러리가 성공적으로 로드되었습니다!")
print(f"PyTorch 버전: {torch.__version__}")
print(f"CUDA 사용 가능: {torch.cuda.is_available()}")

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [3]:
import torch
from torch.utils.data import Dataset
import random

class FakeImageDataset(Dataset):
    """
    Creates a fake dataset of images and labels for demonstration purposes.
    """
    def __init__(self, num_samples=1000, image_size=(3, 224, 224), num_classes=10, transform=None):
        """
        Args:
            num_samples (int): Number of samples in the dataset.
            image_size (tuple): Size of the images (C, H, W).
            num_classes (int): Number of classes.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        self.num_samples = num_samples
        self.image_size = image_size
        self.num_classes = num_classes
        self.transform = transform
        self.data = torch.randn(num_samples, *image_size) # Generate random image data
        self.labels = torch.randint(0, num_classes, (num_samples,)) # Generate random labels

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = self.data[idx]
        label = self.labels[idx]

        if self.transform:
            # Convert tensor to PIL Image to apply torchvision transforms
            # Note: This is a simplified approach. For real image data,
            # you would load images from files.
            image_np = image.mul(255).clamp(0, 255).byte().permute(1, 2, 0).numpy()
            image_pil = Image.fromarray(image_np, 'RGB')
            image = self.transform(image_pil)

        return image, label

print(" FakeImageDataset 클래스가 정의되었습니다.")

 FakeImageDataset 클래스가 정의되었습니다.


In [4]:
class CompletePipeline:
    """데이터 파이프라인 예제"""

    def __init__(self):
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        print(f"  파이프라인 초기화 (Device: {self.device})")

    def create_transforms(self):
        """Transform 생성"""
        train_transform = transforms.Compose([
            transforms.RandomResizedCrop(224, scale=(0.8, 1.0)),
            transforms.RandomHorizontalFlip(p=0.5),
            transforms.RandomRotation(degrees=10),
            transforms.ColorJitter(brightness=0.2, contrast=0.2),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
        ])

        val_transform = transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
        ])

        return train_transform, val_transform

    def create_datasets(self, num_train=1000, num_val=200):
        """데이터셋 생성"""
        train_transform, val_transform = self.create_transforms()

        # 훈련 및 검증 데이터셋 생성
        train_dataset = FakeImageDataset(
            num_samples=num_train,
            image_size=(3, 224, 224),
            num_classes=10,
            transform=train_transform
        )

        val_dataset = FakeImageDataset(
            num_samples=num_val,
            image_size=(3, 224, 224),
            num_classes=10,
            transform=val_transform
        )

        return train_dataset, val_dataset

    def create_dataloaders(self, train_dataset, val_dataset, batch_size=32):
        """DataLoader 생성"""
        train_loader = DataLoader(
            train_dataset,
            batch_size=batch_size,
            shuffle=True,
            num_workers=2,
            pin_memory=torch.cuda.is_available()
        )

        val_loader = DataLoader(
            val_dataset,
            batch_size=batch_size,
            shuffle=False,
            num_workers=2,
            pin_memory=torch.cuda.is_available()
        )

        return train_loader, val_loader

    def analyze_data(self, train_loader, val_loader):
        """데이터 분석"""
        print(f"\n 데이터 분석:")

        # 훈련 데이터 분석
        train_batch = next(iter(train_loader))
        train_images, train_labels = train_batch

        print(f"   훈련 데이터:")
        print(f"     - 배치 크기: {train_images.size(0)}")
        print(f"     - 이미지 shape: {train_images.shape}")
        print(f"     - 값 범위: [{train_images.min():.3f}, {train_images.max():.3f}]")
        print(f"     - 라벨 분포: {torch.bincount(train_labels, minlength=10)}")

        # 검증 데이터 분석
        val_batch = next(iter(val_loader))
        val_images, val_labels = val_batch

        print(f"   검증 데이터:")
        print(f"     - 배치 크기: {val_images.size(0)}")
        print(f"     - 이미지 shape: {val_images.shape}")
        print(f"     - 값 범위: [{val_images.min():.3f}, {val_images.max():.3f}]")
        print(f"     - 라벨 분포: {torch.bincount(val_labels, minlength=10)}")

        return train_batch, val_batch

    def run_complete_pipeline(self):
        """전체 파이프라인 실행"""
        print("  데이터 파이프라인 실행")
        print("=" * 60)

        # 1. 데이터셋 생성
        print("\n1 데이터셋 생성...")
        train_dataset, val_dataset = self.create_datasets()

        # 2. DataLoader 생성
        print("\n2 DataLoader 생성...")
        train_loader, val_loader = self.create_dataloaders(train_dataset, val_dataset)

        # 3. 데이터 분석
        print("\n3 데이터 분석...")
        train_batch, val_batch = self.analyze_data(train_loader, val_loader)

        # 4. 성능 측정
        print("\n4 성능 측정...")
        start_time = time.time()

        batch_count = 0
        for batch_images, batch_labels in train_loader:
            # GPU로 이동 (사용 가능한 경우)
            batch_images = batch_images.to(self.device)
            batch_labels = batch_labels.to(self.device)

            # 간단한 연산 (실제로는 모델 훈련)
            _ = batch_images.mean()

            batch_count += 1
            if batch_count >= 10:
                break

        elapsed_time = time.time() - start_time
        throughput = (batch_count * train_loader.batch_size) / elapsed_time

        print(f"     - 처리 시간: {elapsed_time:.3f}초")
        print(f"     - 처리량: {throughput:.1f} 샘플/초")

        print("\n 파이프라인 실행 완료!")

        return {
            'train_loader': train_loader,
            'val_loader': val_loader,
            'performance': {
                'time': elapsed_time,
                'throughput': throughput
            }
        }

# 완전한 파이프라인 실행
pipeline = CompletePipeline()
results = pipeline.run_complete_pipeline()

# 결과 요약
print(f"\n 최종 결과 요약:")
print(f"   - 훈련 배치 수: {len(results['train_loader'])}")
print(f"   - 검증 배치 수: {len(results['val_loader'])}")
print(f"   - 처리 성능: {results['performance']['throughput']:.1f} 샘플/초")
print(f"   - Device: {pipeline.device}")

  파이프라인 초기화 (Device: cuda)
  데이터 파이프라인 실행

1 데이터셋 생성...

2 DataLoader 생성...

3 데이터 분석...

 데이터 분석:


  image_pil = Image.fromarray(image_np, 'RGB')
  image_pil = Image.fromarray(image_np, 'RGB')


   훈련 데이터:
     - 배치 크기: 32
     - 이미지 shape: torch.Size([32, 3, 224, 224])
     - 값 범위: [-2.118, 2.640]
     - 라벨 분포: tensor([5, 4, 4, 1, 1, 2, 2, 5, 4, 4])


  image_pil = Image.fromarray(image_np, 'RGB')
  image_pil = Image.fromarray(image_np, 'RGB')


   검증 데이터:
     - 배치 크기: 32
     - 이미지 shape: torch.Size([32, 3, 224, 224])
     - 값 범위: [-2.118, 2.640]
     - 라벨 분포: tensor([2, 1, 5, 6, 2, 2, 2, 2, 5, 5])

4 성능 측정...


  image_pil = Image.fromarray(image_np, 'RGB')
  image_pil = Image.fromarray(image_np, 'RGB')


     - 처리 시간: 1.183초
     - 처리량: 270.5 샘플/초

 파이프라인 실행 완료!

 최종 결과 요약:
   - 훈련 배치 수: 32
   - 검증 배치 수: 7
   - 처리 성능: 270.5 샘플/초
   - Device: cuda
