### 뭐부터 할까
- dateset Normalize 평균과 분산 확인하기 
    - base코드와 많이 다르면 새로 구한 평균과 분산 사용해서 Normalize


In [1]:
import torch
import random
import os 
import numpy as np

CFG = {
    'IMG_SIZE': 224,
    'BATCH_SIZE': 32,
    'EPOCHS': 20,
    'LEARNING_RATE': 1e-4,
    'SEED' : 42
}

def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

seed_everything(CFG['SEED']) # Seed 고정

In [2]:
from torch.utils.data import Dataset
from PIL import Image
import os

class CustomDataset(Dataset):
    def __init__(self, path, transform=None, is_test=False):
        self.path = path
        self.transform = transform
        self.is_test = is_test
        self.samples = []

        if is_test:
            # 테스트셋: 라벨 없이 이미지 경로만 저장
            for fname in sorted(os.listdir(path)):
                if fname.lower().endswith(('.jpg')):
                    img_path = os.path.join(path, fname)
                    self.samples.append((img_path,))
        else:
            # 학습셋: 클래스별 폴더 구조에서 라벨 추출
            self.classes = sorted(os.listdir(path))
            self.class_to_idx = {cls_name: i for i, cls_name in enumerate(self.classes)}

            for cls_name in self.classes:
                cls_folder = os.path.join(path, cls_name)
                for fname in os.listdir(cls_folder):
                    if fname.lower().endswith(('.jpg')):
                        img_path = os.path.join(cls_folder, fname)
                        label = self.class_to_idx[cls_name]
                        self.samples.append((img_path, label))

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        if self.is_test:
            img_path = self.samples[idx][0]
            image = Image.open(img_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image
        else:
            img_path, label = self.samples[idx]
            image = Image.open(img_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, label


In [16]:
from torchvision.transforms import transforms

train_root = '../data/train'
test_root = '../data/test'

train_transform = transforms.Compose([
    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

val_transform = transforms.Compose([
    transforms.Resize((CFG['IMG_SIZE'], CFG['IMG_SIZE'])),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

In [17]:
# companys = [
#     "HYUNDAI","KIA","GENESIS","BMW","Chevrolet"
#     "PEUGEOT","PORSCHE","BENZ","AUDI","KG"
#     "JAGUAR","NISSAN","LEXUS","etc","VOLVO"
#     "LINCOLN", "RENAULT", "TOYOTA", "RANGE ROVER"L
# ]

In [18]:
from torch.utils.data import DataLoader, Subset
from sklearn.model_selection import train_test_split

# 전체 데이터셋 로드
full_dataset = CustomDataset(train_root, transform=None)
print(f"총 이미지 수: {len(full_dataset)}")

targets = [label for _, label in full_dataset.samples]
class_names = full_dataset.classes

# Stratified Split
train_idx, val_idx = train_test_split(
    range(len(targets)), test_size=0.2, stratify=targets, random_state=42
)

# Subset + transform 각각 적용
train_dataset = Subset(CustomDataset(train_root, transform=train_transform), train_idx)
val_dataset = Subset(CustomDataset(train_root, transform=val_transform), val_idx)
print(f'train 이미지 수: {len(train_dataset)}, valid 이미지 수: {len(val_dataset)}')


# DataLoader 정의
train_loader = DataLoader(train_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

총 이미지 수: 33137
train 이미지 수: 26509, valid 이미지 수: 6628


In [19]:
import torchvision.models as models
import torch.nn.functional as F
from torch import nn

class BaseModel(nn.Module):
    def __init__(self, num_classes):
        super(BaseModel, self).__init__()
        self.backbone = models.resnet18(pretrained=True)  # ResNet18 모델 불러오기
        self.feature_dim = self.backbone.fc.in_features 
        self.backbone.fc = nn.Identity()  # feature extractor로만 사용
        self.head = nn.Linear(self.feature_dim, num_classes)  # 분류기

    def forward(self, x):
        x = self.backbone(x)       
        x = self.head(x) 
        return x

In [20]:
from torch import optim
from tqdm import tqdm
from sklearn.metrics import log_loss

device = "cuda" if torch.cuda.is_available() else "cpu"

model = BaseModel(num_classes=len(class_names)).to(device)
best_logloss = float('inf')

# 손실 함수
criterion = nn.CrossEntropyLoss()

# 옵티마이저
optimizer = optim.Adam(model.parameters(), lr=CFG['LEARNING_RATE'])

# 학습 및 검증 루프
for epoch in range(CFG['EPOCHS']):
    # Train
    model.train()
    train_loss = 0.0
    for images, labels in tqdm(train_loader, desc=f"[Epoch {epoch+1}/{CFG['EPOCHS']}] Training"):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)  # logits
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()

    avg_train_loss = train_loss / len(train_loader)

    # Validation
    model.eval()
    val_loss = 0.0
    correct = 0
    total = 0
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for images, labels in tqdm(val_loader, desc=f"[Epoch {epoch+1}/{CFG['EPOCHS']}] Validation"):
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            val_loss += loss.item()

            # Accuracy
            _, preds = torch.max(outputs, 1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

            # LogLoss
            probs = F.softmax(outputs, dim=1)
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    val_accuracy = 100 * correct / total
    val_logloss = log_loss(all_labels, all_probs, labels=list(range(len(class_names))))

    # 결과 출력
    print(f"Train Loss : {avg_train_loss:.4f} || Valid Loss : {avg_val_loss:.4f} | Valid Accuracy : {val_accuracy:.4f}%")

    # Best model 저장
    if val_logloss < best_logloss:
        best_logloss = val_logloss
        torch.save(model.state_dict(), f'best_model.pth')
        print(f"📦 Best model saved at epoch {epoch+1} (logloss: {val_logloss:.4f})")

[Epoch 1/20] Training: 100%|██████████| 829/829 [12:20<00:00,  1.12it/s]
[Epoch 1/20] Validation: 100%|██████████| 208/208 [01:23<00:00,  2.48it/s]


Train Loss : 3.6297 || Valid Loss : 1.4838 | Valid Accuracy : 77.7158%
📦 Best model saved at epoch 1 (logloss: 1.4856)


[Epoch 2/20] Training: 100%|██████████| 829/829 [12:32<00:00,  1.10it/s]
[Epoch 2/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.51it/s]


Train Loss : 0.8292 || Valid Loss : 0.5101 | Valid Accuracy : 88.8805%
📦 Best model saved at epoch 2 (logloss: 0.5097)


[Epoch 3/20] Training: 100%|██████████| 829/829 [12:37<00:00,  1.09it/s]
[Epoch 3/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.52it/s]


Train Loss : 0.2534 || Valid Loss : 0.3293 | Valid Accuracy : 92.0489%
📦 Best model saved at epoch 3 (logloss: 0.3296)


[Epoch 4/20] Training: 100%|██████████| 829/829 [12:38<00:00,  1.09it/s]
[Epoch 4/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.51it/s]


Train Loss : 0.1093 || Valid Loss : 0.2811 | Valid Accuracy : 93.0748%
📦 Best model saved at epoch 4 (logloss: 0.2817)


[Epoch 5/20] Training: 100%|██████████| 829/829 [12:41<00:00,  1.09it/s]
[Epoch 5/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.45it/s]


Train Loss : 0.0614 || Valid Loss : 0.2661 | Valid Accuracy : 93.1804%
📦 Best model saved at epoch 5 (logloss: 0.2669)


[Epoch 6/20] Training: 100%|██████████| 829/829 [12:41<00:00,  1.09it/s]
[Epoch 6/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.47it/s]


Train Loss : 0.0402 || Valid Loss : 0.2833 | Valid Accuracy : 93.1654%


[Epoch 7/20] Training: 100%|██████████| 829/829 [12:43<00:00,  1.09it/s]
[Epoch 7/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.46it/s]


Train Loss : 0.0389 || Valid Loss : 0.3528 | Valid Accuracy : 91.1889%


[Epoch 8/20] Training: 100%|██████████| 829/829 [12:40<00:00,  1.09it/s]
[Epoch 8/20] Validation: 100%|██████████| 208/208 [01:23<00:00,  2.48it/s]


Train Loss : 0.0513 || Valid Loss : 0.2806 | Valid Accuracy : 92.3808%


[Epoch 9/20] Training: 100%|██████████| 829/829 [12:41<00:00,  1.09it/s]
[Epoch 9/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.47it/s]


Train Loss : 0.0295 || Valid Loss : 0.3194 | Valid Accuracy : 91.7019%


[Epoch 10/20] Training: 100%|██████████| 829/829 [12:40<00:00,  1.09it/s]
[Epoch 10/20] Validation: 100%|██████████| 208/208 [01:25<00:00,  2.43it/s]


Train Loss : 0.0285 || Valid Loss : 0.2775 | Valid Accuracy : 92.7580%


[Epoch 11/20] Training: 100%|██████████| 829/829 [12:38<00:00,  1.09it/s]
[Epoch 11/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.51it/s]


Train Loss : 0.0292 || Valid Loss : 0.3021 | Valid Accuracy : 92.3808%


[Epoch 12/20] Training: 100%|██████████| 829/829 [12:38<00:00,  1.09it/s]
[Epoch 12/20] Validation: 100%|██████████| 208/208 [01:23<00:00,  2.48it/s]


Train Loss : 0.0323 || Valid Loss : 0.2800 | Valid Accuracy : 93.1352%


[Epoch 13/20] Training: 100%|██████████| 829/829 [12:42<00:00,  1.09it/s]
[Epoch 13/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.48it/s]


Train Loss : 0.0173 || Valid Loss : 0.3091 | Valid Accuracy : 91.9282%


[Epoch 14/20] Training: 100%|██████████| 829/829 [12:34<00:00,  1.10it/s]
[Epoch 14/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.47it/s]


Train Loss : 0.0310 || Valid Loss : 0.2657 | Valid Accuracy : 93.3011%
📦 Best model saved at epoch 14 (logloss: 0.2668)


[Epoch 15/20] Training: 100%|██████████| 829/829 [12:39<00:00,  1.09it/s]
[Epoch 15/20] Validation: 100%|██████████| 208/208 [01:24<00:00,  2.47it/s]


Train Loss : 0.0172 || Valid Loss : 0.2443 | Valid Accuracy : 93.4671%
📦 Best model saved at epoch 15 (logloss: 0.2453)


[Epoch 16/20] Training: 100%|██████████| 829/829 [12:34<00:00,  1.10it/s]
[Epoch 16/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.52it/s]


Train Loss : 0.0204 || Valid Loss : 0.2841 | Valid Accuracy : 92.8787%


[Epoch 17/20] Training: 100%|██████████| 829/829 [12:29<00:00,  1.11it/s]
[Epoch 17/20] Validation: 100%|██████████| 208/208 [01:21<00:00,  2.54it/s]


Train Loss : 0.0300 || Valid Loss : 0.2519 | Valid Accuracy : 93.4520%


[Epoch 18/20] Training: 100%|██████████| 829/829 [12:30<00:00,  1.10it/s]
[Epoch 18/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.52it/s]


Train Loss : 0.0092 || Valid Loss : 0.2847 | Valid Accuracy : 92.7580%


[Epoch 19/20] Training: 100%|██████████| 829/829 [12:35<00:00,  1.10it/s]
[Epoch 19/20] Validation: 100%|██████████| 208/208 [01:23<00:00,  2.48it/s]


Train Loss : 0.0309 || Valid Loss : 0.2954 | Valid Accuracy : 92.4261%


[Epoch 20/20] Training: 100%|██████████| 829/829 [12:38<00:00,  1.09it/s]
[Epoch 20/20] Validation: 100%|██████████| 208/208 [01:22<00:00,  2.53it/s]

Train Loss : 0.0130 || Valid Loss : 0.2965 | Valid Accuracy : 92.6222%





In [21]:
test_dataset = CustomDataset(test_root, transform=val_transform, is_test=True)
test_loader = DataLoader(test_dataset, batch_size=CFG['BATCH_SIZE'], shuffle=False)

In [22]:
import pandas as pd

# 저장된 모델 로드
model = BaseModel(num_classes=len(class_names))
model.load_state_dict(torch.load('best_model.pth', map_location=device))
model.to(device)

# 추론
model.eval()
results = []

with torch.no_grad():
    for images in test_loader:
        images = images.to(device)
        outputs = model(images)
        probs = F.softmax(outputs, dim=1)

        # 각 배치의 확률을 리스트로 변환
        for prob in probs.cpu():  # prob: (num_classes,)
            result = {
                class_names[i]: prob[i].item()
                for i in range(len(class_names))
            }
            results.append(result)
            
pred = pd.DataFrame(results)



In [23]:
submission = pd.read_csv('../data/sample_submission.csv', encoding='utf-8-sig')

# 'ID' 컬럼을 제외한 클래스 컬럼 정렬
class_columns = submission.columns[1:]
pred = pred[class_columns]

submission[class_columns] = pred.values
submission.to_csv('0525_submission.csv', index=False, encoding='utf-8-sig')