In [1]:
# ===== 셀 1: 환경 설정 및 Import =====
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import json
from tqdm import tqdm
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# 프로젝트 경로 추가
import sys
sys.path.append('.')

# 모델 import
from models.unified.unified_model import UnifiedModel
from models.heads.mask2former_damage_head import Mask2FormerLoss
from utils.dataset import UnifiedDamageDataset, create_dataloaders
from utils.evaluate import ModelEvaluator

# Report the runtime environment (library version and GPU) for the run log.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch Version: {torch.__version__}")
print(f"CUDA Available: {cuda_ready}")
if cuda_ready:
    # Only device index 0 is inspected.
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1e9:.2f} GB")

PyTorch Version: 2.5.1+cu121
CUDA Available: True
GPU: NVIDIA GeForce RTX 4090
GPU Memory: 25.76 GB


In [2]:
# ===== 셀 2: Mask2Former 설정 =====
class Config:
    """Static settings for the Mask2Former damage-head experiment.

    Every value is a class attribute evaluated once, when the class body runs.
    Running the class body also creates ``save_dir`` on disk and stamps
    ``experiment_name`` with the current local time.
    """

    # --- Data locations: each head reads from its own root ---
    blade_data_root = Path(r'C:\EngineBladeAI\EngineInspectionAI_MS_back_up\data\blade_data')  # for Head-A
    damage_data_root = Path(r'C:\EngineBladeAI\EngineInspectionAI_MS\data\multilabeled_data_augmented')  # for Head-B

    blade_checkpoint = 'best_unified_blade_model.pth'

    # --- Model type ---
    model_type = 'mask2former'

    # --- Basic model settings ---
    backbone_type = 'tiny'
    use_fpn = True
    num_blade_classes = 2
    num_damage_classes = 3

    # --- Mask2Former-specific loading settings ---
    batch_size = 2  # kept small to save memory
    accumulate_grad_batches = 2  # gradient accumulation
    num_workers = 0

    # --- Mask2Former head settings ---
    mask2former_config = dict(
        num_queries=100,  # start with few queries
        hidden_dim=256,
        num_heads=8,
        dec_layers=3,  # start with few decoder layers
        dropout=0.1,
    )

    # --- Optimisation settings ---
    epochs = 30
    learning_rate = 1e-5  # Mask2Former uses a small learning rate
    weight_decay = 0.05
    gradient_clip = 0.01  # tight gradient clipping

    # --- Mixed precision training ---
    use_amp = True

    # --- Training strategy ---
    freeze_blade_initially = True
    unfreeze_epoch = 15

    # --- Loss weights ---
    blade_loss_weight = 1.0
    aux_loss_weight = 0.4

    # --- Miscellaneous ---
    if torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'
    save_dir = Path('outputs_mask2former')
    save_dir.mkdir(exist_ok=True)

    experiment_name = "mask2former_{}".format(datetime.now().strftime('%Y%m%d_%H%M%S'))

# Instantiate the settings object and echo the key values for the run log.
config = Config()
effective_batch_size = config.batch_size * config.accumulate_grad_batches  # loader batch x accumulation factor
print(f"Experiment: {config.experiment_name}")
print(f"Model Type: {config.model_type}")
print(f"Batch Size: {config.batch_size} x {config.accumulate_grad_batches} = {effective_batch_size}")
print(f"Blade Data: {config.blade_data_root}")
print(f"Damage Data: {config.damage_data_root}")

Experiment: mask2former_20250911_234610
Model Type: mask2former
Batch Size: 2 x 2 = 4
Blade Data: C:\EngineBladeAI\EngineInspectionAI_MS_back_up\data\blade_data
Damage Data: C:\EngineBladeAI\EngineInspectionAI_MS\data\multilabeled_data_augmented


In [3]:
# ===== 셀 3: 데이터로더 생성 =====
print("데이터로더 생성 중...")

# Build the train/valid/test loaders from the two data roots.
# model_type is read from config so the loaders cannot drift from the
# setting declared in Config (it was previously a repeated literal).
train_loader, valid_loader, test_loader = create_dataloaders(
    blade_data_root=config.blade_data_root,
    damage_data_root=config.damage_data_root,
    batch_size=config.batch_size,
    num_workers=config.num_workers,
    model_type=config.model_type
)

print(f"✅ Train: {len(train_loader)} batches")
print(f"✅ Valid: {len(valid_loader)} batches")
print(f"✅ Test: {len(test_loader)} batches")

# Peek at one training batch to confirm its keys and shapes.
# next(..., None) tolerates an empty loader instead of leaving `batch` undefined.
batch = next(iter(train_loader), None)
if batch is not None:
    print("\n데이터 샘플:")
    for key, value in batch.items():
        if torch.is_tensor(value):
            print(f"  {key}: {value.shape}")
        elif isinstance(value, list):
            print(f"  {key}: {len(value)} items")

데이터로더 생성 중...
✅ Train: 2352 batches
✅ Valid: 453 batches
✅ Test: 460 batches

데이터 샘플:
  image: torch.Size([2, 3, 640, 640])
  blade_mask: torch.Size([2, 640, 640])
  multilabel: torch.Size([2, 3])
  instance_masks: 2 items
  instance_labels: 2 items


In [4]:
# ===== 셀 4: Mask2Former 모델 생성 =====
print("Mask2Former 모델 생성 중...")

# Resolve the blade checkpoint up front so that a missing file is reported.
# Previously a missing file silently became None while freeze_blade stayed True.
# The notice is printed rather than raised through `warnings`, because this
# notebook calls warnings.filterwarnings('ignore') in its first cell.
blade_checkpoint_path = Path(config.blade_checkpoint)
if blade_checkpoint_path.exists():
    blade_checkpoint = config.blade_checkpoint
else:
    blade_checkpoint = None
    print(f"⚠️ Blade checkpoint not found: {blade_checkpoint_path.resolve()}")
    if config.freeze_blade_initially:
        print("⚠️ freeze_blade_initially=True but no blade checkpoint is loaded; "
              "the blade head will be frozen without checkpoint weights")

model = UnifiedModel(
    backbone_type=config.backbone_type,
    num_blade_classes=config.num_blade_classes,
    num_damage_classes=config.num_damage_classes,
    pretrained_backbone=True,
    blade_checkpoint=blade_checkpoint,
    freeze_blade=config.freeze_blade_initially,
    use_fpn=config.use_fpn,
    damage_head_type='mask2former',
    damage_head_config=config.mask2former_config
)

model = model.to(config.device)

# Count parameters (all, and those with requires_grad=True)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"✅ Total parameters: {total_params/1e6:.2f}M")
print(f"✅ Trainable parameters: {trainable_params/1e6:.2f}M")
print(f"✅ Blade head frozen: {config.freeze_blade_initially}")
print("✅ Damage head type: Mask2Former")
print(f"  - Queries: {config.mask2former_config['num_queries']}")
print(f"  - Decoder layers: {config.mask2former_config['dec_layers']}")

Mask2Former 모델 생성 중...
✅ Total parameters: 50.31M
✅ Trainable parameters: 49.78M
✅ Blade head frozen: True
✅ Damage head type: Mask2Former
  - Queries: 100
  - Decoder layers: 3


In [None]:
# ===== 셀 5 완전 교체: SimpleLoss 사용 =====
class SimpleLoss(nn.Module):
    """Blade cross-entropy plus multilabel BCE, with no Hungarian matching.

    Args:
        config: object exposing ``blade_loss_weight``.
        multilabel_weight: factor applied to the multilabel term (default 2.0,
            the value that used to be hard-coded).
        multilabel_from_logits: ``True`` if ``outputs['multilabel']`` holds raw
            logits, ``False`` if it already holds probabilities, ``None``
            (default) to guess from the value range as before. The guess
            treats anything outside [0, 1] as logits, so logits that happen
            to lie inside [0, 1] are misread; pass an explicit flag when the
            model's output type is known.
    """

    def __init__(self, config, multilabel_weight=2.0, multilabel_from_logits=None):
        super().__init__()
        self.config = config
        self.multilabel_weight = multilabel_weight
        self.multilabel_from_logits = multilabel_from_logits
        self.blade_ce = nn.CrossEntropyLoss()
        self.ml_loss = nn.BCEWithLogitsLoss()

    def _multilabel_loss(self, preds, targets):
        """Return the BCE between multilabel predictions and targets in float32."""
        # BCE needs floating-point targets; integer/bool labels would raise.
        preds = preds.float()
        targets = targets.float()

        from_logits = self.multilabel_from_logits
        if from_logits is None:
            # Fallback heuristic: values outside [0, 1] cannot be probabilities.
            from_logits = bool(preds.max() > 1.0 or preds.min() < 0)

        if from_logits:
            return self.ml_loss(preds, targets)

        # F.binary_cross_entropy is rejected inside CUDA autocast regions, and
        # the training loop calls this criterion under autocast, so the
        # probability branch runs with autocast switched off.
        with torch.autocast(device_type=preds.device.type, enabled=False):
            return F.binary_cross_entropy(preds, targets)

    def forward(self, outputs, batch):
        """Compute the weighted total loss and the individual terms.

        Args:
            outputs: dict of model outputs; the keys ``'blade'`` and
                ``'multilabel'`` are used when present.
            batch: dict of targets; the keys ``'blade_mask'`` and
                ``'multilabel'`` are used when present.

        Returns:
            ``(total_loss, losses)``. ``losses`` holds ``'blade'`` and/or
            ``'ml'`` (unweighted) plus ``'total'``. If no term applies,
            ``total_loss`` is a zero scalar tensor that is not attached to any
            graph (previously the Python int ``0``).
        """
        losses = {}
        weighted_terms = []

        # Blade loss
        if 'blade' in outputs and 'blade_mask' in batch:
            blade_logits = outputs['blade']
            blade_target = batch['blade_mask']
            # A mask with one dim fewer than the logits holds class indices,
            # which CrossEntropyLoss requires as int64.
            if blade_target.dim() == blade_logits.dim() - 1:
                blade_target = blade_target.long()
            losses['blade'] = self.blade_ce(blade_logits, blade_target)
            weighted_terms.append(losses['blade'] * self.config.blade_loss_weight)

        # Multilabel loss
        if 'multilabel' in outputs and 'multilabel' in batch:
            losses['ml'] = self._multilabel_loss(outputs['multilabel'], batch['multilabel'])
            weighted_terms.append(losses['ml'] * self.multilabel_weight)

        if weighted_terms:
            total_loss = sum(weighted_terms[1:], weighted_terms[0])
        else:
            total_loss = torch.zeros(())

        losses['total'] = total_loss
        return total_loss, losses

# Instantiate the loss that is later passed to train_epoch / validate_epoch.
criterion = SimpleLoss(config)
print("✅ SimpleLoss ready - no Hungarian matching")

✅ Loss ready (BCEWithLogitsLoss)


In [6]:
# ===== 셀 6: Optimizer =====
# Two parameter groups while the blade head is frozen: backbone and damage head.
# The damage head is ONE group holding all of its parameters. Previously a
# separate group was created per tensor and only `param_groups[:2]` reached the
# optimizer, so just the backbone and the first damage-head tensor were updated.
param_groups = [
    # Backbone at a tenth of the base learning rate
    {'params': list(model.backbone.parameters()), 'lr': config.learning_rate * 0.1, 'name': 'backbone'},
    # Damage head at the base learning rate
    {'params': list(model.damage_head.parameters()), 'lr': config.learning_rate, 'name': 'damage_head'},
]

# The blade head joins from the start only when it is not frozen initially;
# otherwise the training loop adds it at config.unfreeze_epoch.
if not config.freeze_blade_initially:
    param_groups.append({'params': list(model.blade_head.parameters()), 'lr': config.learning_rate * 0.5, 'name': 'blade_head'})

# Every group is passed on (no slicing); weight_decay applies to all groups by default.
optimizer = torch.optim.AdamW(param_groups, weight_decay=config.weight_decay)
# The scheduler is stepped once per epoch in train_epoch, hence T_max=epochs.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=config.epochs, eta_min=1e-7)
scaler = GradScaler() if config.use_amp else None

print(f"✅ Optimizer ready with {len(optimizer.param_groups)} groups")

✅ Optimizer ready with 2 groups


In [7]:
# ===== 셀 7: Training Functions =====
def train_epoch(model, train_loader, criterion, optimizer, scheduler, scaler, config, epoch):
    """Run one training epoch with gradient accumulation and optional AMP.

    Args:
        model: module called as ``model(batch['image'])``.
        train_loader: iterable of dict batches; tensor values (and tensors
            inside list values) are moved to ``config.device`` in place.
        criterion: callable returning ``(loss, loss_dict)`` for
            ``(outputs, batch)``.
        optimizer: optimizer stepped every ``config.accumulate_grad_batches``
            batches.
        scheduler: LR scheduler, stepped once at the end of the epoch.
        scaler: ``GradScaler`` or ``None``; when given, the backward pass and
            the optimizer step go through it.
        config: provides ``device``, ``use_amp``, ``accumulate_grad_batches``,
            ``gradient_clip`` and ``epochs``.
        epoch: zero-based epoch index, used only for the progress-bar label.

    Returns:
        ``{'total': mean per-batch loss}``, or NaN when the loader yields no
        batches.
    """
    def _to_device(value):
        # Tensors and lists of tensors are moved; anything else passes through.
        if torch.is_tensor(value):
            return value.to(config.device)
        if isinstance(value, list):
            return [item.to(config.device) if torch.is_tensor(item) else item for item in value]
        return value

    def _optimizer_step():
        # Gradients are unscaled before clipping so the threshold applies to
        # the true gradient norm.
        if scaler is not None:
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
            scaler.step(optimizer)
            scaler.update()
        else:
            torch.nn.utils.clip_grad_norm_(model.parameters(), config.gradient_clip)
            optimizer.step()
        optimizer.zero_grad()

    model.train()
    # Start from clean gradients so nothing left over from an earlier
    # (possibly interrupted) epoch leaks into the first step.
    optimizer.zero_grad()

    accum_steps = config.accumulate_grad_batches
    total_loss = 0.0
    num_batches = 0
    pending = 0  # batches back-propagated since the last optimizer step

    pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{config.epochs}')
    for batch in pbar:
        # Move to device
        for key in batch:
            batch[key] = _to_device(batch[key])

        with autocast(enabled=config.use_amp):
            outputs = model(batch['image'])
            loss, _ = criterion(outputs, batch)
            # Scale down so the accumulated gradient is the window average.
            loss = loss / accum_steps

        if scaler is not None:
            scaler.scale(loss).backward()
        else:
            loss.backward()
        pending += 1

        if pending >= accum_steps:
            _optimizer_step()
            pending = 0

        batch_loss = loss.item() * accum_steps  # undo the scaling for reporting
        total_loss += batch_loss
        num_batches += 1
        pbar.set_postfix({'loss': f"{batch_loss:.4f}"})

    # Flush a trailing partial window. Without this, the last
    # (len(loader) % accum_steps) batches never update the weights and their
    # gradients carry over into the next epoch. The flushed gradient is still
    # divided by accum_steps, so this final step is proportionally smaller.
    if pending:
        _optimizer_step()

    scheduler.step()
    mean_loss = total_loss / num_batches if num_batches else float('nan')
    return {'total': mean_loss}

def validate_epoch(model, valid_loader, criterion, config):
    """Return the mean criterion loss over ``valid_loader`` without gradients.

    The model is switched to eval mode. Tensor values in each batch dict (and
    tensors inside list values) are moved to ``config.device`` in place.

    Returns:
        ``{'loss': mean of the per-batch loss values}``.

    Raises:
        ZeroDivisionError: if ``valid_loader`` yields no batches.
    """
    def _on_device(value):
        # Tensors and lists of tensors are moved; anything else is returned as is.
        if torch.is_tensor(value):
            return value.to(config.device)
        if isinstance(value, list):
            return [elem.to(config.device) if torch.is_tensor(elem) else elem for elem in value]
        return value

    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in tqdm(valid_loader, desc='Validation'):
            for name in list(batch):
                batch[name] = _on_device(batch[name])

            predictions = model(batch['image'])
            batch_loss, _ = criterion(predictions, batch)
            batch_losses.append(batch_loss.item())

    return {'loss': sum(batch_losses) / len(batch_losses)}

# Confirmation message printed once the cell defining train_epoch / validate_epoch has run.
print("✅ Training functions ready")

✅ Training functions ready


In [8]:
# ===== 셀 8: Training Loop =====
print("="*60)
print("Mask2Former 학습 시작")
print("="*60)

# Per-epoch loss curves and the best validation loss seen so far.
history = {'train_loss': [], 'val_loss': []}
best_loss = float('inf')
# Built once; the best checkpoint is overwritten in place.
best_checkpoint_path = config.save_dir / f'{config.experiment_name}_best.pth'

for epoch in range(config.epochs):
    print(f"\nEpoch {epoch+1}/{config.epochs}")

    if epoch == config.unfreeze_epoch and config.freeze_blade_initially:
        print("🔓 Unfreezing Blade Head")
        for param in model.blade_head.parameters():
            param.requires_grad = True
        # Only add parameters the optimizer does not already own. Re-running
        # this cell in the same kernel reaches this epoch again, and
        # add_param_group raises if a parameter appears in two groups.
        already_optimized = {id(p) for group in optimizer.param_groups for p in group['params']}
        new_blade_params = [p for p in model.blade_head.parameters() if id(p) not in already_optimized]
        if new_blade_params:
            optimizer.add_param_group({'params': new_blade_params, 'lr': config.learning_rate * 0.1})

    train_metrics = train_epoch(model, train_loader, criterion, optimizer, scheduler, scaler, config, epoch)
    val_metrics = validate_epoch(model, valid_loader, criterion, config)

    history['train_loss'].append(train_metrics['total'])
    history['val_loss'].append(val_metrics['loss'])

    print(f"Train Loss: {train_metrics['total']:.4f}, Val Loss: {val_metrics['loss']:.4f}")

    # Keep only the checkpoint with the lowest validation loss.
    if val_metrics['loss'] < best_loss:
        best_loss = val_metrics['loss']
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'best_loss': best_loss
        }, best_checkpoint_path)
        print("✅ Best model saved!")

print(f"\n학습 완료! Best loss: {best_loss:.4f}")

Mask2Former 학습 시작

Epoch 1/30


Epoch 1/30:   0%|          | 0/2352 [00:00<?, ?it/s]


RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x640000 and 6400x1)