<a href="https://colab.research.google.com/github/Kwannn666/DL_Report2/blob/main/DL_Report2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# 📦 安裝套件與清理舊資料
!pip install -q torchvision lightning

# ✅ 清除殘留資料（避免解壓時交互式提示）
!rm -rf data
!rm -rf imagenette2-160.tgz VOCtrainval_11-May-2012.tar val2017.zip annotations_trainval2017.zip

import os
import shutil
import random
import json
import glob
from pathlib import Path
from tqdm import tqdm

# 固定亂數種子
random.seed(42)

# ✅ 建立主結構
base_dir = "data"
os.makedirs(base_dir, exist_ok=True)

for name in ["mini_coco_det", "mini_voc_seg", "imagenette_160"]:
    for sub in ["train", "val"]:
        os.makedirs(os.path.join(base_dir, name, sub), exist_ok=True)

# ✅ 下載資料
!wget https://s3.amazonaws.com/fast-ai-imageclas/imagenette2-160.tgz
!tar -xf imagenette2-160.tgz -C data/
!mv data/imagenette2-160 data/imagenette_160_raw

!wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
!tar -xf VOCtrainval_11-May-2012.tar -C data/

!wget http://images.cocodataset.org/zips/val2017.zip
!unzip -q val2017.zip -d data/mini_coco_det/train

!wget http://images.cocodataset.org/annotations/annotations_trainval2017.zip
!unzip -q annotations_trainval2017.zip -d data/tmp
!mv data/tmp/annotations/instances_val2017.json data/mini_coco_det/annotations/
!rm -r data/tmp


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m51.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m39.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m211.5/211.5 MB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m56.3/56.3 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m127.9/127.9 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# 建立正確的目錄（如果還沒建立）
!mkdir -p data/mini_coco_det/annotations

# 將標註檔搬到正式目錄中
!mv data/tmp/annotations/instances_val2017.json data/mini_coco_det/annotations/

# 刪除暫存資料夾
!rm -r data/tmp



mv: cannot stat 'data/tmp/annotations/instances_val2017.json': No such file or directory
rm: cannot remove 'data/tmp': No such file or directory


In [None]:
# =============================
# 🔄 清除舊的目標資料夾（不刪原始解壓檔）
# =============================
for subdir in ["data/imagenette_160", "data/mini_voc_seg", "data/mini_coco_det"]:
    if os.path.exists(subdir):
        shutil.rmtree(subdir)
    os.makedirs(subdir, exist_ok=True)


In [9]:
!unzip -q annotations_trainval2017.zip -d data/tmp


In [10]:
!ls data/tmp/annotations/


captions_train2017.json   instances_val2017.json
captions_val2017.json	  person_keypoints_train2017.json
instances_train2017.json  person_keypoints_val2017.json


In [11]:
!mkdir -p data/mini_coco_det/annotations
!mv data/tmp/annotations/instances_val2017.json data/mini_coco_det/annotations/
!rm -r data/tmp


In [8]:
import os
print(os.path.exists("data/mini_coco_det/annotations/instances_val2017.json"))


False


In [12]:
import os, shutil, random, json, glob
from pathlib import Path
from tqdm import tqdm

random.seed(42)


# =============================
# 1️⃣ Imagenette 抽樣與複製
# =============================
imagenette_src = Path("data/imagenette_160_raw/train")
imagenette_dst_train = Path("data/imagenette_160/train")
imagenette_dst_val = Path("data/imagenette_160/val")
imagenette_dst_train.mkdir(parents=True, exist_ok=True)
imagenette_dst_val.mkdir(parents=True, exist_ok=True)

all_imgs = list(imagenette_src.rglob("*.JPEG"))
print(f"📦 找到 Imagenette 原始圖片數量：{len(all_imgs)}")
if len(all_imgs) < 300:
    raise ValueError("❌ Imagenette 圖片數不足 300 張，請確認是否正確解壓 imagenette2-160.tgz")

selected = random.sample(all_imgs, 300)
train_imgs, val_imgs = selected[:240], selected[240:]

for img in tqdm(train_imgs, desc="Imagenette Train"):
    shutil.copy(img, imagenette_dst_train / img.name)
for img in tqdm(val_imgs, desc="Imagenette Val"):
    shutil.copy(img, imagenette_dst_val / img.name)

# =============================
# 2️⃣ VOC Segmentation 抽樣與遮罩對應
# =============================
voc_img_dir = Path("data/VOCdevkit/VOC2012/JPEGImages")
voc_mask_dir = Path("data/VOCdevkit/VOC2012/SegmentationClass")
voc_dst_train = Path("data/mini_voc_seg/train")
voc_dst_val = Path("data/mini_voc_seg/val")
voc_dst_train.mkdir(parents=True, exist_ok=True)
voc_dst_val.mkdir(parents=True, exist_ok=True)

all_voc_images = list(voc_img_dir.glob("*.jpg"))
mask_names = {p.stem for p in voc_mask_dir.glob("*.png")}
valid_voc_images = [img for img in all_voc_images if img.stem in mask_names]

print(f"📦 可用 VOC 圖片（含遮罩）數量：{len(valid_voc_images)}")
if len(valid_voc_images) < 300:
    raise ValueError("❌ VOC 可用圖片數不足 300 張，請確認 VOCtrainval 是否正確解壓")

selected_voc = random.sample(valid_voc_images, 300)
voc_train = selected_voc[:240]
voc_val = selected_voc[240:]

def copy_voc(img_list, target_dir):
    for img in tqdm(img_list, desc=f"VOC → {target_dir.name}"):
        mask = voc_mask_dir / (img.stem + ".png")
        if mask.exists():
            shutil.copy(img, target_dir / img.name)
            shutil.copy(mask, target_dir / mask.name)

copy_voc(voc_train, voc_dst_train)
copy_voc(voc_val, voc_dst_val)

# =============================
# 3️⃣ COCO Detection 抽樣與標註過濾
# =============================
coco_img_dir = Path("data/mini_coco_det/train/val2017")
coco_dst_train = Path("data/mini_coco_det/train")
coco_dst_val = Path("data/mini_coco_det/val")
coco_dst_train.mkdir(parents=True, exist_ok=True)
coco_dst_val.mkdir(parents=True, exist_ok=True)

all_coco_images = list(coco_img_dir.glob("*.jpg"))
print(f"📦 找到 COCO val2017 圖片數量：{len(all_coco_images)}")
if len(all_coco_images) < 300:
    raise ValueError("❌ COCO 圖片不足 300 張，請確認 val2017 是否正確解壓")

selected_coco = random.sample(all_coco_images, 300)
coco_train, coco_val = selected_coco[:240], selected_coco[240:]

for img in tqdm(coco_train, desc="COCO Train"):
    shutil.copy(img, coco_dst_train / img.name)
for img in tqdm(coco_val, desc="COCO Val"):
    shutil.copy(img, coco_dst_val / img.name)

# 指定 COCO 標註檔
coco_ann_path = Path("data/mini_coco_det/annotations/instances_val2017.json")
if not coco_ann_path.exists():
    raise FileNotFoundError("❌ 找不到 COCO 標註檔 instances_val2017.json，請確認是否正確搬移")

with open(coco_ann_path) as f:
    coco_ann = json.load(f)

valid_img_names = {p.name for p in selected_coco}
valid_img_ids = set()
filtered_images = []
for img in coco_ann["images"]:
    if img["file_name"] in valid_img_names:
        filtered_images.append(img)
        valid_img_ids.add(img["id"])

filtered_anns = [ann for ann in coco_ann["annotations"] if ann["image_id"] in valid_img_ids]
filtered_coco = {
    "info": coco_ann.get("info", {}),
    "licenses": coco_ann.get("licenses", []),
    "categories": coco_ann["categories"],
    "images": filtered_images,
    "annotations": filtered_anns
}

# 儲存過濾後的標註 JSON
mini_json_path = Path("data/mini_coco_det/annotations/instances_val2017.json")
with open(mini_json_path, "w") as f:
    json.dump(filtered_coco, f)

# 移除原始 val2017
shutil.rmtree(coco_img_dir)

print("\n✅ 所有 Mini Dataset 準備完成！三組資料各包含 train 240 張、val 60 張。")


📦 找到 Imagenette 原始圖片數量：9469


Imagenette Train: 100%|██████████| 240/240 [00:00<00:00, 6056.25it/s]
Imagenette Val: 100%|██████████| 60/60 [00:00<00:00, 5459.44it/s]


📦 可用 VOC 圖片（含遮罩）數量：2913


VOC → train: 100%|██████████| 240/240 [00:00<00:00, 954.48it/s]
VOC → val: 100%|██████████| 60/60 [00:00<00:00, 889.25it/s]


📦 找到 COCO val2017 圖片數量：5000


COCO Train: 100%|██████████| 240/240 [00:00<00:00, 2719.51it/s]
COCO Val: 100%|██████████| 60/60 [00:00<00:00, 2454.56it/s]



✅ 所有 Mini Dataset 準備完成！三組資料各包含 train 240 張、val 60 張。


In [17]:
# 🔄 清空原有 VOC 輸出資料夾
!rm -rf data/mini_voc_seg/train/*
!rm -rf data/mini_voc_seg/val/*

# 🟩 重建 VOC train/val
voc_img_dir = Path("data/VOCdevkit/VOC2012/JPEGImages")
voc_mask_dir = Path("data/VOCdevkit/VOC2012/SegmentationClass")
voc_dst_train = Path("data/mini_voc_seg/train")
voc_dst_val = Path("data/mini_voc_seg/val")
voc_dst_train.mkdir(parents=True, exist_ok=True)
voc_dst_val.mkdir(parents=True, exist_ok=True)

all_voc_images = list(voc_img_dir.glob("*.jpg"))
mask_names = {p.stem for p in voc_mask_dir.glob("*.png")}
valid_voc_images = [img for img in all_voc_images if img.stem in mask_names]

print(f"📦 可用 VOC 圖片（含遮罩）數量：{len(valid_voc_images)}")
selected_voc = random.sample(valid_voc_images, 150)
voc_train = selected_voc[:120]
voc_val = selected_voc[120:]

def copy_voc(img_list, target_dir):
    for img in tqdm(img_list, desc=f"VOC → {target_dir.name}"):
        mask = voc_mask_dir / (img.stem + ".png")
        if mask.exists():
            shutil.copy(img, target_dir / img.name)
            shutil.copy(mask, target_dir / mask.name)

copy_voc(voc_train, voc_dst_train)
copy_voc(voc_val, voc_dst_val)


📦 可用 VOC 圖片（含遮罩）數量：2913


VOC → train: 100%|██████████| 120/120 [00:00<00:00, 1407.40it/s]
VOC → val: 100%|██████████| 30/30 [00:00<00:00, 588.25it/s]


In [14]:
import shutil, os

# 🔄 刪除整個 train/val 資料夾
shutil.rmtree("data/mini_voc_seg/train", ignore_errors=True)
shutil.rmtree("data/mini_voc_seg/val", ignore_errors=True)

# ✅ 重新建立資料夾
os.makedirs("data/mini_voc_seg/train", exist_ok=True)
os.makedirs("data/mini_voc_seg/val", exist_ok=True)


In [19]:
from pathlib import Path

def count_images(folder):
    return len(list(Path(folder).glob("*.jpg"))) + len(list(Path(folder).glob("*.jpeg"))) + len(list(Path(folder).glob("*.png")))+ len(list(Path(folder).glob("*.JPEG")))

print("COCO Train:", count_images("data/mini_coco_det/train"))
print("COCO Val:", count_images("data/mini_coco_det/val"))

print("VOC Train:", count_images("data/mini_voc_seg/train"))
print("VOC Val:", count_images("data/mini_voc_seg/val"))

print("Imagenette Train:", count_images("data/imagenette_160/train"))
print("Imagenette Val:", count_images("data/imagenette_160/val"))


COCO Train: 240
COCO Val: 60
VOC Train: 240
VOC Val: 60
Imagenette Train: 240
Imagenette Val: 60


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [18]:
#顯示三個任務資料夾的總檔案大小
def get_folder_size_mb(path):
    total_size = 0
    for dirpath, _, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            if os.path.isfile(fp):
                total_size += os.path.getsize(fp)
    return round(total_size / (1024 * 1024), 2)  # 轉成 MB 並四捨五入

# 三個資料夾路徑
folders = {
    "mini_coco_det": "data/mini_coco_det",
    "mini_voc_seg": "data/mini_voc_seg",
    "imagenette_160": "data/imagenette_160"
}

# 印出大小
print("📦 各資料夾總檔案大小（MB）")
for name, path in folders.items():
    size_mb = get_folder_size_mb(path)
    print(f"{name.ljust(20)}: {size_mb} MB")


📦 各資料夾總檔案大小（MB）
mini_coco_det       : 48.53 MB
mini_voc_seg        : 17.08 MB
imagenette_160      : 2.27 MB


In [20]:
!pip install -q thop

In [21]:
import torch
import torch.nn as nn
import torchvision.models as models
from torchvision.ops.feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
from thop import profile
import time

In [22]:
# 🧠 統一模型定義
class UnifiedModel(nn.Module):
    def __init__(self, num_classes_cls=10, num_classes_det=10, num_classes_seg=21):
        super(UnifiedModel, self).__init__()

        # 🔗 Backbone: MobileNetV3 Small
        backbone = models.mobilenet_v3_small(weights=models.MobileNet_V3_Small_Weights.IMAGENET1K_V1)
        self.backbone = backbone.features

        # 🔗 FPN Neck：接 3 層輸出 (channel=24, 48, 96)
        self.fpn = FeaturePyramidNetwork(
           in_channels_list=[24, 40, 576],  # ← 修正這裡
           out_channels=128,
           extra_blocks=LastLevelMaxPool()
        )


        # 🧠 Shared conv head
        self.shared_head = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(),
            nn.Conv2d(128, 128, kernel_size=3, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )

        # 🎯 Output heads
        self.classifier = nn.Linear(128, num_classes_cls)         # 分類
        self.det_head = nn.Conv2d(128, num_classes_det * 5, 1)    # 偵測 (class + bbox)
        self.seg_head = nn.Conv2d(128, num_classes_seg, 1)        # 分割

    def forward(self, x):
        feats = []
        for i, layer in enumerate(self.backbone):
            x = layer(x)
            if i in [3, 6, 12]:
                feats.append(x)

        fpn_feats = self.fpn({str(i): f for i, f in enumerate(feats)})
        fpn_out = list(fpn_feats.values())[0]

        shared = self.shared_head(fpn_out)
        B, C, H, W = shared.shape

        cls_logits = self.classifier(torch.mean(shared.view(B, C, -1), dim=2))
        det_raw = self.det_head(shared)
        det_output = det_raw.permute(0, 2, 3, 1).reshape(B, -1, 5)
        seg_mask = self.seg_head(shared)

        return cls_logits, det_output, seg_mask

# 📦 參數數量統計
def count_params(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

# ⏱️ 推論時間測量（ms）
def measure_inference_time(model, input_shape=(1, 3, 512, 512), device="cpu"):
    x = torch.randn(input_shape).to(device)
    model.eval()
    with torch.no_grad():
        for _ in range(5):  # warm-up
            _ = model(x)
        start = time.time()
        for _ in range(10):
            _ = model(x)
        end = time.time()
    return (end - start) / 10 * 1000  # 單位: ms

# 🚀 建立與測試
device = "cuda" if torch.cuda.is_available() else "cpu"
model = UnifiedModel().to(device)
params = count_params(model)
inference_ms = measure_inference_time(model, device=device)

# ✅ 顯示結果
print(f"✅ 模型參數數量：{params:,} 個")
print(f"✅ 單張推論時間：{inference_ms:.2f} ms")


Downloading: "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v3_small-047dcff4.pth
100%|██████████| 9.83M/9.83M [00:00<00:00, 78.9MB/s]


✅ 模型參數數量：1,758,193 個
✅ 單張推論時間：29.23 ms


In [23]:
import os
import json
import torch
import numpy as np
from pathlib import Path
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.datasets import ImageFolder
from PIL import Image

# ----------- Segmentation (VOC) Dataset -----------
class VOCSegmentationDataset(Dataset):
    def __init__(self, image_dir, transform=None, target_transform=None):
        self.image_paths = sorted([os.path.join(image_dir, f) for f in os.listdir(image_dir) if f.endswith('.jpg')])
        self.mask_paths = [p.replace('.jpg', '.png') for p in self.image_paths]
        self.transform = transform
        self.target_transform = target_transform

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        img = Image.open(self.image_paths[idx]).convert('RGB')
        mask = Image.open(self.mask_paths[idx]).convert('L')  # segmentation mask
        if self.transform:
            img = self.transform(img)
        if self.target_transform:
            mask = self.target_transform(mask)
        mask = mask.long()
        # --- 防呆：只允許 0~20/255 ---
        mask = mask.clone()
        mask[(mask > 20) & (mask != 255)] = 255
        return img, mask

# ----------- Detection (COCO mini) Dataset -----------
class MiniCocoDetection(Dataset):
    def __init__(self, img_dir, ann_path, transform=None):
        self.img_dir = Path(img_dir)
        self.transform = transform

        with open(ann_path) as f:
            coco = json.load(f)

        self.img_id_to_filename = {img['id']: img['file_name'] for img in coco['images']}
        self.ann_by_img = {img_id: [] for img_id in self.img_id_to_filename}
        for ann in coco['annotations']:
            self.ann_by_img[ann['image_id']].append(ann)

        self.ids = list(self.img_id_to_filename.keys())

    def __len__(self):
        return len(self.ids)

    def __getitem__(self, idx):
        img_id = self.ids[idx]
        file_name = self.img_id_to_filename[img_id]
        img_path = self.img_dir / file_name
        img = Image.open(img_path).convert("RGB")
        anns = self.ann_by_img[img_id]

        boxes = []
        for ann in anns:
            x, y, w, h = ann['bbox']
            boxes.append([x, y, x + w, y + h])
        boxes = torch.tensor(boxes, dtype=torch.float32)
        if self.transform:
            img = self.transform(img)
        # 目標格式為 (N, 5): cx, cy, w, h, 1
        targets = []
        for box in boxes:
            cx = (box[0] + box[2]) / 2
            cy = (box[1] + box[3]) / 2
            w = box[2] - box[0]
            h = box[3] - box[1]
            targets.append([cx, cy, w, h, 1])  # 1 for conf
        targets = torch.tensor(targets, dtype=torch.float32)
        return img, targets

# ----------- Classification (Imagenette) Dataset -----------
def build_cls_dataset(root_dir, input_size=512):
    transform = transforms.Compose([transforms.Resize((input_size, input_size)), transforms.ToTensor()])
    train_set = ImageFolder(root=Path(root_dir) / "train", transform=transform)
    val_set = ImageFolder(root=Path(root_dir) / "val", transform=transform)
    return train_set, val_set


In [24]:
det_train = MiniCocoDetection(
    "data/mini_coco_det/train",
    "data/mini_coco_det/annotations/mini_instances_val2017.json",
    transform=det_tf
)

NameError: name 'det_tf' is not defined

In [25]:
import torch.nn.functional as F
from tqdm import tqdm
from sklearn.metrics import average_precision_score

# mIoU 計算
def compute_mIoU(preds, targets, num_classes=21):
    preds = preds.cpu().numpy()
    targets = targets.cpu().numpy()
    ious = []
    for cls in range(num_classes):
        pred_inds = preds == cls
        target_inds = targets == cls
        intersection = (pred_inds & target_inds).sum()
        union = (pred_inds | target_inds).sum()
        if union == 0:
            continue
        ious.append(intersection / union)
    return np.mean(ious) if ious else 0

# Segmentation 訓練
def train_segmentation_stage1(model, train_dir, val_dir, epochs=10, batch_size=8, lr=1e-3, device="cuda"):
    input_size = 512
    transform = transforms.Compose([
        transforms.Resize((input_size, input_size)),
        transforms.ToTensor()
    ])
    mask_transform = transforms.Compose([
        transforms.Resize((input_size, input_size), interpolation=Image.NEAREST),
        transforms.PILToTensor()
    ])

    train_set = VOCSegmentationDataset(train_dir, transform, mask_transform)
    val_set = VOCSegmentationDataset(val_dir, transform, mask_transform)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_set, batch_size=1)

    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = torch.nn.CrossEntropyLoss(ignore_index=255)
    best_mIoU = 0

    for epoch in range(epochs):
        model.train()
        running_loss = 0
        for imgs, masks in tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}"):
            imgs, masks = imgs.to(device), masks.to(device)
            _, _, seg = model(imgs)
            seg_H, seg_W = seg.shape[2:]

            # === 強制修正 mask 的值範圍 ===
            if masks.ndim == 3:
                masks = masks.unsqueeze(1)
            masks = masks.float()
            masks = torch.nn.functional.interpolate(masks, size=(seg_H, seg_W), mode="nearest")
            masks = masks.squeeze(1).long()
            masks = masks.clone()
            masks[(masks > 20) & (masks != 255)] = 255

            loss = criterion(seg, masks)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
        print(f"🔧 Epoch {epoch+1} Loss: {running_loss/len(train_loader):.4f}")

        # Evaluate
        model.eval()
        all_preds, all_targets = [], []
        with torch.no_grad():
            for imgs, masks in val_loader:
                imgs, masks = imgs.to(device), masks.to(device)
                _, _, seg = model(imgs)
                seg_H, seg_W = seg.shape[2:]
                if masks.ndim == 3:
                    masks = masks.unsqueeze(1)
                masks = masks.float()
                masks = torch.nn.functional.interpolate(masks, size=(seg_H, seg_W), mode="nearest")
                masks = masks.squeeze(1).long()
                masks = masks.clone()
                masks[(masks > 20) & (masks != 255)] = 255

                preds = torch.argmax(seg, dim=1)
                all_preds.append(preds)
                all_targets.append(masks)
        all_preds = torch.cat(all_preds)
        all_targets = torch.cat(all_targets)
        mIoU = compute_mIoU(all_preds, all_targets)
        best_mIoU = max(best_mIoU, mIoU)
        print(f"📊 mIoU: {mIoU:.4f}")
    print(f"\n✅ Stage 1 完成！記錄 mIoU_base = {best_mIoU:.4f}")
    return best_mIoU

# mIoU for segmentation快速評估
def evaluate_segmentation(model, val_loader, device="cuda", num_samples=20):
    model.eval()
    preds, targets = [], []
    with torch.no_grad():
        for i, (imgs, masks) in enumerate(val_loader):
            if i >= num_samples:
                break
            imgs, masks = imgs.to(device), masks.to(device)
            _, _, seg = model(imgs)
            seg_H, seg_W = seg.shape[2:]
            if masks.ndim == 3:
                masks = masks.unsqueeze(1)
            masks = masks.float()
            masks = torch.nn.functional.interpolate(masks, size=(seg_H, seg_W), mode="nearest")
            masks = masks.squeeze(1).long()
            masks = masks.clone()
            masks[(masks > 20) & (masks != 255)] = 255

            pred = torch.argmax(seg, dim=1)
            preds.append(pred)
            targets.append(masks)
    preds = torch.cat(preds)
    targets = torch.cat(targets)
    return compute_mIoU(preds, targets)

# Classification評估
def evaluate_classification(model, val_loader, device="cuda"):
    model.eval()
    all_logits = []
    all_labels = []
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs = imgs.to(device)
            logits, _, _ = model(imgs)
            all_logits.append(logits.cpu())
            all_labels.append(labels)
    logits = torch.cat(all_logits)
    labels = torch.cat(all_labels)
    probs = F.softmax(logits, dim=1).numpy()
    preds = np.argmax(probs, axis=1)
    top1 = (preds == labels.numpy()).mean()
    try:
        mAP = average_precision_score(np.eye(probs.shape[1])[labels.numpy()], probs, average='macro')
    except:
        mAP = 0.0
    return mAP, top1


In [26]:
# ========== Warmup ========== #
def stage0_warmup(model, dummy_input_shape=(1, 3, 512, 512), device="cuda"):
    print("🔥 Stage 0: warm-up / ImageNet pretrain (可跳過，已加載權重)")
    model = model.to(device)
    model.eval()
    x = torch.randn(dummy_input_shape).to(device)
    with torch.no_grad():
        _ = model(x)
    print("✅ Warm-up 完成")

# ========== Segmentation 專訓 ========== #
def stage1_segmentation(model, train_dir, val_dir, device="cuda"):
    print("🎯 Stage 1: segmentation 訓練中（只更新 seg_head）")
    mIoU_base = train_segmentation_stage1(model, train_dir, val_dir, device=device)
    return mIoU_base

# ========== Detection 專訓 ========== #
def stage2_detection(model, train_loader, val_loader, mIoU_base, device="cuda"):
    print("🔎 Stage 2: detection-only 訓練中")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    model = model.to(device)
    loss_fn = torch.nn.MSELoss()  # (簡化) 可用 L1+conf BCE
    model.train()
    for epoch in range(3):
        for imgs, det_targets in train_loader:
            imgs = imgs.to(device)
            det_targets = det_targets.to(device)
            _, det_out, _ = model(imgs)
            loss = loss_fn(det_out, det_targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        print(f"Epoch {epoch+1}/3 Done.")
    # Segmentation再評估一次
    mIoU_after_det = evaluate_segmentation(model, val_loader, device=device)
    mIoU_drop = max(0, mIoU_base - mIoU_after_det)
    print(f"📉 mIoU_drop: {mIoU_drop:.4f}")
    return mIoU_drop

# ========== Classification 專訓 ========== #
def stage3_classification(model, train_loader, val_loader, mIoU_base, mAP_base, Top1_base, device="cuda"):
    print("📘 Stage 3: classification-only 訓練中")
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
    criterion = torch.nn.CrossEntropyLoss()
    model = model.to(device)
    model.train()
    for epoch in range(3):
        for imgs, labels in train_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            cls_logits, _, _ = model(imgs)
            loss = criterion(cls_logits, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
    # 評估 Seg / Det / Cls 效能
    mIoU_now = evaluate_segmentation(model, val_loader, device=device)
    mIoU_drop = max(0, mIoU_base - mIoU_now)
    mAP_now, top1_now = evaluate_classification(model, val_loader, device=device)
    mAP_drop = max(0, mAP_base - mAP_now)
    top1_drop = max(0, Top1_base - top1_now)
    print(f"📉 mIoU_drop: {mIoU_drop:.4f}, mAP_drop: {mAP_drop:.4f}, Top1_drop: {top1_drop:.4f}")
    return mIoU_drop, mAP_drop, top1_drop


In [28]:
# === Step 1: 初始化模型 ===
model = UnifiedModel()

# === Step 2: Stage 0 Warm-up（可選）===
stage0_warmup(model)

# === Step 3: Stage 1 segmentation（VOC）===
mIoU_base = stage1_segmentation(model, train_dir="data/mini_voc_seg/train", val_dir="data/mini_voc_seg/val")

# === Step 4: Stage 2 detection（COCO）===
det_tf = transforms.Compose([transforms.Resize((512, 512)), transforms.ToTensor()])
det_train = MiniCocoDetection("data/mini_coco_det/train", "data/mini_coco_det/annotations/mini_instances_val2017.json", transform=det_tf)
det_val = VOCSegmentationDataset("data/mini_voc_seg/val", det_tf, None)
mIoU_drop = stage2_detection(model, DataLoader(det_train, batch_size=8), DataLoader(det_val, batch_size=1), mIoU_base)

# === Step 5: Stage 3 classification（Imagenette）===
cls_train, cls_val = build_cls_dataset("data/imagenette_160")
mAP_base, Top1_base = evaluate_classification(model, DataLoader(cls_val, batch_size=8))
mIoU_drop_3, mAP_drop, Top1_drop = stage3_classification(
    model, DataLoader(cls_train, batch_size=8), DataLoader(cls_val, batch_size=8), mIoU_base, mAP_base, Top1_base
)


🔥 Stage 0: warm-up / ImageNet pretrain (可跳過，已加載權重)
✅ Warm-up 完成
🎯 Stage 1: segmentation 訓練中（只更新 seg_head）


Epoch 1/10: 100%|██████████| 15/15 [00:03<00:00,  4.18it/s]


🔧 Epoch 1 Loss: 1.6657
📊 mIoU: 0.0804


Epoch 2/10: 100%|██████████| 15/15 [00:03<00:00,  4.39it/s]


🔧 Epoch 2 Loss: 0.6541
📊 mIoU: 0.1903


Epoch 3/10: 100%|██████████| 15/15 [00:02<00:00,  6.22it/s]


🔧 Epoch 3 Loss: 0.3023
📊 mIoU: 0.2378


Epoch 4/10: 100%|██████████| 15/15 [00:02<00:00,  6.32it/s]


🔧 Epoch 4 Loss: 0.1986
📊 mIoU: 0.2240


Epoch 5/10: 100%|██████████| 15/15 [00:02<00:00,  5.49it/s]


🔧 Epoch 5 Loss: 0.1440
📊 mIoU: 0.2320


Epoch 6/10: 100%|██████████| 15/15 [00:03<00:00,  4.63it/s]


🔧 Epoch 6 Loss: 0.0838
📊 mIoU: 0.2432


Epoch 7/10: 100%|██████████| 15/15 [00:02<00:00,  6.41it/s]


🔧 Epoch 7 Loss: 0.0563
📊 mIoU: 0.2560


Epoch 8/10: 100%|██████████| 15/15 [00:02<00:00,  6.54it/s]


🔧 Epoch 8 Loss: 0.0391
📊 mIoU: 0.2645


Epoch 9/10: 100%|██████████| 15/15 [00:02<00:00,  5.04it/s]


🔧 Epoch 9 Loss: 0.0328
📊 mIoU: 0.2712


Epoch 10/10: 100%|██████████| 15/15 [00:02<00:00,  6.25it/s]


🔧 Epoch 10 Loss: 0.0261
📊 mIoU: 0.2735

✅ Stage 1 完成！記錄 mIoU_base = 0.2735
🔎 Stage 2: detection-only 訓練中


RuntimeError: stack expects each tensor to be equal size, but got [19, 5] at entry 0 and [14, 5] at entry 1

In [27]:
import os, json

img_dir = "data/mini_coco_det/train"
json_path = "data/mini_coco_det/annotations/instances_val2017.json"
mini_json_path = "data/mini_coco_det/annotations/mini_instances_val2017.json"

with open(json_path) as f:
    ann = json.load(f)
img_files = {f for f in os.listdir(img_dir) if f.endswith('.jpg')}
filtered_images = []
valid_img_ids = set()

for img in ann['images']:
    if img['file_name'] in img_files:
        filtered_images.append(img)
        valid_img_ids.add(img['id'])

filtered_anns = [a for a in ann['annotations'] if a['image_id'] in valid_img_ids]
mini_ann = {
    "info": ann.get('info', {}),
    "licenses": ann.get('licenses', []),
    "categories": ann['categories'],
    "images": filtered_images,
    "annotations": filtered_anns,
}
with open(mini_json_path, "w") as f:
    json.dump(mini_ann, f)
print(f"已寫入 mini_instances_val2017.json，共有 {len(filtered_images)} 張圖")


已寫入 mini_instances_val2017.json，共有 240 張圖


In [29]:
import json
from pathlib import Path

# 路徑
img_dir = Path("data/mini_coco_det/train")
json_path = "data/mini_coco_det/annotations/mini_instances_val2017.json"

# 讀 JSON
with open(json_path) as f:
    coco = json.load(f)

json_files = set(img['file_name'] for img in coco['images'])
real_files = set(p.name for p in img_dir.glob("*.jpg"))

missing_files = json_files - real_files
print(f"🚨 JSON 中缺失的圖片：{len(missing_files)}")
for f in list(missing_files)[:10]:
    print(" -", f)


🚨 JSON 中缺失的圖片：0
