# Task A - All-in-One Notebook: Dynamic Convolution Module with Full Pipeline

# 載入套件

In [1]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from PIL import Image
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from thop import profile
from torch.cuda.amp import GradScaler, autocast
import warnings
warnings.filterwarnings("ignore", category=FutureWarning)


# 自定義通道選擇設定

In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
channel_dict = {"R": 0, "G": 1, "B": 2}

def get_channel_mask(combo, batch_size):
    mask = torch.zeros((batch_size, 3))
    for c in combo:
        mask[:, channel_dict[c]] = 1
    return mask.to(device)

# 資料集設定

In [3]:
class ImageNetMiniDataset(Dataset):
    def __init__(self, txt_file, img_dir, transform=None):
        self.img_labels = []
        self.img_dir = img_dir
        self.transform = transform
        with open(txt_file, 'r') as f:
            for line in f:
                path, label = line.strip().split()
                self.img_labels.append((path, int(label)))

    def __len__(self):
        return len(self.img_labels)

    def __getitem__(self, idx):
        img_path, label = self.img_labels[idx]
        image = Image.open(os.path.join(self.img_dir, img_path)).convert("RGB")
        if self.transform:
            image = self.transform(image)
        return image, label


# 動態捲積

In [4]:
class DynamicConv(nn.Module):
    def __init__(self, max_in_channels, out_channels, hidden_dim=64, kernel_size=3):
        super().__init__()
        self.max_in_channels = max_in_channels
        self.out_channels = out_channels
        self.kernel_size = kernel_size
        self.weight_gen = nn.Sequential(
            nn.Linear(max_in_channels, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, out_channels * max_in_channels * kernel_size * kernel_size)
        )
        self.bias = nn.Parameter(torch.zeros(out_channels))

    def forward(self, x, channel_mask):
        B, C, H, W = x.size()
        padded = torch.zeros((B, self.max_in_channels, H, W), device=x.device)
        padded[:, :C] = x
        weights = self.weight_gen(channel_mask)
        weights = weights.view(B, self.out_channels, self.max_in_channels, self.kernel_size, self.kernel_size)
        out = []
        for i in range(B):
            weight_i = weights[i, :, :C, :, :]
            out_i = F.conv2d(x[i:i+1], weight_i, bias=self.bias, padding=self.kernel_size // 2)
            out.append(out_i)
        return torch.cat(out, dim=0)

# 分類器

In [5]:
class ToyClassifier(nn.Module):
    def __init__(self, conv_layer, num_classes, feature_dim=16):
        super().__init__()
        self.conv = conv_layer
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.fc = nn.Linear(feature_dim, num_classes)

    def forward(self, x, channel_mask):
        x = self.conv(x, channel_mask)
        x = self.pool(x).squeeze(-1).squeeze(-1)
        return self.fc(x)

# Baseline CNN

In [6]:
class StaticConvNet(nn.Module):
    def __init__(self, num_classes=50):
        super().__init__()
        self.net = nn.Sequential(
            nn.Conv2d(3, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d(1),
            nn.Flatten(),
            nn.Linear(64, num_classes)
        )

    def forward(self, x):
        return self.net(x)

@torch.no_grad()
def evaluate_baseline(model, dataloader):
    model.eval()
    correct = total = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        pred = model(x).argmax(dim=1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return 100 * correct / total

# 評估函數

In [7]:
@torch.no_grad()
def evaluate(model, dataloader, combo):
    model.eval()
    correct = total = 0
    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        channel_mask = get_channel_mask(combo, x.size(0))
        pred = model(x, channel_mask).argmax(dim=1)
        correct += (pred == y).sum().item()
        total += y.size(0)
    return 100 * correct / total

# 程式主流程

In [8]:
def run_baseline(img_dir, transform, epochs=4):
    print("⇒ Training Static Baseline (RGB only)")
    # 資料
    train_ds = ImageNetMiniDataset(os.path.join(img_dir,"train.txt"), img_dir, transform)
    val_ds   = ImageNetMiniDataset(os.path.join(img_dir,"val.txt"),   img_dir, transform)
    test_ds  = ImageNetMiniDataset(os.path.join(img_dir,"test.txt"),  img_dir, transform)
    train_loader = DataLoader(train_ds, batch_size=128, shuffle=True)
    val_loader   = DataLoader(val_ds,   batch_size=128)
    test_loader  = DataLoader(test_ds,  batch_size=128)

    model = StaticConvNet(num_classes=50).to(device)
    opt   = torch.optim.Adam(model.parameters(), lr=1e-3)
    loss_fn = nn.CrossEntropyLoss()

    for ep in range(epochs):
        model.train()
        for x, y in tqdm(train_loader, desc=f"Baseline Epoch {ep+1}"):
            x, y = x.to(device), y.to(device)
            opt.zero_grad()
            loss_fn(model(x), y).backward()
            opt.step()

    val_acc  = evaluate_baseline(model, val_loader)
    test_acc = evaluate_baseline(model, test_loader)
    # FLOPs / Params
    dummy = torch.randn(1,3,32,32).to(device)
    flops, params = profile(model, inputs=(dummy,), verbose=False)
    return {
        "combo":"RGB", "model":"StaticBaseline",
        "hidden_dim":None, "out_channels":None,
        "val_acc":val_acc, "test_acc":test_acc,
        "FLOPs(M)":flops/1e6, "Params(K)":params/1e3
    }

def run_flexible_dynamicconv_combinations(img_dir, transform, hyper_configs, num_classes=50, epochs=4):
    channel_combos = ["RGB", "RG", "RB", "GB", "R", "G", "B"]
    results = []

    for hidden_dim, out_ch in hyper_configs:
        print(f"\nTraining: hidden_dim={hidden_dim}, out_channels={out_ch}")
        # 構建模型
        conv = DynamicConv(max_in_channels=3, out_channels=out_ch, hidden_dim=hidden_dim)
        model = ToyClassifier(conv, num_classes=num_classes, feature_dim=out_ch).to(device)

        # 讀資料
        train_set = ImageNetMiniDataset(os.path.join(img_dir, "train.txt"), img_dir, transform)
        train_loader = DataLoader(train_set, batch_size=128, shuffle=True)

        optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
        loss_fn = nn.CrossEntropyLoss()
        scaler = GradScaler()

        # 訓練
        for epoch in range(epochs):
            model.train()
            pbar = tqdm(train_loader, desc=f"Train Epoch {epoch+1}")
            for x, y in pbar:
                x, y = x.to(device), y.to(device)
                optimizer.zero_grad()
                with autocast():
                    mask = get_channel_mask("RGB", x.size(0))
                    pred = model(x, mask)
                    loss = loss_fn(pred, y)
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
                pbar.set_postfix({"loss": f"{loss.item():.4f}"})

        # 對每個通道組合分別算 val_acc 和 test_acc
        for combo in channel_combos:
            # 建立該 combo 的 val loader
            val_set_combo  = ImageNetMiniDataset(os.path.join(img_dir, "val.txt"), img_dir, transform)
            val_loader_combo = DataLoader(val_set_combo, batch_size=128)
            val_acc_combo = evaluate(model, val_loader_combo, combo)

            # 建立該 combo 的 test loader
            test_set_combo  = ImageNetMiniDataset(os.path.join(img_dir, "test.txt"), img_dir, transform)
            test_loader_combo = DataLoader(test_set_combo, batch_size=128)
            test_acc_combo = evaluate(model, test_loader_combo, combo)

            # 計算 FLOPs & Params
            dummy_input = torch.randn(1, len(combo), 32, 32).to(device)
            channel_mask = get_channel_mask(combo, 1)
            class WrappedModel(nn.Module):
                def __init__(self, model, channel_mask):
                    super().__init__()
                    self.model = model
                    self.channel_mask = channel_mask
                def forward(self, x):
                    return self.model(x, self.channel_mask)

            wrapped_model = WrappedModel(model, channel_mask)
            flops, params = profile(wrapped_model, inputs=(dummy_input,), verbose=False)

            # 收集結果
            results.append({
                "combo": combo,
                "hidden_dim": hidden_dim,
                "out_channels": out_ch,
                "model": "DynamicConv",
                "val_acc":  val_acc_combo,
                "test_acc": test_acc_combo,
                "FLOPs(M)": flops  / 1e6,
                "Params(K)": params / 1e3
            })

    df = pd.DataFrame(results)
    df.to_csv("flexible_dynamicconv_result.csv", index=False)
    return df


# 執行程式

In [9]:
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor()
])
img_dir = "C:/Users/james/Desktop/DL_Report1/image"
hyper_configs = [(64, 32), (64, 64), (64, 128), (128, 32), (128, 64), (128, 128)]
df_flex = run_flexible_dynamicconv_combinations(img_dir, transform, hyper_configs)
df_base = pd.DataFrame([run_baseline(img_dir, transform)])
df_all = pd.concat([df_flex, df_base], ignore_index=True)
df_all.to_csv("final_comparison_result.csv", index=False)
print(df_all)


Training: hidden_dim=64, out_channels=32


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:35<00:00,  3.19it/s, loss=3.8102]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.24it/s, loss=3.6991]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:26<00:00,  3.38it/s, loss=3.7186]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:26<00:00,  3.38it/s, loss=3.8359]



Training: hidden_dim=64, out_channels=64


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:35<00:00,  3.19it/s, loss=3.8109]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:34<00:00,  3.20it/s, loss=3.7933]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.24it/s, loss=3.8117]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.24it/s, loss=3.8286]



Training: hidden_dim=64, out_channels=128


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:29<00:00,  3.32it/s, loss=3.7994]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.24it/s, loss=3.7479]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:29<00:00,  3.30it/s, loss=3.9405]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:33<00:00,  3.23it/s, loss=3.6185]



Training: hidden_dim=128, out_channels=32


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:24<00:00,  3.43it/s, loss=3.8196]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:25<00:00,  3.41it/s, loss=3.8034]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:28<00:00,  3.32it/s, loss=3.8102]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:29<00:00,  3.31it/s, loss=3.7134]



Training: hidden_dim=128, out_channels=64


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:33<00:00,  3.22it/s, loss=3.7998]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:31<00:00,  3.26it/s, loss=3.7791]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.25it/s, loss=3.7711]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:32<00:00,  3.25it/s, loss=3.7510]



Training: hidden_dim=128, out_channels=128


Train Epoch 1: 100%|████████████████████████████████████████████████████| 495/495 [02:34<00:00,  3.20it/s, loss=3.8955]
Train Epoch 2: 100%|████████████████████████████████████████████████████| 495/495 [02:34<00:00,  3.21it/s, loss=3.7676]
Train Epoch 3: 100%|████████████████████████████████████████████████████| 495/495 [02:31<00:00,  3.27it/s, loss=3.6875]
Train Epoch 4: 100%|████████████████████████████████████████████████████| 495/495 [02:31<00:00,  3.27it/s, loss=3.8391]


⇒ Training Static Baseline (RGB only)


Baseline Epoch 1: 100%|██████████████████████████████████████████████████████████████| 495/495 [01:48<00:00,  4.56it/s]
Baseline Epoch 2: 100%|██████████████████████████████████████████████████████████████| 495/495 [01:47<00:00,  4.59it/s]
Baseline Epoch 3: 100%|██████████████████████████████████████████████████████████████| 495/495 [01:47<00:00,  4.60it/s]
Baseline Epoch 4: 100%|██████████████████████████████████████████████████████████████| 495/495 [01:46<00:00,  4.63it/s]


   combo hidden_dim out_channels           model   val_acc  test_acc  \
0    RGB         64           32     DynamicConv  4.444444  5.555556   
1     RG         64           32     DynamicConv  5.777778  5.333333   
2     RB         64           32     DynamicConv  6.444444  4.000000   
3     GB         64           32     DynamicConv  4.666667  5.111111   
4      R         64           32     DynamicConv  4.222222  4.000000   
5      G         64           32     DynamicConv  5.111111  4.444444   
6      B         64           32     DynamicConv  5.333333  5.111111   
7    RGB         64           64     DynamicConv  5.777778  6.222222   
8     RG         64           64     DynamicConv  5.777778  6.444444   
9     RB         64           64     DynamicConv  7.111111  4.666667   
10    GB         64           64     DynamicConv  4.000000  6.000000   
11     R         64           64     DynamicConv  5.555556  5.333333   
12     G         64           64     DynamicConv  4.444444  5.55