In [1]:
!pip install -q wandb
import wandb
wandb.login(key='66e08c83e7351f9e1b1030e876e1e37674030b28')

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mmayankshivhare45[0m ([33mmayank17[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [2]:
import os
import cv2
import numpy as np
import torch
import csv
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms, models
from torch.utils.data import Dataset, DataLoader, random_split
from torchmetrics.segmentation import MeanIoU
import matplotlib.pyplot as plt
from tqdm import tqdm
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
import torchvision.transforms as transforms
import os
import cv2 # pip install opencv-python
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm # Use tqdm.notebook for Jupyter/Colab, or just tqdm

In [3]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# --- Block Definitions (Must be defined before UNet class) ---

class EncoderBlock(nn.Module):
    """U-Net encoder stage: two (3x3 conv -> BatchNorm -> ReLU) units, then 2x2 max-pool.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by both convolutions.

    ``forward`` returns ``(pooled, features)``: the max-pooled tensor that feeds the
    next stage and the pre-pooling features that serve as the skip connection.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        stages = []
        # The first conv maps in_channels -> out_channels, the second keeps out_channels.
        for stage_in in (in_channels, out_channels):
            stages.extend([
                # bias is dropped because the following BatchNorm has its own shift term
                nn.Conv2d(stage_in, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ])
        self.conv = nn.Sequential(*stages)
        self.pool = nn.MaxPool2d(2)

    def forward(self, x):
        features = self.conv(x)
        # (down-sampled output, full-resolution features for the skip connection)
        return self.pool(features), features

class DecoderBlock(nn.Module):
    """U-Net decoder stage: transposed-conv up-sampling, skip concatenation, double conv.

    Args:
        in_channels: Channel count of the tensor coming from the previous (deeper) stage.
        out_channels: Channel count after up-sampling and after the convolutions; the
            skip tensor is concatenated on the channel axis, so the first convolution
            is built for ``2 * out_channels`` inputs.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        # kernel_size=2 / stride=2 doubles the spatial resolution.
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        merged_channels = out_channels * 2  # up-sampled map + skip connection
        self.conv = nn.Sequential(
            nn.Conv2d(merged_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x, skip):
        upsampled = self.up(x)
        if upsampled.shape != skip.shape:
            # Centre-crop the skip tensor down to the up-sampled size; an odd
            # difference removes the extra row/column from the bottom/right edge.
            skip_h, skip_w = skip.size()[2], skip.size()[3]
            extra_h = skip_h - upsampled.size()[2]
            extra_w = skip_w - upsampled.size()[3]
            top, left = extra_h // 2, extra_w // 2
            bottom = skip_h - top - (extra_h % 2)
            right = skip_w - left - (extra_w % 2)
            skip = skip[:, :, top:bottom, left:right]

        merged = torch.cat([upsampled, skip], dim=1)  # stack along channels
        return self.conv(merged)

class DecoderBlockNoSkip(nn.Module):
    """Decoder stage for the skip-free U-Net: up-sample, then refine with two convs.

    Args:
        in_channels: Channel count of the incoming tensor.
        out_channels: Channel count after the transposed convolution and after each
            3x3 convolution (nothing is concatenated, so the width never doubles).
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        refine = []
        for _ in range(2):
            # Both units map out_channels -> out_channels.
            refine += [
                nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
                nn.BatchNorm2d(out_channels),
                nn.ReLU(inplace=True),
            ]
        self.conv = nn.Sequential(*refine)

    def forward(self, x):
        upsampled = self.up(x)
        return self.conv(upsampled)

class ResidualBlock(nn.Module):
    """Two-convolution residual block with an optionally scaled shortcut.

    Computes ``ReLU(BN(conv(ReLU(BN(conv(x))))) + alpha * shortcut(x))``. The final
    ReLU is applied *after* the shortcut has been added.

    Args:
        in_channels: Channel count of the input.
        out_channels: Channel count of the output.
        alpha: Multiplier applied to the shortcut branch before the addition.

    When ``in_channels != out_channels`` the shortcut is a 1x1 convolution followed
    by BatchNorm; otherwise the input itself is used.
    """

    def __init__(self, in_channels, out_channels, alpha=1.0):
        super().__init__()
        self.use_projection = in_channels != out_channels
        self.alpha = alpha

        # Main (convolutional) branch.
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)

        # Shortcut branch: only materialised when the channel counts differ.
        if self.use_projection:
            self.projection = nn.Sequential(
                nn.Conv2d(in_channels, out_channels, 1, bias=False),
                nn.BatchNorm2d(out_channels),
            )

    def forward(self, x):
        # Main branch (no activation after the second BatchNorm yet).
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        # Shortcut branch.
        shortcut = self.projection(x) if self.use_projection else x

        # Merge, then activate.
        out += shortcut * self.alpha
        return self.relu(out)

class ResidualEncoderBlock(nn.Module):
    """Encoder stage of the residual U-Net: a ``ResidualBlock`` followed by 2x2 max-pool.

    Args:
        in_channels: Channel count of the input.
        out_channels: Channel count produced by the residual block.
        alpha: Shortcut scaling factor forwarded to ``ResidualBlock``.

    ``forward`` returns ``(pooled, features)`` where ``features`` is the residual
    block output before pooling (used as the skip connection).
    """

    def __init__(self, in_channels, out_channels, alpha=1.0):
        super().__init__()
        self.resblock = ResidualBlock(in_channels, out_channels, alpha)
        self.pool = nn.MaxPool2d(2)

    def forward(self, x):
        features = self.resblock(x)
        return self.pool(features), features

class ResidualDecoderBlock(nn.Module):
    """Decoder stage of the residual U-Net: up-sample, concatenate skip, ``ResidualBlock``.

    Args:
        in_channels: Channel count of the tensor from the previous (deeper) stage.
        out_channels: Channel count after up-sampling and of the block output. The
            residual block receives ``2 * out_channels`` channels (up-sampled + skip).
        alpha: Shortcut scaling factor forwarded to ``ResidualBlock``.
    """

    def __init__(self, in_channels, out_channels, alpha=1.0):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        self.resblock = ResidualBlock(out_channels * 2, out_channels, alpha)

    def forward(self, x, skip):
        upsampled = self.up(x)
        if upsampled.shape != skip.shape:
            # Centre-crop the skip tensor to the up-sampled size; an odd difference
            # drops the extra row/column from the bottom/right edge.
            skip_h, skip_w = skip.size()[2], skip.size()[3]
            extra_h = skip_h - upsampled.size()[2]
            extra_w = skip_w - upsampled.size()[3]
            top, left = extra_h // 2, extra_w // 2
            bottom = skip_h - top - (extra_h % 2)
            right = skip_w - left - (extra_w % 2)
            skip = skip[:, :, top:bottom, left:right]

        merged = torch.cat([upsampled, skip], dim=1)
        return self.resblock(merged)

class AttentionGate(nn.Module):
    """Additive attention gate (Attention U-Net, https://arxiv.org/abs/1804.03999).

    Args:
        F_g: Channel count of the gating signal ``g``.
        F_l: Channel count of the skip-connection features ``x``.
        F_int: Channel count of the intermediate space both inputs are projected to.

    ``forward(g, x)`` projects both inputs with 1x1 conv + BatchNorm, sums them,
    applies ReLU, reduces the result to a single-channel sigmoid map and returns
    ``x`` multiplied by that map (broadcast over the channels of ``x``). No extra
    scaling factor is applied to the gated output.
    """

    def __init__(self, F_g, F_l, F_int):
        super().__init__()

        def pointwise(src_channels, dst_channels):
            # 1x1 convolution (with bias) followed by BatchNorm.
            return [
                nn.Conv2d(src_channels, dst_channels, kernel_size=1, stride=1, padding=0, bias=True),
                nn.BatchNorm2d(dst_channels),
            ]

        self.W_g = nn.Sequential(*pointwise(F_g, F_int))  # gating-signal projection
        self.W_x = nn.Sequential(*pointwise(F_l, F_int))  # skip-connection projection
        # Collapses the fused features to one attention coefficient per pixel.
        self.psi = nn.Sequential(*pointwise(F_int, 1), nn.Sigmoid())
        self.relu = nn.ReLU(inplace=True)

    def forward(self, g, x):
        # g: gating signal; x: skip features. Both must share batch and spatial size.
        fused = self.relu(self.W_g(g) + self.W_x(x))
        attention = self.psi(fused)  # one channel, values in (0, 1)
        return x * attention

class AttnDecoderBlock(nn.Module):
    """Decoder stage that filters the skip connection through an ``AttentionGate``.

    Args:
        in_channels: Channel count of the tensor from the previous (deeper) stage.
        out_channels: Channel count after up-sampling, of the skip connection, and of
            the block output.
        attn_channels: Intermediate channel count (``F_int``) of the attention gate.

    Raises:
        ValueError: from ``forward`` if the skip tensor still differs in shape from
            the up-sampled tensor after centre-cropping.
    """

    def __init__(self, in_channels, out_channels, attn_channels):
        super().__init__()
        self.up = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        # Gating signal (output of self.up) and skip connection both carry out_channels.
        self.attn_gate = AttentionGate(F_g=out_channels, F_l=out_channels, F_int=attn_channels)
        merged_channels = out_channels * 2  # up-sampled map + attended skip
        self.conv = nn.Sequential(
            nn.Conv2d(merged_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, 3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x, skip):
        # The up-sampled decoder features double as the gating signal.
        gating = self.up(x)

        if gating.shape != skip.shape:
            # Centre-crop the skip tensor to the gating signal's spatial size; an odd
            # difference drops the extra row/column from the bottom/right edge.
            skip_h, skip_w = skip.size()[2], skip.size()[3]
            extra_h = skip_h - gating.size()[2]
            extra_w = skip_w - gating.size()[3]
            top, left = extra_h // 2, extra_w // 2
            skip = skip[:, :, top : skip_h - top - (extra_h % 2),
                        left : skip_w - left - (extra_w % 2)]
            # Cropping cannot repair e.g. a channel mismatch or a skip smaller than g.
            if gating.shape != skip.shape:
                raise ValueError(f"Shape mismatch after crop: g={gating.shape}, skip_cropped={skip.shape}")

        attended_skip = self.attn_gate(g=gating, x=skip)
        merged = torch.cat([gating, attended_skip], dim=1)
        return self.conv(merged)


# --- Unified U-Net Class ---

class UNet(nn.Module):
    """Four-level U-Net with selectable building blocks.

    Args:
        num_classes: Number of output channels of the final 1x1 convolution.
        variant: One of ``"vanilla"`` (plain conv blocks with skips), ``"noskip"``
            (plain conv blocks, decoder ignores the skips), ``"residual"`` (residual
            blocks with skips) or ``"attention"`` (plain encoder, attention-gated skips).
        alpha: Shortcut scaling passed to the residual blocks; stored but otherwise
            unused by the other variants.

    The encoder widths are 3 -> 64 -> 128 -> 256 -> 512, the bottleneck is
    512 -> 1024 -> 1024 and the decoder mirrors the encoder back down to 64 channels.
    """

    def __init__(self, num_classes=13, variant="vanilla", alpha=1.0):
        super().__init__()
        assert variant in ["vanilla", "noskip", "residual", "attention"], f"Invalid variant: {variant}"

        self.variant = variant
        self.alpha = alpha

        def plain_bottleneck(in_ch, mid_ch, out_ch):
            # Two conv -> BatchNorm -> ReLU units, no pooling.
            return nn.Sequential(
                nn.Conv2d(in_ch, mid_ch, 3, padding=1, bias=False), nn.BatchNorm2d(mid_ch), nn.ReLU(inplace=True),
                nn.Conv2d(mid_ch, out_ch, 3, padding=1, bias=False), nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True),
            )

        def residual_bottleneck(in_ch, mid_ch, out_ch):
            return nn.Sequential(
                ResidualBlock(in_ch, mid_ch, alpha),
                ResidualBlock(mid_ch, out_ch, alpha),
            )

        def residual_encoder(in_ch, out_ch):
            return ResidualEncoderBlock(in_ch, out_ch, alpha)

        def residual_decoder(in_ch, out_ch):
            return ResidualDecoderBlock(in_ch, out_ch, alpha)

        def attention_decoder(in_ch, out_ch):
            # The gate's intermediate width is half the skip-connection width.
            return AttnDecoderBlock(in_ch, out_ch, attn_channels=out_ch // 2)

        # --- Pick the factories for the requested variant ---
        if variant == "residual":
            make_encoder, make_decoder, make_bottleneck = residual_encoder, residual_decoder, residual_bottleneck
        elif variant == "attention":
            make_encoder, make_decoder, make_bottleneck = EncoderBlock, attention_decoder, plain_bottleneck
        elif variant == "vanilla":
            make_encoder, make_decoder, make_bottleneck = EncoderBlock, DecoderBlock, plain_bottleneck
        else:  # "noskip"
            make_encoder, make_decoder, make_bottleneck = EncoderBlock, DecoderBlockNoSkip, plain_bottleneck

        # --- Build the layers (registered as enc1..enc4, bottleneck, dec1..dec4) ---
        encoder_widths = [(3, 64), (64, 128), (128, 256), (256, 512)]
        for level, (in_ch, out_ch) in enumerate(encoder_widths, start=1):
            setattr(self, f"enc{level}", make_encoder(in_ch, out_ch))

        self.bottleneck = make_bottleneck(512, 1024, 1024)

        decoder_widths = [(1024, 512), (512, 256), (256, 128), (128, 64)]
        for level, (in_ch, out_ch) in enumerate(decoder_widths, start=1):
            setattr(self, f"dec{level}", make_decoder(in_ch, out_ch))

        # 1x1 convolution mapping the 64 decoder channels to per-class scores.
        self.final_conv = nn.Conv2d(64, num_classes, kernel_size=1)

    def forward(self, x):
        # Encoder: remember each stage's pre-pooling features.
        skips = []
        for encoder in (self.enc1, self.enc2, self.enc3, self.enc4):
            x, skip = encoder(x)
            skips.append(skip)

        x = self.bottleneck(x)

        # Decoder: deepest skip first.
        decoders = (self.dec1, self.dec2, self.dec3, self.dec4)
        if self.variant in ["vanilla", "residual", "attention"]:
            for decoder, skip in zip(decoders, reversed(skips)):
                x = decoder(x, skip)
        elif self.variant == "noskip":
            for decoder in decoders:
                x = decoder(x)

        return self.final_conv(x)

# --- Instantiate Models ---

# Vanilla U-Net
model_vanilla = UNet(variant="vanilla")

# U-Net without skip connections
model_noskip = UNet(variant="noskip")

# Residual U-Net (alpha scales the shortcut branch of every residual block)
model_residual = UNet(variant="residual", alpha=0.7)

# Attention U-Net (alpha is stored but not used by the attention blocks)
model_attention = UNet(variant="attention")

print("--------------------------------------- VANILLA U-NET -------------------------------------------")
print(model_vanilla)
print("\n--------------------------------------- NO-SKIP U-NET -------------------------------------------")
print(model_noskip)
print("\n--------------------------------------- RESIDUAL U-NET ------------------------------------------")
print(model_residual)
print("\n--------------------------------------- ATTENTION U-NET -----------------------------------------")
print(model_attention)

# --- Example Input and Forward Pass (Optional) ---
print("\n--- Testing Forward Pass with Dummy Input (Batch Size 2, Height/Width 256) ---")
dummy_input = torch.randn(2, 3, 256, 256) # Example: Batch size 2, 3 channels, 256x256 image


def _print_output_shape(label, net, sample):
    """Print the output shape of ``net(sample)`` without side effects on ``net``.

    The forward pass runs in eval mode so the BatchNorm running statistics are not
    overwritten with the statistics of random noise, and under ``torch.no_grad()``
    so no autograd graph is built just to read a shape. The previous train/eval
    mode is restored afterwards.
    """
    was_training = net.training
    net.eval()
    try:
        with torch.no_grad():
            output_shape = net(sample).shape
    finally:
        net.train(was_training)
    print(f"{label} Output Shape:", output_shape)


_print_output_shape("Vanilla", model_vanilla, dummy_input)
_print_output_shape("No-Skip", model_noskip, dummy_input)
_print_output_shape("Residual", model_residual, dummy_input)
_print_output_shape("Attention", model_attention, dummy_input)

--------------------------------------- VANILLA U-NET -------------------------------------------
UNet(
  (enc1): EncoderBlock(
    (conv): Sequential(
      (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (5): ReLU(inplace=True)
    )
    (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (enc2): EncoderBlock(
    (conv): Sequential(
      (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU(inplace=True)
      (3): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(

In [4]:
# --- Dataset Class ---
class SegmentationDataset(Dataset):
    """Paired image / label-mask dataset read from two directories of ``.png`` files.

    Args:
        image_dir: Directory containing the input images.
        label_dir: Directory containing the masks; after sorting, every mask must
            share its base filename with the image at the same position.
        target_size: ``(width, height)`` passed to ``cv2.resize`` for both image
            and mask.
        transform: Optional callable applied to the image tensor and, re-seeded with
            the same torch seed, to the mask (as a PIL image).

    Each item is ``(img_tensor, mask_tensor)``: a float image tensor produced by
    ``transforms.ToTensor`` and an integer (``long``) mask holding the raw gray
    values of the label file (presumably class ids -- verify against the data).

    Raises:
        AssertionError: if the two directories do not contain matching files.
        FileNotFoundError: from ``__getitem__`` if OpenCV cannot read a file.
    """

    def __init__(self, image_dir, label_dir, target_size=(256, 256), transform=None):
        super().__init__()
        self.image_dir = image_dir
        self.label_dir = label_dir
        self.target_size = target_size
        self.transform = transform

        self.image_files = sorted([f for f in os.listdir(image_dir) if f.endswith('.png')])
        self.label_files = sorted([f for f in os.listdir(label_dir) if f.endswith('.png')])

        # Pairing is positional, so both listings must line up exactly.
        assert len(self.image_files) == len(self.label_files), "Mismatch between images and labels"
        base_image_files = [os.path.splitext(f)[0] for f in self.image_files]
        base_label_files = [os.path.splitext(f)[0] for f in self.label_files]
        assert base_image_files == base_label_files, f"Mismatched filenames: {self.image_files[:5]} vs {self.label_files[:5]}"

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_files[idx])
        image = cv2.imread(img_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image: {img_path}")
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)  # OpenCV loads BGR
        image = cv2.resize(image, self.target_size, interpolation=cv2.INTER_LINEAR)

        mask_path = os.path.join(self.label_dir, self.label_files[idx])
        mask = cv2.imread(mask_path, cv2.IMREAD_GRAYSCALE)
        if mask is None:
            raise FileNotFoundError(f"Could not read mask: {mask_path}")
        # Nearest-neighbour keeps label values intact (no blended in-between ids).
        mask = cv2.resize(mask, self.target_size, interpolation=cv2.INTER_NEAREST)

        img_tensor = transforms.ToTensor()(image).float()
        mask_tensor = torch.from_numpy(mask).long()

        if self.transform:
            # Re-seeding torch's global RNG before each call makes random transforms
            # draw the same parameters for the image and for the mask.
            seed = np.random.randint(2147483647)
            torch.manual_seed(seed)
            img_tensor = self.transform(img_tensor)

            mask_pil = transforms.ToPILImage()(mask_tensor.unsqueeze(0).byte())
            torch.manual_seed(seed)
            # NOTE(review): the same transform is applied to the mask, so it should
            # contain only spatial operations (flips, crops, ...); photometric ones
            # would corrupt the labels.
            mask_transformed_pil = self.transform(mask_pil)
            # Convert through NumPy instead of transforms.ToTensor(): ToTensor rescales
            # uint8 data to [0, 1], so the subsequent .long() turned every class id
            # into 0. np.array keeps the raw label values.
            mask_tensor = torch.from_numpy(np.array(mask_transformed_pil)).long()

        return img_tensor, mask_tensor

# --- mIoU Function ---
def compute_mIoU(pred, target, num_classes):
    """Mean intersection-over-union of a batch, averaged over the classes that occur.

    Args:
        pred: Per-class scores; the class axis is dimension 1 (arg-maxed here).
        target: Integer class ids, same shape as ``pred.argmax(1)``.
        num_classes: Class ids ``0 .. num_classes - 1`` are evaluated.

    Returns:
        The mean IoU as a Python float. Classes absent from both prediction and
        target are left out of the mean; if no class is present, ``0.0`` is returned.
    """
    predicted = pred.argmax(1).cpu()
    expected = target.cpu()

    class_ious = []
    for class_id in range(num_classes):
        predicted_mask = predicted == class_id
        expected_mask = expected == class_id
        union = (predicted_mask | expected_mask).sum().item()
        if union == 0:
            # Class does not occur at all: excluded from the average.
            continue
        overlap = (predicted_mask & expected_mask).sum().item()
        class_ious.append(overlap / union)

    if not class_ious:
        return 0.0
    return sum(class_ious) / len(class_ious)


# --- Plotting Function (Kept for local plotting if needed) ---
def plot_metrics(train_losses, val_losses, train_mious, val_mious, title_prefix=""):
    """Draw loss and mIoU curves side by side with matplotlib and show the figure.

    Args:
        train_losses, val_losses: Per-epoch loss values (left panel).
        train_mious, val_mious: Per-epoch mIoU values (right panel).
        title_prefix: Text placed in front of both panel titles.

    Epochs are numbered from 1 to ``len(train_losses)``. W&B already charts these
    metrics; this is only a convenience for quick local inspection.
    """
    epochs = range(1, len(train_losses) + 1)
    panels = (
        ("Loss", train_losses, val_losses),
        ("mIoU", train_mious, val_mious),
    )

    plt.figure(figsize=(12, 5))
    for position, (metric, train_values, val_values) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        plt.plot(epochs, train_values, 'bo-', label=f'Training {metric}')
        plt.plot(epochs, val_values, 'ro-', label=f'Validation {metric}')
        plt.title(f'{title_prefix} Training and Validation {metric}')
        plt.xlabel('Epochs')
        plt.ylabel(metric)
        plt.legend()
        plt.grid(True)
    plt.tight_layout()
    plt.show()

# --- Visualization Function (Adapted for W&B) ---
def log_predictions_to_wandb(model, dataloader, device, num_samples=5, num_classes=13, epoch=None):
    """Log input / ground-truth / prediction image triplets to the active W&B run.

    Args:
        model: Network returning per-class scores with the class axis at dimension 1.
        dataloader: Yields ``(images, masks_true)`` batches.
        device: Device the batches are moved to before the forward pass.
        num_samples: Maximum number of samples (triplets) to log.
        num_classes: Number of class ids to colourise; other mask values stay black.
        epoch: Used in the image captions and as the W&B step; when ``None`` the
            run's current step is used.

    Side effects: switches ``model`` to eval mode (it is not switched back) and
    calls ``wandb.log`` under the key ``"test_predictions"`` if any image was
    collected.
    """
    model.eval()
    samples_shown = 0
    # `plt.cm.get_cmap` was deprecated in Matplotlib 3.7 and removed in 3.9;
    # `plt.get_cmap(name, lut)` returns the same resampled colormap and still exists.
    colors = plt.get_cmap('tab20', num_classes)
    # RGB byte colour per class id, computed once instead of once per mask.
    palette = [(np.array(colors(cls)[:3]) * 255).astype(np.uint8) for cls in range(num_classes)]
    log_data = [] # wandb.Image objects, three per sample

    def colorize_mask(mask_tensor):
        # Map an (H, W) class-id mask to an (H, W, 3) uint8 colour image.
        mask_np = mask_tensor.cpu().numpy().astype(np.uint8)
        colored_mask = np.zeros((*mask_np.shape, 3), dtype=np.uint8)
        for cls, color in enumerate(palette):
            colored_mask[mask_np == cls] = color
        return colored_mask

    with torch.no_grad():
        for images, masks_true in dataloader:
            if samples_shown >= num_samples:
                break
            images = images.to(device)
            masks_true = masks_true.to(device)

            masks_pred = model(images).argmax(1)

            for i in range(images.size(0)):
                if samples_shown >= num_samples:
                    break

                # CHW float -> HWC uint8. Assumes pixel values in [0, 1]; clipping
                # keeps out-of-range values from wrapping around in the uint8 cast.
                img_np = images[i].cpu().permute(1, 2, 0).numpy()
                img_np = np.clip(img_np * 255, 0, 255).astype(np.uint8)

                mask_true_color = colorize_mask(masks_true[i])
                mask_pred_color = colorize_mask(masks_pred[i])

                log_data.append(wandb.Image(img_np, caption=f"Epoch_{epoch}_Input_{samples_shown}"))
                log_data.append(wandb.Image(mask_true_color, caption=f"Epoch_{epoch}_TrueMask_{samples_shown}"))
                log_data.append(wandb.Image(mask_pred_color, caption=f"Epoch_{epoch}_PredMask_{samples_shown}"))

                samples_shown += 1

    # Log the collected images, attached to the epoch's step when one was given.
    if log_data:
         wandb.log({"test_predictions": log_data}, step=epoch if epoch is not None else wandb.run.step)

# --- Training Loop Function (Adapted for W&B) ---
def train_model(model, train_loader, val_loader, criterion, optimizer, num_epochs, device, num_classes, log_preds_freq=10):
    """Train ``model`` for ``num_epochs`` epochs, validating and logging to W&B each epoch.

    Args:
        model: Network to optimise; assumed to already live on ``device`` -- TODO confirm.
        train_loader, val_loader: Yield ``(images, masks)`` batches.
        criterion: Loss called as ``criterion(outputs, masks)``. The running totals
            multiply the loss by the batch size, which assumes a mean-reduced loss.
        optimizer: Optimiser stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.
        device: Device each batch is moved to.
        num_classes: Forwarded to ``compute_mIoU`` and the prediction logger.
        log_preds_freq: Sample predictions are logged every this many epochs and
            after the final epoch.

    Returns:
        ``(model, history)`` where ``history`` maps ``'train_loss'``, ``'val_loss'``,
        ``'train_miou'`` and ``'val_miou'`` to per-epoch lists.

    Epoch averages are taken over the samples actually processed, so they stay
    correct when the loader drops the last batch or draws from a subset sampler.
    Requires an active W&B run.
    """
    # Track gradients and parameters every 100 steps.
    wandb.watch(model, criterion, log="all", log_freq=100)

    best_val_miou = -1.0
    history = {'train_loss': [], 'val_loss': [], 'train_miou': [], 'val_miou': []} # For local plotting if needed

    for epoch in range(num_epochs):
        # --- Training Phase ---
        model.train()
        running_loss = 0.0
        running_miou = 0.0
        train_samples = 0

        pbar_train = tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Train]")
        for images, masks in pbar_train:
            images, masks = images.to(device), masks.to(device)
            batch_size = images.size(0)

            optimizer.zero_grad()
            outputs = model(images)
            loss = criterion(outputs, masks)
            loss.backward()
            optimizer.step()

            running_loss += loss.item() * batch_size
            # The metric needs no gradient: detach so no graph is kept alive for it.
            batch_miou = compute_mIoU(outputs.detach(), masks, num_classes)
            running_miou += batch_miou * batch_size
            train_samples += batch_size
            pbar_train.set_postfix({'Loss': running_loss/train_samples, 'mIoU': running_miou/train_samples})

        # Normalise by the samples seen (max(..., 1) guards an empty loader).
        epoch_train_loss = running_loss / max(train_samples, 1)
        epoch_train_miou = running_miou / max(train_samples, 1)
        history['train_loss'].append(epoch_train_loss)
        history['train_miou'].append(epoch_train_miou)

        # --- Validation Phase ---
        model.eval()
        running_val_loss = 0.0
        running_val_miou = 0.0
        val_samples = 0

        pbar_val = tqdm(val_loader, desc=f"Epoch {epoch+1}/{num_epochs} [Val]")
        with torch.no_grad():
            for images, masks in pbar_val:
                images, masks = images.to(device), masks.to(device)
                batch_size = images.size(0)
                outputs = model(images)
                loss = criterion(outputs, masks)
                running_val_loss += loss.item() * batch_size
                batch_miou = compute_mIoU(outputs, masks, num_classes)
                running_val_miou += batch_miou * batch_size
                val_samples += batch_size
                pbar_val.set_postfix({'Loss': running_val_loss/val_samples, 'mIoU': running_val_miou/val_samples})

        epoch_val_loss = running_val_loss / max(val_samples, 1)
        epoch_val_miou = running_val_miou / max(val_samples, 1)
        history['val_loss'].append(epoch_val_loss)
        history['val_miou'].append(epoch_val_miou)

        print(f"Epoch {epoch+1}/{num_epochs} => "
              f"Train Loss: {epoch_train_loss:.4f}, Train mIoU: {epoch_train_miou:.4f} | "
              f"Val Loss: {epoch_val_loss:.4f}, Val mIoU: {epoch_val_miou:.4f}")

        # --- W&B Logging (the 1-based epoch number is the step) ---
        wandb.log({
            "epoch": epoch + 1,
            "train_loss": epoch_train_loss,
            "train_miou": epoch_train_miou,
            "val_loss": epoch_val_loss,
            "val_miou": epoch_val_miou
        }, step=epoch + 1)

        # Log sample predictions periodically and always after the last epoch.
        if (epoch + 1) % log_preds_freq == 0 or epoch == num_epochs - 1:
             print(f"Logging predictions to W&B for epoch {epoch+1}...")
             log_predictions_to_wandb(model, val_loader, device, num_samples=5, num_classes=num_classes, epoch=epoch+1)

        if epoch_val_miou > best_val_miou:
            best_val_miou = epoch_val_miou
            # No checkpoint is written here; only the best score is recorded in the
            # run summary. (A torch.save + wandb.Artifact upload would go here.)
            wandb.run.summary["best_val_miou"] = best_val_miou

    return model, history # history allows local plotting

# --- Evaluation Function (Adapted for W&B) ---
def evaluate_model(model, test_loader, criterion, device, num_classes):
    """Evaluate ``model`` on ``test_loader`` and report loss and mIoU.

    Puts the model in eval mode, runs every batch under ``torch.no_grad()``,
    and accumulates the sample-weighted loss and mIoU. The averages are
    printed and, when a W&B run is active, written to its summary.

    Args:
        model: Network invoked as ``model(images)``.
        test_loader: Iterable yielding ``(images, masks)`` batches.
        criterion: Loss callable invoked as ``criterion(outputs, masks)``.
        device: Device the batches are moved to before the forward pass.
        num_classes: Forwarded unchanged to ``compute_mIoU``.

    Returns:
        Tuple ``(test_loss, test_miou)`` averaged over the samples seen.

    Raises:
        ValueError: If the loader yields no samples.
    """
    model.eval()
    running_test_loss = 0.0
    running_test_miou = 0.0
    test_samples = 0

    pbar_test = tqdm(test_loader, desc="Testing")
    with torch.no_grad():
        for images, masks in pbar_test:
            images, masks = images.to(device), masks.to(device)
            outputs = model(images)
            loss = criterion(outputs, masks)

            # Weight per-batch means by batch size so a smaller final batch
            # does not skew the overall average.
            batch_size = images.size(0)
            running_test_loss += loss.item() * batch_size
            batch_miou = compute_mIoU(outputs, masks, num_classes)
            running_test_miou += batch_miou * batch_size
            test_samples += batch_size
            pbar_test.set_postfix({'Loss': running_test_loss/test_samples, 'mIoU': running_test_miou/test_samples})

    if test_samples == 0:
        raise ValueError("evaluate_model: test_loader yielded no samples")

    # Normalise by the number of samples actually iterated rather than
    # len(test_loader.dataset): the two differ when the loader drops the last
    # batch or uses a subsetting sampler, and some datasets define no length.
    test_loss = running_test_loss / test_samples
    test_miou = running_test_miou / test_samples

    print("\n--- Test Set Evaluation ---")
    print(f"Test Loss: {test_loss:.4f}")
    print(f"Test mIoU: {test_miou:.4f}")
    print("---------------------------\n")

    # --- W&B Logging ---
    # Record the final test metrics in the run summary. Skipped when no run
    # is active so the function can also be used without W&B tracking.
    if wandb.run is not None:
        wandb.run.summary["test_loss"] = test_loss
        wandb.run.summary["test_miou"] = test_miou

    return test_loss, test_miou

In [5]:
if __name__ == "__main__":
    # Driver cell: builds the train/val/test loaders once, then trains and
    # evaluates each U-Net variant in its own W&B run and prints a summary of
    # the test mIoU per variant.

    # --- Configuration ---
    DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {DEVICE}")

    # --- W&B Configuration ---
    WANDB_PROJECT = "unet-segmentation-variants" # CHANGE THIS to your project name
    WANDB_ENTITY = None # Set to your W&B username or team name, or leave as None

    # --- IMPORTANT: SET YOUR DATA PATHS HERE ---
    TRAIN_DIR = "/kaggle/input/seg-q3/dataset_256/train" # Contains 'images' and 'labels' subfolders
    TEST_DIR = "/kaggle/input/seg-q3/dataset_256/test"   # Contains 'images' and 'labels' subfolders
    # ---

    NUM_CLASSES = 13
    IMG_SIZE = (256, 256)
    BATCH_SIZE = 8
    LEARNING_RATE = 1e-4
    NUM_EPOCHS = 50 # Min 50 epochs
    VALIDATION_SPLIT = 0.15
    RANDOM_SEED = 42
    LOG_PREDS_FREQ = 10 # Log prediction images every 10 epochs

    torch.manual_seed(RANDOM_SEED)
    np.random.seed(RANDOM_SEED)
    # DEVICE is a torch.device object, so it has to be inspected through
    # `.type`; comparing the object itself against the string 'cuda' never
    # matches and would leave this branch unreachable.
    if DEVICE.type == 'cuda':
        torch.cuda.manual_seed_all(RANDOM_SEED)

    # --- Base Hyperparameter Config for W&B ---
    base_config = {
        "learning_rate": LEARNING_RATE,
        "batch_size": BATCH_SIZE,
        "num_epochs": NUM_EPOCHS,
        "validation_split": VALIDATION_SPLIT,
        "image_size": IMG_SIZE[0],
        "num_classes": NUM_CLASSES,
        "seed": RANDOM_SEED,
        "optimizer": "Adam",
        "loss_function": "CrossEntropyLoss"
    }


    # --- Datasets and DataLoaders ---
    base_transform = None # Add augmentations here if needed

    full_train_dataset = SegmentationDataset(
        image_dir=os.path.join(TRAIN_DIR, 'images'),
        label_dir=os.path.join(TRAIN_DIR, 'labels'),
        target_size=IMG_SIZE,
        transform=base_transform
    )
    # Hold out a fixed fraction of the training set for validation; the
    # dedicated generator keeps the split identical across runs.
    num_train = len(full_train_dataset)
    num_val = int(np.floor(VALIDATION_SPLIT * num_train))
    num_train_split = num_train - num_val
    train_dataset, val_dataset = random_split(
        full_train_dataset, [num_train_split, num_val],
        generator=torch.Generator().manual_seed(RANDOM_SEED)
    )
    print(f"Split: {len(train_dataset)} training, {len(val_dataset)} validation")

    test_dataset = SegmentationDataset(
        image_dir=os.path.join(TEST_DIR, 'images'),
        label_dir=os.path.join(TEST_DIR, 'labels'),
        target_size=IMG_SIZE,
        transform=None
    )
    print(f"Test set size: {len(test_dataset)}")

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2, pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

    # --- Model Training Loop ---
    # Maps a display name to the keyword arguments passed to UNet(...).
    model_variants = {
        "vanilla": {"variant": "vanilla"},
        "noskip": {"variant": "noskip"},
        "residual": {"variant": "residual"},
        "attention": {"variant": "attention"}
    }
    results = {}

    for name, params in model_variants.items():
        print(f"\n===== Training U-Net ({name}) =====")

        # --- Initialize W&B Run ---
        run_config = base_config.copy()
        run_config.update(params) # Add variant-specific params (the "variant" key)
        run_name = f"unet-{name}-lr{LEARNING_RATE}-bs{BATCH_SIZE}-ep{NUM_EPOCHS}"

        run = wandb.init(
            project=WANDB_PROJECT,
            entity=WANDB_ENTITY,
            name=run_name,
            config=run_config,
            reinit=True # Allow reinitializing within the same script
        )

        # Assume failure until the variant completes, so an exception marks
        # the run as failed instead of leaving it open for the next variant.
        exit_code = 1
        try:
            # Instantiate Model
            model = UNet(num_classes=NUM_CLASSES, **params).to(DEVICE)
            print(f"Model Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")

            criterion = nn.CrossEntropyLoss()
            optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

            # Train the model
            trained_model, history = train_model(
                model, train_loader, val_loader, criterion, optimizer, NUM_EPOCHS, DEVICE, NUM_CLASSES, LOG_PREDS_FREQ
            )

            # Evaluate on Test Set
            test_loss, test_miou = evaluate_model(
                trained_model, test_loader, criterion, DEVICE, NUM_CLASSES
            )
            results[name] = test_miou

            # Log final predictions from test set to W&B
            print(f"--- Logging final Test Predictions for U-Net ({name}) ---")
            log_predictions_to_wandb(trained_model, test_loader, DEVICE, num_samples=10, num_classes=NUM_CLASSES, epoch=NUM_EPOCHS) # Log more test samples

            exit_code = 0
        finally:
            # --- Finish W&B Run ---
            run.finish(exit_code=exit_code)

        # Optional: Plot locally using history if needed
        # plot_metrics(history['train_loss'], history['val_loss'], history['train_miou'], history['val_miou'], title_prefix=f"U-Net ({name})")


    # --- Final Results ---
    print("\n===== Final Test mIoU Results =====")
    for name, miou in results.items():
        print(f"U-Net ({name}): {miou:.4f}") # These results are also in the W&B run summaries
    print("===================================\n")

    # --- Discussion Placeholders ---
    print("\n--- Discussion Points (Refer to W&B plots/results) ---")
    print("3.2 U-Net without skip connections:")
    print(" - Q: What differences do you observe in the visualized results (W&B images) compared to the standard U-Net results?")
    print(" - A: [Analyze logged images in W&B - e.g., NoSkip likely shows much poorer boundary definition, smoother/blurrier outputs, possibly missing small objects compared to Vanilla.]")
    print(" - Q: Discuss the importance of skip connections using W&B metrics/plots.")
    print(" - A: [Compare mIoU curves and final test mIoU in W&B for Vanilla vs NoSkip. Expect significantly lower mIoU for NoSkip. Explain how skip connections preserve high-res spatial info lost in pooling, enabling precise localization crucial for segmentation.]")
    print("\n3.3 Residual U-Net:")
    print(" - [Compare Residual U-Net metrics (loss, mIoU curves, test mIoU) and visualizations in W&B against Vanilla U-Net. Did residual blocks improve training stability (smoother loss curves?) or final performance? Discuss the potential benefits (gradient flow) vs. added complexity/parameters.]")
    print("-------------------------\n")

Using device: cuda
Split: 1700 training, 300 validation
Test set size: 500

===== Training U-Net (vanilla) =====


Model Parameters: 31038413


Epoch 1/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 1/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1/50 => Train Loss: 1.0524, Train mIoU: 0.3553 | Val Loss: 0.6452, Val mIoU: 0.5355


Epoch 2/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 2/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 2/50 => Train Loss: 0.4729, Train mIoU: 0.5577 | Val Loss: 0.3530, Val mIoU: 0.5728


Epoch 3/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 3/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 3/50 => Train Loss: 0.2826, Train mIoU: 0.6103 | Val Loss: 0.2229, Val mIoU: 0.6875


Epoch 4/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 4/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 4/50 => Train Loss: 0.1942, Train mIoU: 0.7014 | Val Loss: 0.1722, Val mIoU: 0.7191


Epoch 5/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 5/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 5/50 => Train Loss: 0.1525, Train mIoU: 0.7323 | Val Loss: 0.1363, Val mIoU: 0.7483


Epoch 6/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 6/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 6/50 => Train Loss: 0.1277, Train mIoU: 0.7580 | Val Loss: 0.1195, Val mIoU: 0.7424


Epoch 7/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 7/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 7/50 => Train Loss: 0.0990, Train mIoU: 0.8014 | Val Loss: 0.0945, Val mIoU: 0.8062


Epoch 8/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 8/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 8/50 => Train Loss: 0.0853, Train mIoU: 0.8204 | Val Loss: 0.0971, Val mIoU: 0.8038


Epoch 9/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 9/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 9/50 => Train Loss: 0.0751, Train mIoU: 0.8388 | Val Loss: 0.0793, Val mIoU: 0.8222


Epoch 10/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 10/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 10/50 => Train Loss: 0.0657, Train mIoU: 0.8560 | Val Loss: 0.0678, Val mIoU: 0.8539
Logging predictions to W&B for epoch 10...


  colors = plt.cm.get_cmap('tab20', num_classes)


Epoch 11/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 11/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 11/50 => Train Loss: 0.0605, Train mIoU: 0.8654 | Val Loss: 0.0799, Val mIoU: 0.8218


Epoch 12/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 12/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 12/50 => Train Loss: 0.0663, Train mIoU: 0.8446 | Val Loss: 0.0596, Val mIoU: 0.8555


Epoch 13/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 21/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 21/50 => Train Loss: 0.0297, Train mIoU: 0.9270 | Val Loss: 0.0367, Val mIoU: 0.9072


Epoch 22/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 22/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 22/50 => Train Loss: 0.0279, Train mIoU: 0.9307 | Val Loss: 0.0358, Val mIoU: 0.9078


Epoch 23/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 23/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 23/50 => Train Loss: 0.0275, Train mIoU: 0.9303 | Val Loss: 0.0355, Val mIoU: 0.9064


Epoch 24/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 24/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 24/50 => Train Loss: 0.0585, Train mIoU: 0.8537 | Val Loss: 0.0394, Val mIoU: 0.8967


Epoch 25/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 25/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 25/50 => Train Loss: 0.0283, Train mIoU: 0.9277 | Val Loss: 0.0348, Val mIoU: 0.9084


Epoch 26/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 26/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 26/50 => Train Loss: 0.0250, Train mIoU: 0.9379 | Val Loss: 0.0320, Val mIoU: 0.9163


Epoch 27/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 27/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 27/50 => Train Loss: 0.0230, Train mIoU: 0.9438 | Val Loss: 0.0305, Val mIoU: 0.9203


Epoch 28/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 28/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 28/50 => Train Loss: 0.0216, Train mIoU: 0.9472 | Val Loss: 0.0305, Val mIoU: 0.9206


Epoch 29/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 29/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 29/50 => Train Loss: 0.0210, Train mIoU: 0.9484 | Val Loss: 0.0301, Val mIoU: 0.9212


Epoch 30/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 30/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 30/50 => Train Loss: 0.0201, Train mIoU: 0.9508 | Val Loss: 0.0299, Val mIoU: 0.9222
Logging predictions to W&B for epoch 30...


Epoch 31/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 31/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 31/50 => Train Loss: 0.0196, Train mIoU: 0.9525 | Val Loss: 0.0300, Val mIoU: 0.9195


Epoch 32/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 32/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 32/50 => Train Loss: 0.0191, Train mIoU: 0.9530 | Val Loss: 0.0302, Val mIoU: 0.9203


Epoch 33/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 33/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 33/50 => Train Loss: 0.0190, Train mIoU: 0.9530 | Val Loss: 0.0292, Val mIoU: 0.9237


Epoch 34/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 34/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 34/50 => Train Loss: 0.0596, Train mIoU: 0.8424 | Val Loss: 0.0391, Val mIoU: 0.8895


Epoch 35/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 35/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 35/50 => Train Loss: 0.0257, Train mIoU: 0.9319 | Val Loss: 0.0356, Val mIoU: 0.9072


Epoch 36/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 36/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 36/50 => Train Loss: 0.0215, Train mIoU: 0.9445 | Val Loss: 0.0282, Val mIoU: 0.9232


Epoch 37/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 37/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 37/50 => Train Loss: 0.0183, Train mIoU: 0.9546 | Val Loss: 0.0276, Val mIoU: 0.9268


Epoch 38/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 38/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 38/50 => Train Loss: 0.0174, Train mIoU: 0.9579 | Val Loss: 0.0274, Val mIoU: 0.9277


Epoch 39/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 39/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 39/50 => Train Loss: 0.0165, Train mIoU: 0.9606 | Val Loss: 0.0273, Val mIoU: 0.9288


Epoch 40/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 40/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 40/50 => Train Loss: 0.0160, Train mIoU: 0.9623 | Val Loss: 0.0278, Val mIoU: 0.9262
Logging predictions to W&B for epoch 40...


Epoch 41/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 41/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 41/50 => Train Loss: 0.0156, Train mIoU: 0.9631 | Val Loss: 0.0279, Val mIoU: 0.9290


Epoch 42/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 42/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 42/50 => Train Loss: 0.0150, Train mIoU: 0.9647 | Val Loss: 0.0273, Val mIoU: 0.9286


Epoch 43/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 43/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 43/50 => Train Loss: 0.0146, Train mIoU: 0.9657 | Val Loss: 0.0280, Val mIoU: 0.9273


Epoch 44/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 44/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 44/50 => Train Loss: 0.0142, Train mIoU: 0.9658 | Val Loss: 0.0287, Val mIoU: 0.9231


Epoch 45/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 45/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 45/50 => Train Loss: 0.0144, Train mIoU: 0.9651 | Val Loss: 0.0272, Val mIoU: 0.9309


Epoch 46/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 46/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 46/50 => Train Loss: 0.0139, Train mIoU: 0.9668 | Val Loss: 0.0274, Val mIoU: 0.9283


Epoch 47/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 47/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 47/50 => Train Loss: 0.0147, Train mIoU: 0.9651 | Val Loss: 0.0302, Val mIoU: 0.9241


Epoch 48/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 48/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 48/50 => Train Loss: 0.0270, Train mIoU: 0.9295 | Val Loss: 0.0798, Val mIoU: 0.7885


Epoch 49/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 49/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 49/50 => Train Loss: 0.0310, Train mIoU: 0.9130 | Val Loss: 0.0276, Val mIoU: 0.9222


Epoch 50/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 50/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 50/50 => Train Loss: 0.0155, Train mIoU: 0.9614 | Val Loss: 0.0253, Val mIoU: 0.9330
Logging predictions to W&B for epoch 50...


Testing:   0%|          | 0/63 [00:00<?, ?it/s]


--- Test Set Evaluation ---
Test Loss: 0.0264
Test mIoU: 0.9113
---------------------------

--- Logging final Test Predictions for U-Net (vanilla) ---


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇██
train_loss,█▄▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_miou,▁▃▄▅▅▆▆▇▇▇▇▇▇▇▇▇▇██▇███████▇████████████
val_loss,█▅▃▃▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁
val_miou,▁▂▄▅▅▆▆▇▆▇▇▇▇▇▇▇███▇██████▇███████████▅█

0,1
best_val_miou,0.93304
epoch,50.0
test_loss,0.02644
test_miou,0.91129
train_loss,0.0155
train_miou,0.96142
val_loss,0.02525
val_miou,0.93304



===== Training U-Net (noskip) =====


Model Parameters: 27904973


Epoch 1/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 1/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1/50 => Train Loss: 1.1493, Train mIoU: 0.2425 | Val Loss: 0.7774, Val mIoU: 0.3148


Epoch 2/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 2/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 2/50 => Train Loss: 0.5765, Train mIoU: 0.4036 | Val Loss: 0.4620, Val mIoU: 0.4298


Epoch 3/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 3/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 3/50 => Train Loss: 0.3742, Train mIoU: 0.4594 | Val Loss: 0.3032, Val mIoU: 0.5077


Epoch 4/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 4/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 4/50 => Train Loss: 0.2612, Train mIoU: 0.5600 | Val Loss: 0.2479, Val mIoU: 0.5598


Epoch 5/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 5/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 5/50 => Train Loss: 0.2067, Train mIoU: 0.6511 | Val Loss: 0.1836, Val mIoU: 0.6675


Epoch 6/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 6/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 6/50 => Train Loss: 0.1638, Train mIoU: 0.7186 | Val Loss: 0.1613, Val mIoU: 0.7088


Epoch 7/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 7/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 7/50 => Train Loss: 0.1402, Train mIoU: 0.7432 | Val Loss: 0.1392, Val mIoU: 0.7375


Epoch 8/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 8/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 8/50 => Train Loss: 0.1238, Train mIoU: 0.7674 | Val Loss: 0.1257, Val mIoU: 0.7643


Epoch 9/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 9/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 9/50 => Train Loss: 0.1118, Train mIoU: 0.7859 | Val Loss: 0.1190, Val mIoU: 0.7637


Epoch 10/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 10/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 10/50 => Train Loss: 0.1027, Train mIoU: 0.7971 | Val Loss: 0.1119, Val mIoU: 0.7733
Logging predictions to W&B for epoch 10...


Epoch 11/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 11/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 11/50 => Train Loss: 0.0948, Train mIoU: 0.8084 | Val Loss: 0.1096, Val mIoU: 0.7777


Epoch 12/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 12/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 12/50 => Train Loss: 0.1045, Train mIoU: 0.7850 | Val Loss: 0.1577, Val mIoU: 0.7088


Epoch 13/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 13/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 13/50 => Train Loss: 0.0924, Train mIoU: 0.8110 | Val Loss: 0.0999, Val mIoU: 0.7864


Epoch 14/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 14/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 14/50 => Train Loss: 0.0817, Train mIoU: 0.8324 | Val Loss: 0.0973, Val mIoU: 0.7932


Epoch 15/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 15/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 15/50 => Train Loss: 0.0777, Train mIoU: 0.8408 | Val Loss: 0.0934, Val mIoU: 0.8042


Epoch 16/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 16/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 16/50 => Train Loss: 0.0740, Train mIoU: 0.8492 | Val Loss: 0.0925, Val mIoU: 0.8073


Epoch 17/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 17/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 17/50 => Train Loss: 0.0726, Train mIoU: 0.8516 | Val Loss: 0.0919, Val mIoU: 0.8115


Epoch 18/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 18/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 18/50 => Train Loss: 0.0699, Train mIoU: 0.8559 | Val Loss: 0.0913, Val mIoU: 0.8034


Epoch 19/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 19/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 19/50 => Train Loss: 0.0672, Train mIoU: 0.8619 | Val Loss: 0.0900, Val mIoU: 0.8072


Epoch 20/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 20/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 20/50 => Train Loss: 0.0653, Train mIoU: 0.8656 | Val Loss: 0.0881, Val mIoU: 0.8180
Logging predictions to W&B for epoch 20...


Epoch 21/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 21/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 21/50 => Train Loss: 0.0634, Train mIoU: 0.8716 | Val Loss: 0.0875, Val mIoU: 0.8191


Epoch 22/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 22/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 22/50 => Train Loss: 0.0616, Train mIoU: 0.8731 | Val Loss: 0.0876, Val mIoU: 0.8200


Epoch 23/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 23/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 23/50 => Train Loss: 0.1183, Train mIoU: 0.7585 | Val Loss: 0.1001, Val mIoU: 0.7871


Epoch 24/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 24/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 24/50 => Train Loss: 0.0730, Train mIoU: 0.8490 | Val Loss: 0.0862, Val mIoU: 0.8214


Epoch 25/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 25/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 25/50 => Train Loss: 0.0614, Train mIoU: 0.8800 | Val Loss: 0.0850, Val mIoU: 0.8247


Epoch 26/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 26/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 26/50 => Train Loss: 0.0577, Train mIoU: 0.8884 | Val Loss: 0.0847, Val mIoU: 0.8285


Epoch 27/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 27/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 27/50 => Train Loss: 0.0560, Train mIoU: 0.8916 | Val Loss: 0.0842, Val mIoU: 0.8297


Epoch 28/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 28/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 28/50 => Train Loss: 0.0549, Train mIoU: 0.8912 | Val Loss: 0.0874, Val mIoU: 0.8183


Epoch 29/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 29/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 29/50 => Train Loss: 0.0534, Train mIoU: 0.8964 | Val Loss: 0.0861, Val mIoU: 0.8293


Epoch 30/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 30/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 30/50 => Train Loss: 0.0522, Train mIoU: 0.8994 | Val Loss: 0.0850, Val mIoU: 0.8287
Logging predictions to W&B for epoch 30...


Epoch 31/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 31/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 31/50 => Train Loss: 0.0514, Train mIoU: 0.9009 | Val Loss: 0.0857, Val mIoU: 0.8318


Epoch 32/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 32/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 32/50 => Train Loss: 0.0507, Train mIoU: 0.9016 | Val Loss: 0.0860, Val mIoU: 0.8261


Epoch 33/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 33/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 33/50 => Train Loss: 0.0505, Train mIoU: 0.8990 | Val Loss: 0.0879, Val mIoU: 0.8266


Epoch 34/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 34/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 34/50 => Train Loss: 0.0539, Train mIoU: 0.8936 | Val Loss: 0.0856, Val mIoU: 0.8316


Epoch 35/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 35/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 35/50 => Train Loss: 0.0487, Train mIoU: 0.9063 | Val Loss: 0.0861, Val mIoU: 0.8331


Epoch 36/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 36/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 36/50 => Train Loss: 0.0475, Train mIoU: 0.9093 | Val Loss: 0.0856, Val mIoU: 0.8365


Epoch 37/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 37/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 37/50 => Train Loss: 0.0461, Train mIoU: 0.9114 | Val Loss: 0.0864, Val mIoU: 0.8362


Epoch 38/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 38/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 38/50 => Train Loss: 0.0451, Train mIoU: 0.9147 | Val Loss: 0.0868, Val mIoU: 0.8351


Epoch 39/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 39/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 39/50 => Train Loss: 0.0445, Train mIoU: 0.9162 | Val Loss: 0.0885, Val mIoU: 0.8308


Epoch 40/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 40/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 40/50 => Train Loss: 0.0439, Train mIoU: 0.9174 | Val Loss: 0.0885, Val mIoU: 0.8290
Logging predictions to W&B for epoch 40...


Epoch 41/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 41/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 41/50 => Train Loss: 0.0439, Train mIoU: 0.9171 | Val Loss: 0.0879, Val mIoU: 0.8364


Epoch 42/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 42/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 42/50 => Train Loss: 0.0424, Train mIoU: 0.9208 | Val Loss: 0.0901, Val mIoU: 0.8292


Epoch 43/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 43/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 43/50 => Train Loss: 0.0414, Train mIoU: 0.9231 | Val Loss: 0.0882, Val mIoU: 0.8356


Epoch 44/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 44/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 44/50 => Train Loss: 0.0406, Train mIoU: 0.9261 | Val Loss: 0.0897, Val mIoU: 0.8372


Epoch 45/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 45/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 45/50 => Train Loss: 0.0400, Train mIoU: 0.9267 | Val Loss: 0.0916, Val mIoU: 0.8361


Epoch 46/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 46/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 46/50 => Train Loss: 0.0394, Train mIoU: 0.9263 | Val Loss: 0.0908, Val mIoU: 0.8353


Epoch 47/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 47/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 47/50 => Train Loss: 0.0388, Train mIoU: 0.9288 | Val Loss: 0.0905, Val mIoU: 0.8360


Epoch 48/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 48/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 48/50 => Train Loss: 0.0380, Train mIoU: 0.9327 | Val Loss: 0.0922, Val mIoU: 0.8397


Epoch 49/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 49/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 49/50 => Train Loss: 0.0370, Train mIoU: 0.9343 | Val Loss: 0.0938, Val mIoU: 0.8372


Epoch 50/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 50/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 50/50 => Train Loss: 0.0363, Train mIoU: 0.9349 | Val Loss: 0.0942, Val mIoU: 0.8380
Logging predictions to W&B for epoch 50...


Testing:   0%|          | 0/63 [00:00<?, ?it/s]


--- Test Set Evaluation ---
Test Loss: 0.0992
Test mIoU: 0.8026
---------------------------

--- Logging final Test Predictions for U-Net (noskip) ---


0,1
epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▄▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_miou,▁▃▃▄▅▆▆▆▇▆▇▇▇▇▇▇▇▇▆▇████████████████████
val_loss,█▅▃▃▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_miou,▁▃▄▄▆▇▇▇▇▇▇██████▇██████████████████████

0,1
best_val_miou,0.83971
epoch,50.0
test_loss,0.09918
test_miou,0.80264
train_loss,0.03632
train_miou,0.93488
val_loss,0.09422
val_miou,0.83795



===== Training U-Net (residual) =====


Model Parameters: 51315597


Epoch 1/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 1/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1/50 => Train Loss: 0.5875, Train mIoU: 0.3776 | Val Loss: 0.2729, Val mIoU: 0.5544


Epoch 2/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 2/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 2/50 => Train Loss: 0.2089, Train mIoU: 0.5907 | Val Loss: 0.1804, Val mIoU: 0.6608


Epoch 3/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 3/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 3/50 => Train Loss: 0.1388, Train mIoU: 0.7040 | Val Loss: 0.1199, Val mIoU: 0.7099


Epoch 4/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 4/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 4/50 => Train Loss: 0.1056, Train mIoU: 0.7598 | Val Loss: 0.1018, Val mIoU: 0.7329


Epoch 5/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 5/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 5/50 => Train Loss: 0.0841, Train mIoU: 0.7940 | Val Loss: 0.0804, Val mIoU: 0.7959


Epoch 6/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 6/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 6/50 => Train Loss: 0.0732, Train mIoU: 0.8170 | Val Loss: 0.1020, Val mIoU: 0.7601


Epoch 7/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 7/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 7/50 => Train Loss: 0.0706, Train mIoU: 0.8274 | Val Loss: 0.0629, Val mIoU: 0.8352


Epoch 8/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 8/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 8/50 => Train Loss: 0.0541, Train mIoU: 0.8613 | Val Loss: 0.0538, Val mIoU: 0.8621


Epoch 9/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 9/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 9/50 => Train Loss: 0.0544, Train mIoU: 0.8586 | Val Loss: 0.0551, Val mIoU: 0.8554


Epoch 10/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 10/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 10/50 => Train Loss: 0.0456, Train mIoU: 0.8814 | Val Loss: 0.0496, Val mIoU: 0.8673
Logging predictions to W&B for epoch 10...


Epoch 11/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 11/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 11/50 => Train Loss: 0.0535, Train mIoU: 0.8616 | Val Loss: 0.0466, Val mIoU: 0.8711


Epoch 12/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 12/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 12/50 => Train Loss: 0.0384, Train mIoU: 0.8992 | Val Loss: 0.0427, Val mIoU: 0.8823


Epoch 13/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 13/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 13/50 => Train Loss: 0.0341, Train mIoU: 0.9099 | Val Loss: 0.0398, Val mIoU: 0.8889


Epoch 14/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 14/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 14/50 => Train Loss: 0.0317, Train mIoU: 0.9158 | Val Loss: 0.0387, Val mIoU: 0.8936


Epoch 15/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 15/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 15/50 => Train Loss: 0.0309, Train mIoU: 0.9168 | Val Loss: 0.0370, Val mIoU: 0.8982


Epoch 16/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 16/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 16/50 => Train Loss: 0.0279, Train mIoU: 0.9265 | Val Loss: 0.0359, Val mIoU: 0.8986


Epoch 17/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 17/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 17/50 => Train Loss: 0.0260, Train mIoU: 0.9317 | Val Loss: 0.0350, Val mIoU: 0.9038


Epoch 18/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 18/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 18/50 => Train Loss: 0.0250, Train mIoU: 0.9343 | Val Loss: 0.0335, Val mIoU: 0.9086


Epoch 19/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 19/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 19/50 => Train Loss: 0.0232, Train mIoU: 0.9386 | Val Loss: 0.0330, Val mIoU: 0.9091


Epoch 20/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 20/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 20/50 => Train Loss: 0.0219, Train mIoU: 0.9427 | Val Loss: 0.0326, Val mIoU: 0.9076
Logging predictions to W&B for epoch 20...


Epoch 21/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 21/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 21/50 => Train Loss: 0.0220, Train mIoU: 0.9408 | Val Loss: 0.0330, Val mIoU: 0.9078


Epoch 22/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 22/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 22/50 => Train Loss: 0.0199, Train mIoU: 0.9484 | Val Loss: 0.0311, Val mIoU: 0.9151


Epoch 23/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 23/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 23/50 => Train Loss: 0.0187, Train mIoU: 0.9519 | Val Loss: 0.0314, Val mIoU: 0.9140


Epoch 24/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 24/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 24/50 => Train Loss: 0.0506, Train mIoU: 0.8663 | Val Loss: 0.0463, Val mIoU: 0.8576


Epoch 25/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 25/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 25/50 => Train Loss: 0.0246, Train mIoU: 0.9330 | Val Loss: 0.0307, Val mIoU: 0.9158


Epoch 26/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 26/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 26/50 => Train Loss: 0.0189, Train mIoU: 0.9508 | Val Loss: 0.0299, Val mIoU: 0.9187


Epoch 27/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 27/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 27/50 => Train Loss: 0.0171, Train mIoU: 0.9565 | Val Loss: 0.0293, Val mIoU: 0.9211


Epoch 28/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 28/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 28/50 => Train Loss: 0.0161, Train mIoU: 0.9600 | Val Loss: 0.0291, Val mIoU: 0.9228


Epoch 29/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 29/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 29/50 => Train Loss: 0.0157, Train mIoU: 0.9611 | Val Loss: 0.0298, Val mIoU: 0.9209


Epoch 30/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 30/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 30/50 => Train Loss: 0.0148, Train mIoU: 0.9634 | Val Loss: 0.0294, Val mIoU: 0.9217
Logging predictions to W&B for epoch 30...


Epoch 31/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 31/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 31/50 => Train Loss: 0.0140, Train mIoU: 0.9657 | Val Loss: 0.0292, Val mIoU: 0.9224


Epoch 32/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 32/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 32/50 => Train Loss: 0.0139, Train mIoU: 0.9663 | Val Loss: 0.0303, Val mIoU: 0.9203


Epoch 33/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 33/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 33/50 => Train Loss: 0.0134, Train mIoU: 0.9668 | Val Loss: 0.0292, Val mIoU: 0.9230


Epoch 34/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 34/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 34/50 => Train Loss: 0.0128, Train mIoU: 0.9689 | Val Loss: 0.0296, Val mIoU: 0.9243


Epoch 35/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 35/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 35/50 => Train Loss: 0.0125, Train mIoU: 0.9689 | Val Loss: 0.0295, Val mIoU: 0.9236


Epoch 36/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 36/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 36/50 => Train Loss: 0.0128, Train mIoU: 0.9674 | Val Loss: 0.0297, Val mIoU: 0.9228


Epoch 37/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 37/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 37/50 => Train Loss: 0.0118, Train mIoU: 0.9708 | Val Loss: 0.0293, Val mIoU: 0.9260


Epoch 38/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 38/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 38/50 => Train Loss: 0.0111, Train mIoU: 0.9736 | Val Loss: 0.0298, Val mIoU: 0.9266


Epoch 39/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 39/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 39/50 => Train Loss: 0.0111, Train mIoU: 0.9727 | Val Loss: 0.0312, Val mIoU: 0.9220


Epoch 40/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 40/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 40/50 => Train Loss: 0.0405, Train mIoU: 0.8930 | Val Loss: 0.0446, Val mIoU: 0.8598
Logging predictions to W&B for epoch 40...


Epoch 41/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 41/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 41/50 => Train Loss: 0.0200, Train mIoU: 0.9447 | Val Loss: 0.0284, Val mIoU: 0.9227


Epoch 42/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 42/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 42/50 => Train Loss: 0.0125, Train mIoU: 0.9697 | Val Loss: 0.0283, Val mIoU: 0.9264


Epoch 43/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 43/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 43/50 => Train Loss: 0.0106, Train mIoU: 0.9757 | Val Loss: 0.0284, Val mIoU: 0.9283


Epoch 44/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 44/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 44/50 => Train Loss: 0.0097, Train mIoU: 0.9785 | Val Loss: 0.0291, Val mIoU: 0.9289


Epoch 45/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 45/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 45/50 => Train Loss: 0.0091, Train mIoU: 0.9795 | Val Loss: 0.0289, Val mIoU: 0.9289


Epoch 46/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 46/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 46/50 => Train Loss: 0.0087, Train mIoU: 0.9805 | Val Loss: 0.0311, Val mIoU: 0.9267


Epoch 47/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 47/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 47/50 => Train Loss: 0.0086, Train mIoU: 0.9804 | Val Loss: 0.0305, Val mIoU: 0.9271


Epoch 48/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 48/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 48/50 => Train Loss: 0.0083, Train mIoU: 0.9816 | Val Loss: 0.0313, Val mIoU: 0.9277


Epoch 49/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 49/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 49/50 => Train Loss: 0.0080, Train mIoU: 0.9819 | Val Loss: 0.0312, Val mIoU: 0.9274


Epoch 50/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 50/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 50/50 => Train Loss: 0.0081, Train mIoU: 0.9819 | Val Loss: 0.0314, Val mIoU: 0.9275
Logging predictions to W&B for epoch 50...


Testing:   0%|          | 0/63 [00:00<?, ?it/s]


--- Test Set Evaluation ---
Test Loss: 0.0327
Test mIoU: 0.9031
---------------------------

--- Logging final Test Predictions for U-Net (residual) ---


0,1
epoch,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇███
train_loss,█▃▃▂▂▂▂▂▁▂▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_miou,▁▃▅▅▆▆▇▇▇▇▇▇▇▇▇▇████▇███████████▇███████
val_loss,█▅▄▃▂▂▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_miou,▁▃▄▄▆▆▇▇▇▇▇▇▇▇██████████████████▇███████

0,1
best_val_miou,0.92894
epoch,50.0
test_loss,0.03269
test_miou,0.90308
train_loss,0.0081
train_miou,0.98187
val_loss,0.03143
val_miou,0.92749



===== Training U-Net (attention) =====


Model Parameters: 31389945


Epoch 1/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 1/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 1/50 => Train Loss: 1.1734, Train mIoU: 0.1892 | Val Loss: 0.7743, Val mIoU: 0.2878


Epoch 2/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 2/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 2/50 => Train Loss: 0.5640, Train mIoU: 0.4160 | Val Loss: 0.4283, Val mIoU: 0.4525


Epoch 3/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 3/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 3/50 => Train Loss: 0.3420, Train mIoU: 0.4798 | Val Loss: 0.2734, Val mIoU: 0.5529


Epoch 4/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 4/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 4/50 => Train Loss: 0.2251, Train mIoU: 0.6042 | Val Loss: 0.1900, Val mIoU: 0.6770


Epoch 5/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 5/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 5/50 => Train Loss: 0.1643, Train mIoU: 0.7206 | Val Loss: 0.1517, Val mIoU: 0.7284


Epoch 6/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

Epoch 6/50 [Val]:   0%|          | 0/38 [00:00<?, ?it/s]

Epoch 6/50 => Train Loss: 0.1285, Train mIoU: 0.7639 | Val Loss: 0.1203, Val mIoU: 0.7649


Epoch 7/50 [Train]:   0%|          | 0/213 [00:00<?, ?it/s]

KeyboardInterrupt: 

# Analysis of U-Net Variants for Semantic Segmentation

Based on the provided CSV data and plots from the WandB export, here's an analysis of the different U-Net architecture variants trained for semantic segmentation.

**Summary of Final Test Performance:**

| Variant          | Test mIoU        | Best Val mIoU    | Test Loss        | Runtime (s) |
| :--------------- | :--------------- | :--------------- | :--------------- | :---------- |
| **unet-residual**| **0.9024**       | 0.9293           | 0.0298           | 7413        |
| unet-attention   | 0.8995           | 0.7880           | **0.0294**       | 6282        |
| unet-vanilla     | 0.8612           | **0.9318**       | 0.0395           | 4776        |
| unet-noskip      | 0.8056           | 0.8377           | 0.0889           | 3912        |

---

## Detailed Analysis per Variant

### 3.1 Vanilla U-Net

*   **Performance:**
    *   Test mIoU: 0.8612
    *   Best Validation mIoU: 0.9318 (Highest among all variants)
    *   Test Loss: 0.0395
    *   Final Validation mIoU (Epoch 50): 0.8872
    *   Final Training mIoU (Epoch 50): 0.8753
*   **Plots Observation:** The training and validation mIoU curves show rapid initial learning, followed by a plateau. The validation mIoU peaked earlier in training (0.9318) and had settled noticeably lower (0.8872) by epoch 50. The model demonstrates the effectiveness of the standard U-Net architecture.
*   **Conclusion:** Serves as a solid baseline. It achieved the highest peak validation score but didn't generalize quite as well to the test set as the Residual or Attention variants.

### 3.2 U-Net without Skip Connections

*   **Performance:**
    *   Test mIoU: 0.8056 (Significantly lower than others)
    *   Best Validation mIoU: 0.8377
    *   Test Loss: 0.0889 (Highest among all variants)
    *   Final Validation mIoU (Epoch 50): 0.8368
    *   Final Training mIoU (Epoch 50): 0.9122
*   **Plots Observation:** Both `val_miou` and `train_miou` curves plateau at significantly lower levels compared to variants with skip connections. The curves also show noticeable dips (e.g., around steps 15 and 30), indicating training instability or difficulty in learning robust features without the skip connections. Convergence is slower and less effective.
*   **Differences in Visualized Results:** Compared to the standard U-Net, the visualized segmentation masks from this model would likely exhibit:
    *   Much coarser boundaries and less precise localization.
    *   More misclassified pixels, especially for smaller objects or intricate details.
    *   Potential failure to segment certain classes or finer structures altogether.
    The lack of high-resolution information from early encoder layers hinders the decoder's ability to reconstruct accurate details.
*   **Importance of Skip Connections:**
    *   Skip connections are fundamentally important in U-Net. They bridge the semantic gap between the encoder and decoder by concatenating high-resolution feature maps from the encoder pathway directly with the up-sampled feature maps in the decoder pathway.
    *   **Role:**
        1.  **Preserving Spatial Detail:** They allow the decoder to reuse fine-grained spatial information from earlier layers, which is lost during the downsampling (pooling/strided convolution) in the encoder. This is critical for accurate boundary localization.
        2.  **Gradient Flow:** They provide shorter paths for gradients to flow during backpropagation, mitigating the vanishing gradient problem and facilitating the training of deeper networks.
        3.  **Combining Context and Detail:** They enable the fusion of high-level semantic context (learned in deeper layers) with low-level, high-resolution details (from shallower layers).
    *   The poor performance (`test_miou` of 0.8056 vs 0.8612+ for others) and unstable training curves of the `unet-noskip` variant empirically demonstrate the critical necessity of skip connections for effective U-Net performance in segmentation tasks.

### 3.3 Residual U-Net

*   **Performance:**
    *   Test mIoU: 0.9024 (Highest)
    *   Best Validation mIoU: 0.9293
    *   Test Loss: 0.0298 (Second lowest; the Attention variant reached 0.0294)
    *   Final Validation mIoU (Epoch 50): 0.9293
    *   Final Training mIoU (Epoch 50): 0.9740 (Highest)
*   **Plots Observation:** The training and validation mIoU curves show smooth, stable, and rapid convergence to very high values, slightly outperforming the vanilla U-Net in the final stages, especially on the training set.
*   **Conclusion:** Replacing standard convolutional blocks with residual blocks yielded the best test mIoU (0.9024) and a test loss (0.0298) essentially tied with the lowest one (0.0294, Attention U-Net). Residual connections likely aided optimization within the blocks, leading to slightly more robust feature learning and better generalization, albeit at the cost of the longest training time (7413s).

### 3.4 Gated Attention U-Net

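*   **Performance:**
    *   Test mIoU: 0.8995
    *   Best Validation mIoU: 0.7880 (the only variant whose best validation mIoU is below its test mIoU)
    *   Test Loss: 0.0294 (Lowest)
*   **Plots Observation:** Similar to the Residual U-Net, the plots show smooth and effective learning, converging quickly to high mIoU values. The performance is consistently high throughout the later stages of training.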
*   **Advantages of Attention Gates (as per paper):**
    *   **Focus on Relevant Regions:** AGs learn to automatically focus on target structures of interest while suppressing feature responses in irrelevant background regions.
    *   **Improved Sensitivity/Accuracy:** By weighting features passed through skip connections based on relevance (guided by the gating signal), they improve model sensitivity and predictive accuracy, particularly for varying object shapes and sizes.
    *   **Computational Efficiency:** They add minimal computational overhead compared to significantly increasing model depth or using model ensembles.
*   **Role of Gating Signal:** The gating signal, typically derived from a coarser scale (deeper layer) in the network, provides contextual information. This context helps the attention gate identify which spatial locations in the high-resolution feature map from the skip connection are most salient for the segmentation task at the current decoder stage. It essentially guides the attention mechanism.
*   **Differences Compared to Standard U-Net:** The Attention U-Net significantly outperformed the Vanilla U-Net on the test set (`test_miou` 0.8995 vs 0.8612) and achieved comparable performance to the Residual U-Net. This suggests that the attention mechanism effectively refined the information passed through the skip connections, improving the model's ability to focus on relevant features and generalize better to unseen test data.

---

## Overall Comparison and Conclusions

1.  **Skip Connections are Essential:** The `unet-noskip` variant performed drastically worse than all others, highlighting the critical role of skip connections in preserving spatial detail and enabling effective feature fusion in U-Net architectures.
2.  **Advanced Blocks Improve Performance:** Both Residual blocks (`unet-residual`) and Attention Gates (`unet-attention`) led to notable improvements in test set mIoU and loss compared to the `unet-vanilla` baseline.
3.  **Residual vs. Attention:** In this experiment, the two variants were nearly tied on the test set: the Residual U-Net achieved a marginally higher test mIoU (0.9024 vs. 0.8995), while the Attention U-Net achieved a marginally lower test loss (0.0294 vs. 0.0298). The choice between them might depend on specific dataset characteristics and computational budget (the Residual U-Net took noticeably longer to train: 7413s vs. 6282s).
4.  **Generalization:** While the Vanilla U-Net reached the highest peak validation mIoU, the Residual and Attention variants generalized better to the final test set, indicating their mechanisms might help prevent minor overfitting or learn more robust features.
5.  **Training Efficiency:** The Vanilla U-Net was faster than the Residual and Attention variants, while the No-Skip variant was the fastest overall (likely due to its simpler architecture, although it was also the least accurate). The Residual variant was the slowest.
