<a href="https://colab.research.google.com/github/JensH-2157843/AML_Project/blob/main/src/neural_networks/NN1(segmentation).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Library imports

In [2]:
!pip install segmentation-models==1.0.1 albumentations==1.3.1 --quiet
import os
import numpy as np
from PIL import Image
from glob import glob
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader
import torch
import torchvision.transforms as T
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm
import tensorflow as tf
import matplotlib.pyplot as plt
from transformers import SegformerFeatureExtractor

   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 125.7/125.7 kB 5.1 MB/s eta 0:00:00
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 50.7/50.7 kB 3.5 MB/s eta 0:00:00

# Dataset import

In [None]:
# Colab-only: mount Google Drive at /content/drive so that the dataset
# folder read below (/content/drive/MyDrive/train) is reachable.
from google.colab import drive
drive.mount('/content/drive')

In [21]:
## DATASET IMPORT ##
# Root folder of the training data on the mounted Drive.
deepglobe_dir = "/content/drive/MyDrive/train"
import os

# Flat layout: images and masks sit directly in the root folder and are told
# apart by their file-name suffix.
deepglobe_images = sorted(glob(os.path.join(deepglobe_dir, '*_sat.jpg')))
deepglobe_masks = sorted(glob(os.path.join(deepglobe_dir, '*_mask.png')))

# Nested layout: every sub-directory of the root is treated as a tile holding
# an "images" and a "masks" folder; plain files in the root are ignored here.
for tile in sorted(os.listdir(deepglobe_dir)):
    tile_path = os.path.join(deepglobe_dir, tile)
    if os.path.isdir(tile_path):
        img_folder = os.path.join(tile_path, "images")
        mask_folder = os.path.join(tile_path, "masks")
        deepglobe_images += sorted(glob(os.path.join(img_folder, '*.jpg')))
        deepglobe_masks += sorted(glob(os.path.join(mask_folder, '*.png')))

# NOTE: image i is paired with mask i purely by position in the two sorted
# lists; nothing here verifies that the file names actually correspond.
all_images, all_masks = deepglobe_images, deepglobe_masks

# 80 % / 20 % train/validation split with a fixed seed; both lists are passed
# together so that each image stays with the mask at the same index.
train_imgs, val_imgs, train_masks, val_masks = train_test_split(
    all_images,
    all_masks,
    random_state=42,
    test_size=0.2,
)

642


# Model

In [25]:
## ARCHITECTURE ##
class ConvBlock(nn.Module):
    """
    Double convolution stage: Conv3x3 -> BatchNorm -> ReLU, applied twice.

    The first convolution maps ``in_channels`` to ``out_channels``; the second
    keeps ``out_channels``. Both use ``kernel_size=3`` with ``padding=1`` and
    no bias term (presumably because the BatchNorm that follows each
    convolution makes a separate bias redundant).

    Args:
        in_channels (int): Number of channels of the incoming feature map.
        out_channels (int): Number of channels produced by both convolutions.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        conv_kwargs = dict(kernel_size=3, padding=1, bias=False)

        # First Conv -> BN -> ReLU stage.
        self.conv1 = nn.Conv2d(in_channels, out_channels, **conv_kwargs)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu1 = nn.ReLU(inplace=True)

        # Second Conv -> BN -> ReLU stage.
        self.conv2 = nn.Conv2d(out_channels, out_channels, **conv_kwargs)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu2 = nn.ReLU(inplace=True)

    def forward(self, x):
        """Run ``x`` through both stages and return the resulting feature map."""
        first_stage = self.relu1(self.bn1(self.conv1(x)))
        return self.relu2(self.bn2(self.conv2(first_stage)))

class EncoderBlock(nn.Module):
    """
    One encoder stage: a ConvBlock followed by 2x2 max pooling (stride 2).

    ``forward`` hands back two tensors: the ConvBlock output before pooling
    (to be used as a skip connection) and the pooled version of it (to be fed
    to the next stage).

    Args:
        in_channels (int): Channels of the incoming feature map.
        out_channels (int): Channels produced by the ConvBlock.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv_block = ConvBlock(in_channels, out_channels)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Return ``(skip, pooled)`` for the input feature map ``x``."""
        features = self.conv_block(x)
        return features, self.pool(features)

class DecoderBlock(nn.Module):
    """
    One decoder stage: ConvTranspose (x2 upsampling) -> Concat with skip -> ConvBlock.

    The transposed convolution (kernel 2, stride 2) maps ``in_channels`` to
    ``out_channels``. Its result is concatenated channel-wise with the skip
    tensor, so the ConvBlock that follows is built for ``out_channels * 2``
    input channels and reduces them back to ``out_channels``.

    Args:
        in_channels (int): Channels of the tensor coming from the deeper stage.
        out_channels (int): Channels after upsampling and after the ConvBlock.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.upconv = nn.ConvTranspose2d(in_channels, out_channels, kernel_size=2, stride=2)
        self.conv_block = ConvBlock(out_channels * 2, out_channels)

    def forward(self, x, skip_connection):
        """
        Upsample ``x``, merge it with ``skip_connection`` and refine the result.

        When the upsampled tensor and the skip tensor differ in shape, the
        skip tensor is cropped around its centre to the height and width of
        the upsampled tensor (for an odd size difference the extra row/column
        is taken off the bottom/right edge).
        """
        x = self.upconv(x)

        if x.shape != skip_connection.shape:
            target_h, target_w = x.size()[2], x.size()[3]
            # Offset of the crop window inside the skip tensor.
            top = (skip_connection.size()[2] - target_h) // 2
            left = (skip_connection.size()[3] - target_w) // 2
            skip_connection = skip_connection[:, :, top : top + target_h,
                                                left : left + target_w]

        # Stack along the channel axis (dim=1): upsampled features first, skip second.
        merged = torch.cat([x, skip_connection], dim=1)
        return self.conv_block(merged)

class DeepUnet(nn.Module):
    """
    U-Net style segmentation network with four encoder and four decoder stages.

    Channel widths grow 64 -> 128 -> 256 -> 512 through the encoder, reach
    1024 in the bottleneck, and shrink back 512 -> 256 -> 128 -> 64 through the
    decoder. A final 1x1 convolution maps the 64 channels to ``out_classes``.
    """

    def __init__(self, in_channels=3, out_classes=11):
        """
        Build all layers of the network.

        Args:
            in_channels (int): Number of input channels (e.g., 3 for RGB).
            out_classes (int): Number of output segmentation classes.
        """
        super().__init__()
        self.in_channels = in_channels
        self.out_classes = out_classes

        # Contracting path: each stage yields a skip tensor and a pooled tensor.
        self.enc1 = EncoderBlock(in_channels, 64)
        self.enc2 = EncoderBlock(64, 128)
        self.enc3 = EncoderBlock(128, 256)
        self.enc4 = EncoderBlock(256, 512)

        # Deepest point of the network (no pooling here).
        self.bottleneck = ConvBlock(512, 1024)

        # Expanding path: each stage upsamples and merges one skip tensor.
        self.dec1 = DecoderBlock(1024, 512)
        self.dec2 = DecoderBlock(512, 256)
        self.dec3 = DecoderBlock(256, 128)
        self.dec4 = DecoderBlock(128, 64)

        # Per-pixel classifier. No softmax is applied afterwards; it is often
        # omitted when training with CrossEntropyLoss, which combines
        # LogSoftmax and NLLLoss.
        self.output_conv = nn.Conv2d(64, out_classes, kernel_size=1)

    def forward(self, x):
        """
        Compute the segmentation scores for a batch of images.

        Args:
            x (Tensor): The input tensor (N, C, H, W).

        Returns:
            Tensor: Raw (un-normalised) class scores (N, out_classes, H', W').
            H' and W' equal H and W when both are multiples of 16 (four 2x
            poolings); otherwise the decoder crops the skip tensors and the
            output is spatially smaller than the input.
        """
        # Contracting path: keep every pre-pooling tensor for the decoder.
        skip1, down = self.enc1(x)
        skip2, down = self.enc2(down)
        skip3, down = self.enc3(down)
        skip4, down = self.enc4(down)

        up = self.bottleneck(down)

        # Expanding path: consume the skips from deepest to shallowest.
        up = self.dec1(up, skip4)
        up = self.dec2(up, skip3)
        up = self.dec3(up, skip2)
        up = self.dec4(up, skip1)

        return self.output_conv(up)

# Smoke test: build the model, push one random batch through it and report
# the tensor shapes and the parameter count.
if __name__ == '__main__':
    num_classes = 11

    # 256 is a multiple of 16, so the four pool/upsample stages line up
    # without any cropping of the skip connections.
    input_tensor = torch.randn(1, 3, 256, 256)  # (N, C, H, W)

    model = DeepUnet(in_channels=3, out_classes=num_classes)
    output = model(input_tensor)

    print(f"Input Tensor Shape: {input_tensor.shape}")
    print(f"Output Tensor Shape: {output.shape}")

    # Only parameters that receive gradients are counted.
    num_params = sum(
        weight.numel()
        for weight in model.parameters()
        if weight.requires_grad
    )
    print(f"Number of trainable parameters: {num_params:,}")

Input Tensor Shape: torch.Size([1, 3, 256, 256])
Output Tensor Shape: torch.Size([1, 11, 256, 256])
Number of trainable parameters: 31,038,283
