### Deep Learning Homework 6

Taking inspiration from the last 2 pictures within the notebook (07-convnets.ipynb), implement a U-Net-style CNN with the following specs:

1. All convolutions must use a 3 x 3 kernel and leave the spatial dimensions (i.e. height, width) of the input untouched.
2. Downsampling in the contracting part is performed via maxpooling with a 2 x 2 kernel and stride of 2.
3. Upsampling is performed by a deconvolution (transposed convolution) with a 2 x 2 kernel and stride of 2. The PyTorch module that implements the deconvolution is `nn.ConvTranspose2d`.
4. The final layer of the expanding part has only 1 channel 

* between how many classes are we discriminating?

Create a network class with (at least) a `__init__` and a `forward` method. Please resort to additional structures (e.g., `nn.Module`s, private methods...) if you believe it helps readability of your code.

Test, at least with random data, that the network is doing the correct tensor operations and that the output has the correct shape (e.g., use `assert`s in your code to check that the output is of the expected shape).

Note: the overall organization of your work can greatly improve readability and understanding of your code by others. Please consider preparing your notebook in an organized fashion so that we can better understand (and correct) your implementation.

In [1]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pylab as pl
from IPython.display import clear_output

In [13]:
class VGG_block(nn.Module):
    """
    Stack of 3x3 convolutions (padding 1, so height and width are preserved),
    each followed by an activation, with an optional 2x2 maxpooling at the end.

    Args:
        in_channels: channels expected by the first convolution
        out_channels: channels produced by every convolution of the block
        num_layers: number of convolutions; at least one is always created
        maxpool: if True a `nn.MaxPool2d(2)` is appended after the last activation
        activation: callable invoked without arguments to build each activation module
    """
    def __init__(self, in_channels, out_channels, num_layers=2, maxpool=False, activation=nn.ReLU):
        super().__init__()

        # Input width of every convolution: only the first one sees in_channels,
        # all the following ones map out_channels -> out_channels
        conv_inputs = [in_channels] + [out_channels] * (num_layers - 1)

        stack = []
        for width in conv_inputs:
            stack += [nn.Conv2d(width, out_channels, kernel_size=3, padding=1), activation()]

        if maxpool:
            stack.append(nn.MaxPool2d(2))

        self.layers = nn.Sequential(*stack)

    def forward(self, X):
        """Apply the whole stack to X and return the result"""
        return self.layers(X)
    
    
class upsampling_block(nn.Module):
    """
    Upsampling block of the U-net: a VGG_block going from in_channels to
    mid_channels, then a 2x2 stride-2 deconvolution going from mid_channels
    to out_channels, then one more activation.

    Args:
        in_channels: channels of the incoming tensor
        mid_channels: channels produced by the convolutions of the VGG_block
        out_channels: channels produced by the deconvolution
        num_mid_layers: number of convolutions inside the VGG_block
        activation: callable invoked without arguments to build the activation modules
    """
    def __init__(self, in_channels, mid_channels, out_channels, num_mid_layers=2, activation=nn.ReLU):
        super().__init__()

        convolutions = VGG_block(in_channels, mid_channels, num_layers=num_mid_layers, activation=activation)
        # Kernel 2 with stride 2: the output patches do not overlap
        deconvolution = nn.ConvTranspose2d(mid_channels, out_channels, kernel_size=2, stride=2)

        self.layers = nn.Sequential(convolutions, deconvolution, activation())

    def forward(self, X):
        """Run the convolutions, the deconvolution and the final activation on X"""
        return self.layers(X)

    
class U_net(nn.Module):
    """
    Implements a U-net, this architecture can be trained without changing anything
    on non-square and non-power of two images, however the results can be worse

    The network is made of `depth` VGG_blocks (each followed by a 2x2 maxpooling),
    a "deepest" upsampling_block, `depth - 1` further upsampling_blocks that each
    receive the matching skip connection concatenated on the channel axis, and a
    final classifier producing `num_classes` channels.

    Args:
        channels: number of channels of the input images
        depth: number of downsampling stages, must be at least 1
        num_classes: number of channels of the output map

    Raises:
        ValueError: if depth is smaller than 1
    """

    def __init__(self, channels=3, depth=4, num_classes=10):
        super().__init__()

        # Without at least one downsampling stage there is no skip connection for
        # the classifier and forward() could only fail with an opaque IndexError
        if depth < 1:
            raise ValueError(f"depth must be at least 1, got {depth!r}")

        # Downsampling layers, the number of channels doubles at every stage
        downsampling_layers = []
        in_channels, out_channels = channels, 64
        for _ in range(depth):
            downsampling_layers.append(VGG_block(in_channels, out_channels))
            in_channels = out_channels
            out_channels *= 2

        # Kept as a Sequential only as a container: forward() iterates over it
        # because the maxpooling and the skip bookkeeping happen between the blocks
        self.downsampling_layers = nn.Sequential(*downsampling_layers)
        # Ceil mode is required if I have uneven images, otherwise it's the same
        self.maxpool = nn.MaxPool2d(2, ceil_mode=True)

        # "Deepest" layer
        mid_channels = out_channels
        out_channels = in_channels
        self.deep_layer = upsampling_block(in_channels, mid_channels, out_channels)

        # Upsampling layers, the input is twice as wide as the previous output
        # because the skip connection is concatenated to it
        upsampling_layers = []
        in_channels, mid_channels, out_channels = in_channels*2, mid_channels//2, out_channels//2
        for _ in range(depth-1):
            upsampling_layers.append(upsampling_block(in_channels, mid_channels, out_channels))
            in_channels = in_channels//2
            mid_channels = mid_channels//2
            out_channels = out_channels//2

        self.upsampling_layers = nn.Sequential(*upsampling_layers)

        # Classifier or last layer
        self.classifier = nn.Sequential(
            VGG_block(in_channels, mid_channels),
            nn.Conv2d(mid_channels, num_classes, kernel_size=1)
        )


    def center_crop(self, images, size_x, size_y):
        """
        Rectangle center crop of a set of images

        Args:
            images: tensor whose dimensions 2 and 3 are the ones being cropped
            size_x: requested size along dimension 2
            size_y: requested size along dimension 3

        Returns:
            The centered crop; a requested size bigger than the image leaves that
            dimension untouched, and if nothing has to be cropped the very same
            tensor is returned.
        """
        full_x, full_y = images.size()[2:4]

        # A crop can never be bigger than the image: clamping each dimension on its
        # own avoids negative slice bounds when only one of the two sizes is too big
        size_x = min(size_x, full_x)
        size_y = min(size_y, full_y)

        # If the crop is equal to the images do nothing
        if size_x == full_x and size_y == full_y:
            return images

        # Otherwise perform the cropping around the center of the images
        start_x = full_x//2 - size_x//2
        start_y = full_y//2 - size_y//2
        return images[:, :, start_x: start_x + size_x, start_y: start_y + size_y]


    def forward(self, X):
        """
        Run the U-net on a batch of images

        Args:
            X: batch of images with `channels` channels on dimension 1 and the
               spatial dimensions on dimensions 2 and 3

        Returns:
            A tensor with `num_classes` channels and the spatial size of X
        """
        skips = [] # Holds the skip connections
        out = X

        # Downsampling phase
        for layer in self.downsampling_layers:
            out = layer(out)
            skips.append(out)
            out = self.maxpool(out)

        # Deepest layer
        out = self.deep_layer(out)

        # Upsampling phase, the skips are consumed from the deepest to the shallowest;
        # there is one upsampling layer less than skips so skips[0] is left for the classifier
        for layer, skip in zip(self.upsampling_layers, reversed(skips)):
            # The cropping is done only if the downsampling phase has uneven image sizes
            # In that case in the upsampling the resulting image will be 1 pixel
            # wider and I need to crop it, notice that this doesn't happen for
            # power-of-two images and cropping does nothing
            out = self.center_crop(out, *skip.size()[2:])
            out = torch.cat((skip, out), dim=1) # Concatenate the previous output
            out = layer(out)

        # Classification phase
        out = self.center_crop(out, *skips[0].size()[2:])
        out = torch.cat((skips[0], out), dim=1)
        return self.classifier(out)

In [14]:
from random import randint


# Only the shapes are inspected, so no gradient is needed: disabling autograd
# avoids storing every intermediate activation of the forward passes
with torch.no_grad():
    # Test with images like the one in the U-net picture of the notebook, F for my pc
    net = U_net(channels=3, depth=3, num_classes=2)
    images = torch.randn((10, 3, 572, 572))
    out = net(images)
    # Batch size and spatial size are preserved, the channels become num_classes
    assert tuple(out.size()) == (10, 2, 572, 572), out.size()
    print("input size:", images.size(), "output size:", out.size(), "\n")

    # Test with random images and random nets
    for _ in range(10):
        channels = randint(1,10)
        depth = randint(1,5)
        num_classes = randint(2, 20)
        net = U_net(channels=channels, depth=depth, num_classes=num_classes)

        batch, height, width = randint(1,10), randint(1, 200), randint(1, 200)
        images = torch.randn((batch, channels, height, width))
        out = net(images)
        # Check the whole shape, not only height and width, so that a wrong
        # batch size or a wrong number of output channels is caught as well
        assert tuple(out.size()) == (batch, num_classes, height, width), out.size()
        print("input size:", images.size(), "output size:", out.size())


print("\nTest passed!")

input size: torch.Size([10, 3, 572, 572]) output size: torch.Size([10, 2, 572, 572]) 

input size: torch.Size([4, 3, 77, 197]) output size: torch.Size([4, 20, 77, 197])
input size: torch.Size([5, 7, 160, 4]) output size: torch.Size([5, 19, 160, 4])
input size: torch.Size([2, 10, 198, 38]) output size: torch.Size([2, 4, 198, 38])
input size: torch.Size([9, 1, 147, 16]) output size: torch.Size([9, 14, 147, 16])
input size: torch.Size([4, 8, 154, 41]) output size: torch.Size([4, 17, 154, 41])
input size: torch.Size([1, 4, 197, 100]) output size: torch.Size([1, 16, 197, 100])
input size: torch.Size([1, 4, 24, 177]) output size: torch.Size([1, 3, 24, 177])
input size: torch.Size([9, 7, 118, 89]) output size: torch.Size([9, 18, 118, 89])
input size: torch.Size([1, 1, 25, 48]) output size: torch.Size([1, 10, 25, 48])
input size: torch.Size([9, 10, 97, 87]) output size: torch.Size([9, 9, 97, 87])

Test passed!


In [5]:
# Summary of the model
# NOTE(review): `net` is not defined in this cell; presumably it is the last
# network left bound by the shape-test cell, i.e. one with randomly drawn
# settings — confirm which model is meant to be summarised
from torchsummary import summary

# Print the layer tree together with the parameter count of each layer
summary(net)
# Trailing bare expression: the notebook displays the repr of the module as cell output
net

Layer (type:depth-idx)                        Param #
├─Sequential: 1-1                             --
|    └─VGG_block: 2-1                         --
|    |    └─Sequential: 3-1                   39,872
|    └─VGG_block: 2-2                         --
|    |    └─Sequential: 3-2                   221,440
|    └─VGG_block: 2-3                         --
|    |    └─Sequential: 3-3                   885,248
|    └─VGG_block: 2-4                         --
|    |    └─Sequential: 3-4                   3,539,968
|    └─VGG_block: 2-5                         --
|    |    └─Sequential: 3-5                   14,157,824
├─MaxPool2d: 1-2                              --
├─upsampling_block: 1-3                       --
|    └─Sequential: 2-6                        --
|    |    └─VGG_block: 3-6                    56,627,200
|    |    └─ConvTranspose2d: 3-7              8,389,632
|    |    └─ReLU: 3-8                         --
├─Sequential: 1-4                             --
|    └─upsampling_bl

U_net(
  (downsampling_layers): Sequential(
    (0): VGG_block(
      (layers): Sequential(
        (0): Conv2d(5, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU()
      )
    )
    (1): VGG_block(
      (layers): Sequential(
        (0): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU()
      )
    )
    (2): VGG_block(
      (layers): Sequential(
        (0): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (3): ReLU()
      )
    )
    (3): VGG_block(
      (layers): Sequential(
        (0): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
        (1): ReLU()
        (2): Conv2d(5