# Exercise 1
**Inception module**

In [3]:
import torch
import torch.nn as nn

In [4]:
class InceptionModule(nn.Module):
    def __init__(self):
        super(InceptionModule, self).__init__()
        self.conv_1x1 = nn.Conv2d(192, 64, 1)
        self.conv_1x1_3x3 = nn.Conv2d(192, 96, 1)
        self.conv_1x1_5x5 = nn.Conv2d(192, 16, 1)
        self.max_pool = nn.MaxPool2d(3, 1, 1)
        self.conv_pool = nn.Conv2d(192, 32, 1)

        self.conv_3x3 = nn.Conv2d(96, 128, 3, 1, 1)
        self.conv_5x5 = nn.Conv2d(16, 32, 5, 1, 2)
    
    def forward(self, x):
        in_3x3 = self.conv_1x1_3x3(x)
        in_5x5 = self.conv_1x1_5x5(x)
        in_1x1 = self.max_pool(x)

        in_cat = self.conv_1x1(x) #64x28x28
        out_3x3 = self.conv_3x3(in_3x3) #128x28x28
        out_5x5 = self.conv_5x5(in_5x5) #32x28x28
        out_pool = self.conv_pool(in_1x1) #32x28x28

        out = torch.cat([in_cat, out_3x3, out_5x5, out_pool], dim=1) #cat across channels
        #(64 + 128 +32 + 32)x28x28 = 1x256x28x28
        return out

let's verify:

In [5]:
inception = InceptionModule()

sample = torch.randn(1, 192, 28, 28) #torch has BxCxHxW convention

out = inception(sample)

print(out.shape)

torch.Size([1, 256, 28, 28])


In [6]:
from torchsummary import summary
summary(inception, (192, 28, 28))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 96, 28, 28]          18,528
            Conv2d-2           [-1, 16, 28, 28]           3,088
         MaxPool2d-3          [-1, 192, 28, 28]               0
            Conv2d-4           [-1, 64, 28, 28]          12,352
            Conv2d-5          [-1, 128, 28, 28]         110,720
            Conv2d-6           [-1, 32, 28, 28]          12,832
            Conv2d-7           [-1, 32, 28, 28]           6,176
Total params: 163,696
Trainable params: 163,696
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 3.35
Params size (MB): 0.62
Estimated Total Size (MB): 4.55
----------------------------------------------------------------


# Exercise 2
**NN for ECG interpretation**
Convolutional neural networks can be applied
to one-dimensional as well as two- or three-
dimensional input. Let us implement a
residual neural network that was proposed
for ECG interpretation1. The network is
trained on 30 second long signals sampled at
200Hz --> **(1, 1, 6000) INPUT SAMPLES**. The architecture is depicted in the
figure to the right.
- The convolutional layers all have a filter
length of 16 and have 64. Every alternate
residual block subsamples its inputs by a
factor of 2, thus the original input is
ultimately subsampled by a factor of 2^8 --> 256.
- When a residual block subsamples the input,
the corresponding shortcut connections also
subsample their input using a Max Pooling
operation with the same subsample factor.
- The final fully connected layer and softmax
activation produce a distribution over the 14
output classes for each time-step.

In [7]:
import torch.nn.functional as F

In [8]:
class SingleResidualBlock(nn.Module):
    def __init__(self, index):
        super(SingleResidualBlock, self).__init__()
        stride = 2 if index % 2 == 0 else 1
        padding = 0 if stride == 2 else 1
        k_size = 2 if stride == 2 else 3

        left = (16 - 1) // 2      # 7
        right = (16 - 1) - left   # 8

        self.conv1 = nn.Sequential(
            nn.ConstantPad1d((left, right), 0),
            nn.Conv1d(64, 64, 16, stride=1, padding=0)
        )

        # conv2: 64 -> 64, may subsample with stride
        if stride == 2:
            self.conv2 = nn.Conv1d(64, 64, 16, stride=stride, padding=7)
        else:
           self.conv2 = nn.Sequential(
            nn.ConstantPad1d((left, right), 0),
            nn.Conv1d(64, 64, 16, stride=1, padding=0)
        )



        self.bn = nn.BatchNorm1d(64) #batch norm wants only #channels
        self.dp = nn.Dropout1d(p=0.7)

        self.pool = nn.MaxPool1d(kernel_size=k_size, stride=stride, padding=padding)

    
    def forward(self, x):
        x1 = F.relu(self.bn(self.conv1(x)))
        x1 = self.dp(x1)
        x1 = self.conv2(x1)

        max_pool = self.pool(x)
        
        out = max_pool + x1
        return out 





In [9]:
class RepeatedResidualBlock(nn.Module):
    def __init__(self, index):
        super(RepeatedResidualBlock, self).__init__()
        stride = 2 if index % 2 == 0 else 1
        padding = 0 if stride == 2 else 1
        k_size = 2 if stride == 2 else 3

        left = (16 - 1) // 2      # 7
        right = (16 - 1) - left   # 8

        self.bn1 = nn.BatchNorm1d(64) #batch norm wants only #channels
        self.dp = nn.Dropout1d(p=0.7)

        self.conv1 = nn.Sequential(
            nn.ConstantPad1d((left, right), 0),
            nn.Conv1d(64, 64, 16, stride=1, padding=0)
        )
        self.bn2 = nn.BatchNorm1d(64) #batch norm wants only #channels
        
         # conv2: 64 -> 64, may subsample with stride
        if stride == 2:
            self.conv2 = nn.Conv1d(64, 64, 16, stride=stride, padding=7)
        else:
           self.conv2 = nn.Sequential(
            nn.ConstantPad1d((left, right), 0),
            nn.Conv1d(64, 64, 16, stride=1, padding=0)
        )
        

        self.pool = nn.MaxPool1d(kernel_size=k_size, stride=stride, padding=padding)

    
    def forward(self, x):
        x1 = self.dp(F.relu(self.bn1(x)))
        
        x1 = self.conv1(x1)

        x1 = self.dp(F.relu(self.bn2(x1)))

        x1 = self.conv2(x1)

        max_pool = self.pool(x)
        
        out = max_pool + x1
        return out 

In [10]:
NUM_CLASSES = 14

In [11]:
class ECGNet(nn.Module):
    def __init__(self, in_channels):
        super(ECGNet, self).__init__()

        self.conv = nn.Conv1d(in_channels, 64, 16, stride=1)
        self.bn = nn.BatchNorm1d(64)

        self.single_res = SingleResidualBlock(index=0)
        self.repeated_res = nn.ModuleList([RepeatedResidualBlock(index) for index in range(1, 16)])

        self.fcl = nn.Linear(64*23, NUM_CLASSES)

        

    
    def forward(self, x):

        x = F.relu(self.bn(self.conv(x)))

        x = self.single_res(x)

        for res in self.repeated_res:
            x = res(x)
        
        x = F.relu(self.bn(x))

        x = torch.flatten(x, 1)

        out = self.fcl(x)

        logits = F.softmax(out, dim=1)
        
        return logits 

In [12]:
model = ECGNet(in_channels=1)
x = torch.randn(1, 1, 6000)
out = model(x)
print(out.shape)

torch.Size([1, 14])


In [13]:
summary(model, (1, 6000))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1             [-1, 64, 5985]           1,088
       BatchNorm1d-2             [-1, 64, 5985]             128
     ConstantPad1d-3             [-1, 64, 6000]               0
            Conv1d-4             [-1, 64, 5985]          65,600
       BatchNorm1d-5             [-1, 64, 5985]             128
         Dropout1d-6             [-1, 64, 5985]               0
            Conv1d-7             [-1, 64, 2992]          65,600
         MaxPool1d-8             [-1, 64, 2992]               0
SingleResidualBlock-9             [-1, 64, 2992]               0
      BatchNorm1d-10             [-1, 64, 2992]             128
        Dropout1d-11             [-1, 64, 2992]               0
    ConstantPad1d-12             [-1, 64, 3007]               0
           Conv1d-13             [-1, 64, 2992]          65,600
      BatchNorm1d-14             [-1, 

# Exercise 3
**NETWORK FOR HUMAN ACTION CLASSIFICATION**

mplement a two stream network for human action classification as proposed by
Simonyan and Zisserman in Two-Stream Convolutional Networks for Action
Recognition in Videos2. The task consists in classifying the frames in a video
according to the action performed by the human. This particular architectures
operates on a frame by frame basis.
Assuming that:
- The network takes as input one single frame and its corresponding optical
flow
o The optical flow3 is already calculated and provided as additional
input
o The input size of both RGB and optical flow are 224 x 224
o The optical flow for 2*L consecutive frames are encoded as 2*L
grayscale input channels to the temporal stream branch
- The two branches have the same architecture but separate (not shared)
parameters
- The number of possible actions is 100
- The class score fusion consists in taking the average of the two predictions

In [14]:
L = 10
NUM_CLASSES = 100

In [15]:
class SpatialStreamConvNet(nn.Module):
    def __init__(self):
        super(SpatialStreamConvNet, self).__init__()

        self.conv1 = nn.Conv2d(3, 96, 7, stride=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(96)

        self.conv2 = nn.Conv2d(96, 256, 5, stride=2)

        self.bn2 = nn.BatchNorm2d(256)

        self.conv3 = nn.Conv2d(256, 512, 3)

        self.conv4_5 = nn.Conv2d(512, 512, 3)

        self.fcl1 = nn.Linear(3*3*512, 4096)

        self.dp = nn.Dropout2d(0.7)

        self.fcl2 = nn.Linear(4096, 2048)

        self.fcl3 = nn.Linear(2048, NUM_CLASSES)
    
    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))

        x = self.pool(F.relu(self.bn2(self.conv2(x))))

        x = F.relu(self.conv3(x))

        x = F.relu(self.conv4_5(x))

        x = self.pool(F.relu(self.conv4_5(x)))

        x = torch.flatten(x, 1)

        x = F.relu(self.fcl1(x))

        x = F.relu(self.fcl2(x))

        x = self.fcl3(x)

        logits = F.softmax(x, dim=1)

        return logits



In [16]:
class TemporalStreamConvNet(nn.Module):
    def __init__(self, l):
        super(TemporalStreamConvNet, self).__init__()

        self.conv1 = nn.Conv2d(2*l, 96, 7, stride=2)
        self.pool = nn.MaxPool2d(2, 2)
        self.bn1 = nn.BatchNorm2d(96)

        self.conv2 = nn.Conv2d(96, 256, 5, stride=2)

        self.bn2 = nn.BatchNorm2d(256)

        self.conv3 = nn.Conv2d(256, 512, 3)

        self.conv4_5 = nn.Conv2d(512, 512, 3)

        self.fcl1 = nn.Linear(3*3*512, 4096)

        self.dp = nn.Dropout2d(0.7)

        self.fcl2 = nn.Linear(4096, 2048)

        self.fcl3 = nn.Linear(2048, NUM_CLASSES)
    
    def forward(self, x):
        x = self.pool(F.relu(self.bn1(self.conv1(x))))

        x = self.pool(F.relu(self.bn2(self.conv2(x))))

        x = F.relu(self.conv3(x))

        x = F.relu(self.conv4_5(x))

        x = self.pool(F.relu(self.conv4_5(x)))

        x = torch.flatten(x, 1)

        x = F.relu(self.fcl1(x))

        x = F.relu(self.fcl2(x))

        x = self.fcl3(x)

        logits = F.softmax(x, dim=1)

        return logits



In [17]:
class HumanActionNet(nn.Module):
    def __init__(self):
        super(HumanActionNet, self).__init__()
        self.SpatialNet = SpatialStreamConvNet()
        self.TemporalNet = TemporalStreamConvNet(L)
    
    def forward(self, frame, optical_flow):
        spatial_score = self.SpatialNet(frame)
        temporal_score = self.TemporalNet(optical_flow)

        avg = (spatial_score + temporal_score) / 2
        return avg

In [18]:
frame = torch.randn(1, 3, 224, 224)
optical_flow = torch.randn(1, 2*L, 224, 224)

model = HumanActionNet()

out = model(frame, optical_flow)

print(out.shape)

torch.Size([1, 100])


In [19]:
summary(model, [(3, 224, 224), (2*L, 224, 224)])

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 96, 109, 109]          14,208
       BatchNorm2d-2         [-1, 96, 109, 109]             192
         MaxPool2d-3           [-1, 96, 54, 54]               0
            Conv2d-4          [-1, 256, 25, 25]         614,656
       BatchNorm2d-5          [-1, 256, 25, 25]             512
         MaxPool2d-6          [-1, 256, 12, 12]               0
            Conv2d-7          [-1, 512, 10, 10]       1,180,160
            Conv2d-8            [-1, 512, 8, 8]       2,359,808
            Conv2d-9            [-1, 512, 6, 6]       2,359,808
        MaxPool2d-10            [-1, 512, 3, 3]               0
           Linear-11                 [-1, 4096]      18,878,464
           Linear-12                 [-1, 2048]       8,390,656
           Linear-13                  [-1, 100]         204,900
SpatialStreamConvNet-14                

# Exercise 4
**HOUSE PRICE PREDICTOR**
Let us define a neural network to predict the price of the house starting from
- One or more images of the interior and exterior
- A set of categorical data
In particular, refer to the dataset described here to retrieve the list of input
variables https://github.com/emanhamed/Houses-dataset
The high level architecture is depicted in the figure below. For extracting features
from the image, use a CNN of your choice pre-trained on ImageNet.


In [20]:
IMAGES_PER_HOUSE = 4
N_SAMPLES = 4

In [47]:
#CNN
import torchvision.models as md
resnet = md.resnet50(weights=md.ResNet50_Weights.DEFAULT)
resnet.fc = nn.Identity()

In [48]:
#MLP
class MLP(nn.Module):
    def __init__(self, in_features):
        super(MLP, self).__init__()

        self.fcl1 = nn.Linear(in_features, 128)
        self.fcl2 = nn.Linear(128, 128)
        self.fcl3 = nn.Linear(128, 64)
    
    def forward(self, x):
        x = F.sigmoid(self.fcl2(F.sigmoid(self.fcl1(x))))

        out = F.sigmoid(self.fcl3(x))
        return out


In [94]:
class HousePredictor(nn.Module):
    def __init__(self):
        super(HousePredictor, self).__init__()
        self.cnn = resnet
        self.mlp = MLP(4)
        self.fcl = nn.Linear(2112, 256)
        self.head = nn.Linear(256, 1)
    
    def forward(self, textual_data, images):
        if textual_data.dim() == 1:
            textual_data = textual_data.unsqueeze(0) #nneded for nn.Linear that expects (batch_size, n_features)

        
        if images.dim()==5: #handling batch of samples (houses) --> 5D tensors
            batch_size, num_images, C, H, W = images.shape
            images = images.view(batch_size * num_images, C, H, W) #if batch_size = 2 --> (8, 3, 224, 224)
            cnn_out = self.cnn(images) #(8, 2048)
            cnn_out = cnn_out.view(batch_size, num_images, -1) #(2, 4, 2048)
            cnn_feat = cnn_out.mean(dim=1)  # (2, 2048)
        else:
            cnn_out = self.cnn(images) #gives features for each image in the batch (4 images)
            cnn_feat = cnn_out.mean(dim=0, keepdim=True) #batch mean preserving dimension (1, 2048)
            
        mlp_out = self.mlp(textual_data) #(1, 64) if single batch and (2, 64) if 2 batches
        
        

        

        conc = torch.cat([mlp_out, cnn_feat], dim=1)
        x = F.relu(self.fcl(conc))
        out = self.head(x)
        return out #(1, 1)  or (2, 1)


In [78]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

images = torch.randn(N_SAMPLES, IMAGES_PER_HOUSE, 3, 224, 224) #ImageNet type

df = pd.read_csv('./data/data.csv')
categorical_colums = list(df.select_dtypes(['object']).columns)

if len(categorical_colums) != 0:
    le = LabelEncoder()
    for col in categorical_colums:
        df[col] = le.fit_transform(df[col])

train_cols = [col for col in df.columns if col!='Price' and col != 'Nmb']

textual_data = df[train_cols].values.astype(np.float32)
textual_data = torch.from_numpy(textual_data)


In [79]:
print(images.shape)
print(textual_data.shape)

torch.Size([4, 4, 3, 224, 224])
torch.Size([4, 4])


In [95]:
model = HousePredictor()
for sample_images, sample_textual_data in zip(images, textual_data):
    out = model(sample_textual_data, sample_images)

    print(out.shape)


torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])
torch.Size([1, 1])


In [96]:
from torchinfo import summary
summary(model, [(4, ), (4, 3, 224, 224)])

Layer (type:depth-idx)                        Output Shape              Param #
HousePredictor                                [1, 1]                    --
├─ResNet: 1-1                                 [4, 2048]                 --
│    └─Conv2d: 2-1                            [4, 64, 112, 112]         9,408
│    └─BatchNorm2d: 2-2                       [4, 64, 112, 112]         128
│    └─ReLU: 2-3                              [4, 64, 112, 112]         --
│    └─MaxPool2d: 2-4                         [4, 64, 56, 56]           --
│    └─Sequential: 2-5                        [4, 256, 56, 56]          --
│    │    └─Bottleneck: 3-1                   [4, 256, 56, 56]          75,008
│    │    └─Bottleneck: 3-2                   [4, 256, 56, 56]          70,400
│    │    └─Bottleneck: 3-3                   [4, 256, 56, 56]          70,400
│    └─Sequential: 2-6                        [4, 512, 28, 28]          --
│    │    └─Bottleneck: 3-4                   [4, 512, 28, 28]          379,392

# Exercise 5
**SIAMESE NETWORK**
1) Implement a simple Siamese network for face verification. The Siamese
network must
• take as input two images
• convert them into a vector of fixed length using a set of convolutional and
dense layers (hint: remember that both stream need to share the weights!)
• compute the distance
2) At inference time, only the part of the model that computes f(x) is needed.
Extract the subnetwork from the Siamese model

In [111]:
class CoreNetwork(nn.Module):
    def __init__(self, in_channels):
        super(CoreNetwork, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, 64, 3, 1, 1)
        self.bn1 = nn.BatchNorm2d(64)
        self.pool1 = nn.MaxPool2d(3, 1, 1)
        self.conv2 = nn.Conv2d(64, 128, 5, 1)
        self.bn2 = nn.BatchNorm2d(128)

        self.pool2_3 = nn.MaxPool2d(2, 2)

        self.conv3 = nn.Conv2d(128, 256, 3, 1, 1)
        self.bn3 = nn.BatchNorm2d(256)
        self.fcl1 = nn.Linear(55*55*256, 128)
        self.dp = nn.Dropout2d(0.7)

    def forward(self,x):
        x = self.pool1(F.relu(self.bn1(self.conv1(x))))

        x = self.dp(x)

        x = self.pool2_3(F.relu(self.bn2(self.conv2(x))))

        x = self.dp(x)

        x = self.pool2_3(F.relu(self.bn3(self.conv3(x))))

        x = self.dp(x)

        x = torch.flatten(x, 1)

        out = F.relu(self.fcl1(x))

        return out #return embeddings

In [None]:
class Siamese(nn.Module):
    def __init__(self, in_channels):
        super(Siamese, self).__init__()
        self.core = CoreNetwork(in_channels)
    
    def forward(self, x1, x2):
        f_x1 = self.core(x1)
        f_x2 = self.core(x2)

        diff = torch.abs(f_x1 - f_x2).sum(dim=1) #compute L1 distance in a batch

        return diff

In [None]:
model = Siamese(3)

#2 batches of 10 images, the pair "i" of images is composed by the image at position "i" in both batches
images1 = torch.randn(10, 3, 224, 224) #batch of 10 images for siamese network
images2 = torch.randn(10, 3, 224, 224) #batch of 10 images for siamese network

out = model(images1, images2) #we will have 10 values --> (10,)

print(out.shape)
out


torch.Size([10])


tensor([63.2710, 54.4061, 57.1568, 54.2596, 66.8101, 59.6038, 75.0633, 49.2480,
        69.4429, 87.5928], grad_fn=<SumBackward1>)

In [122]:
from torchsummary import summary
summary(model, [(3, 224, 224), (3, 224, 224)])

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1         [-1, 64, 224, 224]           1,792
       BatchNorm2d-2         [-1, 64, 224, 224]             128
         MaxPool2d-3         [-1, 64, 224, 224]               0
         Dropout2d-4         [-1, 64, 224, 224]               0
            Conv2d-5        [-1, 128, 220, 220]         204,928
       BatchNorm2d-6        [-1, 128, 220, 220]             256
         MaxPool2d-7        [-1, 128, 110, 110]               0
         Dropout2d-8        [-1, 128, 110, 110]               0
            Conv2d-9        [-1, 256, 110, 110]         295,168
      BatchNorm2d-10        [-1, 256, 110, 110]             512
        MaxPool2d-11          [-1, 256, 55, 55]               0
        Dropout2d-12          [-1, 256, 55, 55]               0
           Linear-13                  [-1, 128]      99,123,328
      CoreNetwork-14                  [

In [123]:
f_x = model.core #to use at inference time
f_x

CoreNetwork(
  (conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool1): MaxPool2d(kernel_size=3, stride=1, padding=1, dilation=1, ceil_mode=False)
  (conv2): Conv2d(64, 128, kernel_size=(5, 5), stride=(1, 1))
  (bn2): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (pool2_3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv3): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fcl1): Linear(in_features=774400, out_features=128, bias=True)
  (dp): Dropout2d(p=0.7, inplace=False)
)