In [4]:
# Torch
import torch 
from torch import nn
from torch.nn import functional as F

# TorchInfo
from torchinfo import summary

# Factorization into smaller convolutions

## Comparing the number of parameter in large filter

In [11]:
class SimpleCNN(nn.Module):
    def __init__(self,filter_size=3) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(in_channels=3,out_channels=32,kernel_size=filter_size,
                               stride=1,padding=(filter_size-1)//2)
        self.maxpool = nn.MaxPool2d(kernel_size=2,stride=2)
        
    def forward(self,x):
        return self.maxpool(F.relu(self.conv1(x)))

In [12]:
model3x3 = SimpleCNN(filter_size=3)
model5x5 = SimpleCNN(filter_size=5)

In [13]:
summary(model3x3,input_size=(1,3,32,32)  ,  col_width=14,depth=2,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
)

Layer (type (var_name))                  Kernel Shape   Output Shape   Param #        Mult-Adds
SimpleCNN (SimpleCNN)                    --             [1, 32, 16, 16] --             --
├─Conv2d (conv1)                         [3, 3]         [1, 32, 32, 32] 896            917,504
├─MaxPool2d (maxpool)                    2              [1, 32, 16, 16] --             --
Total params: 896
Trainable params: 896
Non-trainable params: 0
Total mult-adds (M): 0.92
Input size (MB): 0.01
Forward/backward pass size (MB): 0.26
Params size (MB): 0.00
Estimated Total Size (MB): 0.28

In [14]:
summary(model5x5,input_size=(1,3,32,32)  ,  col_width=14,depth=2,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
)

Layer (type (var_name))                  Kernel Shape   Output Shape   Param #        Mult-Adds
SimpleCNN (SimpleCNN)                    --             [1, 32, 16, 16] --             --
├─Conv2d (conv1)                         [5, 5]         [1, 32, 32, 32] 2,432          2,490,368
├─MaxPool2d (maxpool)                    2              [1, 32, 16, 16] --             --
Total params: 2,432
Trainable params: 2,432
Non-trainable params: 0
Total mult-adds (M): 2.49
Input size (MB): 0.01
Forward/backward pass size (MB): 0.26
Params size (MB): 0.01
Estimated Total Size (MB): 0.28

In [15]:
inputs = torch.rand((1,3,32,32))
output3x3 =  model3x3(inputs)
output5x5 = model5x5(inputs)

In [16]:
# Compare the number of parameter in the two models
print(f"Number of parameter in the 3x3 model : {sum(p.numel()for p in model3x3.parameters())}")
print(f"Number of parameter in the 5x5 model : {sum(p.numel()for p in model5x5.parameters())}")

Number of parameter in the 3x3 model : 896
Number of parameter in the 5x5 model : 2432


In [17]:
# Compare the shape of the output from the two models
print("Output shape of 3x3 model: ", output3x3.shape)
print("Output shape of 5x5 model: ", output5x5.shape)

Output shape of 3x3 model:  torch.Size([1, 32, 16, 16])
Output shape of 5x5 model:  torch.Size([1, 32, 16, 16])


As you can see, the number of parameters in the model with a 5x5 filter is significantly higher than the model with a 3x3 filter. This means that the computational cost of using a 5x5 filter is higher than a 3x3 filter. 

## Replacing the large conv with smaller conv

In [32]:
class Inception(nn.Module):
    def __init__(self,in_channels,out_channels) -> None:
        super().__init__()
        
        self.branch1 = nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0)
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=3,stride=1,padding=1),nn.ReLU()
            )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=5,stride=1,padding=2),nn.ReLU()
        )
        
        
    def forward(self,x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        
        print(x1.shape,x2.shape,x3.shape)
        
        return torch.cat((x1,x2,x3),dim=1)

In [38]:
inputs = torch.rand((1,3,32,32))

In [33]:
summary(Inception(3,32),input_size=(1,3,32,32)  ,  col_width=14,depth=2,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
)

torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32])


Layer (type (var_name))                  Kernel Shape   Output Shape   Param #        Mult-Adds
Inception (Inception)                    --             [1, 96, 32, 32] --             --
├─Conv2d (branch1)                       [1, 1]         [1, 32, 32, 32] 128            131,072
├─Sequential (branch2)                   --             [1, 32, 32, 32] --             --
│    └─Conv2d (0)                        [1, 1]         [1, 32, 32, 32] 128            131,072
│    └─ReLU (1)                          --             [1, 32, 32, 32] --             --
│    └─Conv2d (2)                        [3, 3]         [1, 32, 32, 32] 9,248          9,469,952
│    └─ReLU (3)                          --             [1, 32, 32, 32] --             --
├─Sequential (branch3)                   --             [1, 32, 32, 32] --             --
│    └─Conv2d (0)                        [1, 1]         [1, 32, 32, 32] 128            131,072
│    └─ReLU (1)                          --             [1, 32, 32, 32] 

In [30]:
class NewInception(nn.Module):
    def __init__(self,in_channels,out_channels) -> None:
        super().__init__()
        
        self.branch1 = nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0)
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=3,stride=1,padding=1),nn.ReLU()
            )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=3,stride=1,padding=1),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=3,stride=1,padding=1),nn.ReLU()
        )
        
        
    def forward(self,x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        
        print(x1.shape,x2.shape,x3.shape)
        
        return torch.cat((x1,x2,x3),dim=1)

In [31]:
summary(NewInception(3,32),input_size=(1,3,32,32)  ,  col_width=14,depth=2,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
)

torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32])


Layer (type (var_name))                  Kernel Shape   Output Shape   Param #        Mult-Adds
NewInception (NewInception)              --             [1, 96, 32, 32] --             --
├─Conv2d (branch1)                       [1, 1]         [1, 32, 32, 32] 128            131,072
├─Sequential (branch2)                   --             [1, 32, 32, 32] --             --
│    └─Conv2d (0)                        [1, 1]         [1, 32, 32, 32] 128            131,072
│    └─ReLU (1)                          --             [1, 32, 32, 32] --             --
│    └─Conv2d (2)                        [3, 3]         [1, 32, 32, 32] 9,248          9,469,952
│    └─ReLU (3)                          --             [1, 32, 32, 32] --             --
├─Sequential (branch3)                   --             [1, 32, 32, 32] --             --
│    └─Conv2d (0)                        [1, 1]         [1, 32, 32, 32] 128            131,072
│    └─ReLU (1)                          --             [1, 32, 32, 32] 

 By using multiple smaller convolutional layers instead of one large 5x5 convolution, the network can achieve the same output depth while using fewer parameters

In [40]:
old_inception = Inception(3,32)
new_inception = NewInception(3,32)

In [43]:
num_param_old = sum(p.numel() for p in  old_inception.parameters())  
num_param_new = sum(p.numel() for p in  new_inception.parameters())  

In [46]:
print(f"reduction cost converting the 5x5 into two 3x3 : \
    {(num_param_old - num_param_new)/num_param_old:.2%}")

reduction cost converting the 5x5 into two 3x3 :     20.24%


 # Spatial Factorization into Asymmetric Convolutions

In [58]:
class AsymInception(nn.Module):
    def __init__(self,in_channels,out_channels) -> None:
        super().__init__()
        
        self.branch1 = nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0)
        
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(3,1),stride=1,padding=(1,0)),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(1,3),stride=1,padding=(0,1)),nn.ReLU(),
            )
        
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels,out_channels,kernel_size=1,stride=1,padding=0),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(3,1),stride=1,padding=(1,0)),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(1,3),stride=1,padding=(0,1)),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(3,1),stride=1,padding=(1,0)),nn.ReLU(),
            nn.Conv2d(out_channels,out_channels,kernel_size=(1,3),stride=1,padding=(0,1)),nn.ReLU()
        )
        
        
    def forward(self,x):
        x1 = self.branch1(x)
        x2 = self.branch2(x)
        x3 = self.branch3(x)
        
        print(x1.shape,x2.shape,x3.shape)
        
        return torch.cat((x1,x2,x3),dim=1)

In [59]:
summary(AsymInception(3,32),input_size=(1,3,32,32)  ,  col_width=14,depth=2,
    col_names=["kernel_size", "output_size", "num_params", "mult_adds"],
    row_settings=["var_names"],
)

torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32]) torch.Size([1, 32, 32, 32])


Layer (type (var_name))                  Kernel Shape   Output Shape   Param #        Mult-Adds
AsymInception (AsymInception)            --             [1, 96, 32, 32] --             --
├─Conv2d (branch1)                       [1, 1]         [1, 32, 32, 32] 128            131,072
├─Sequential (branch2)                   --             [1, 32, 32, 32] --             --
│    └─Conv2d (0)                        [1, 1]         [1, 32, 32, 32] 128            131,072
│    └─ReLU (1)                          --             [1, 32, 32, 32] --             --
│    └─Conv2d (2)                        [3, 1]         [1, 32, 32, 32] 3,104          3,178,496
│    └─ReLU (3)                          --             [1, 32, 32, 32] --             --
│    └─Conv2d (4)                        [1, 3]         [1, 32, 32, 32] 3,104          3,178,496
│    └─ReLU (5)                          --             [1, 32, 32, 32] --             --
├─Sequential (branch3)                   --             [1, 32, 32, 32

In [60]:
asym_inception = AsymInception(3,32)

num_param_asym = sum(p.numel() for p in asym_inception.parameters())
num_param_asym

19008

In [61]:
num_param_old,num_param_new,num_param_asym

(35264, 28128, 19008)

In [63]:
print(f"Relative gain for reduction converting the 5x5 into two 3x3 : \
    {(num_param_old - num_param_new)/num_param_old:.2%}")
print(f"Relative gain for reduction converting the 5x5 using the asymmetric conv  : \
    {(num_param_old - num_param_asym)/num_param_old:.2%}")

Relative gain for reduction converting the 5x5 into two 3x3 :     20.24%
Relative gain for reduction converting the 5x5 using the asymmetric conv  :     46.10%
