In [1]:
import torch
import torch.nn as nn



In [5]:
class MBConvBlock(nn.Module):
    """Inverted-residual (MBConv) block: 1x1 expand -> depthwise conv -> 1x1 linear projection.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the final projection.
        kernel_size: size of the square depthwise kernel. The padding is
            ``kernel_size // 2``, which keeps the spatial size unchanged at
            stride 1 only when ``kernel_size`` is odd.
        stride: stride of the depthwise convolution.
        expansion_factor: multiplier for the hidden width; may be fractional.
            The hidden width is ``int(in_channels * expansion_factor)``. A value
            of exactly 1 skips the expansion convolution.

    Raises:
        ValueError: if the computed hidden width is smaller than 1.
    """

    def __init__(self, in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int,
                 expansion_factor: float):
        super(MBConvBlock, self).__init__()

        # The skip connection is only used when the input and output shapes match:
        #   1. stride must be 1 (so the spatial dimensions match)
        #   2. input channels must equal output channels
        self.use_residual = (stride == 1 and in_channels == out_channels)

        # Width of the "wide" inner representation,
        # e.g. in_channels=32 with expansion_factor=6 gives 32 * 6 = 192.
        hidden_dim = int(in_channels * expansion_factor)
        if hidden_dim < 1:
            # Fail here with a clear message instead of deep inside nn.Conv2d
            # (the depthwise conv would otherwise be built with groups=0).
            raise ValueError(
                f"expansion_factor={expansion_factor!r} with in_channels={in_channels} "
                f"gives a hidden dimension of {hidden_dim}; it must be at least 1"
            )

        # Layers are collected in a list and wrapped in nn.Sequential at the end.
        layers = []

        # Phase 1: expansion (narrow -> wide).
        # A 1x1 conv changes the channel count to hidden_dim. When the factor is
        # exactly 1 the hidden width equals in_channels, so this step is skipped.
        if expansion_factor != 1:
            layers.extend([
                nn.Conv2d(in_channels=in_channels, out_channels=hidden_dim, kernel_size=1, bias=False),
                nn.BatchNorm2d(num_features=hidden_dim),
                nn.ReLU6(inplace=True),
            ])

        # Phase 2: depthwise convolution.
        # groups=hidden_dim gives every channel its own spatial filter instead of
        # mixing channels. bias=False because the following BatchNorm has its own shift.
        layers.extend([
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size=kernel_size,
                      stride=stride,
                      padding=kernel_size // 2,
                      groups=hidden_dim,
                      bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
        ])

        # Phase 3: projection (wide -> narrow), the "linear bottleneck".
        # A 1x1 conv maps hidden_dim back to out_channels. Deliberately NO
        # activation follows the final BatchNorm: the projection stays linear.
        layers.extend([
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=out_channels,
                      kernel_size=1,
                      bias=False),
            nn.BatchNorm2d(out_channels),
        ])

        self.conv = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block, adding the input back when the skip connection is enabled."""
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
        

# Smoke test: stride 1 with 32 -> 32 channels and a 6x expansion; print the output shape.
block = MBConvBlock(
    in_channels=32, out_channels=32, kernel_size=3, stride=1, expansion_factor=6
)
x = torch.randn(1, 32, 64, 64)
out = block(x)
print(out.shape)
    
        
        

torch.Size([1, 32, 64, 64])


In [6]:
# Same smoke test with a fractional expansion factor (0.5); print the output shape.
block = MBConvBlock(
    in_channels=32, out_channels=32, kernel_size=3, stride=1, expansion_factor=0.5
)
x = torch.randn(1, 32, 64, 64)
out = block(x)
print(out.shape)
    

torch.Size([1, 32, 64, 64])


In [7]:
class MBConvBlock(nn.Module):
    """Inverted-residual (MBConv) block: 1x1 expand -> depthwise conv -> 1x1 linear projection.

    Args:
        in_channels: number of channels of the input tensor.
        out_channels: number of channels produced by the final projection.
        kernel_size: size of the square depthwise kernel. The padding is
            ``kernel_size // 2``, which keeps the spatial size unchanged at
            stride 1 only when ``kernel_size`` is odd.
        stride: stride of the depthwise convolution.
        expansion_factor: multiplier for the hidden width; may be fractional.
            The hidden width is ``int(in_channels * expansion_factor)``. A value
            of exactly 1 skips the expansion convolution.

    Raises:
        ValueError: if the computed hidden width is smaller than 1.
    """

    def __init__(self, in_channels: int,
                 out_channels: int,
                 kernel_size: int,
                 stride: int,
                 expansion_factor: float):
        super(MBConvBlock, self).__init__()

        # The skip connection is only used when the input and output shapes match:
        #   1. stride must be 1 (so the spatial dimensions match)
        #   2. input channels must equal output channels
        # Any stride other than 1 (including 2) therefore already disables it;
        # no separate stride check is needed.
        self.use_residual = (stride == 1 and in_channels == out_channels)

        # Width of the "wide" inner representation,
        # e.g. in_channels=32 with expansion_factor=6 gives 32 * 6 = 192.
        hidden_dim = int(in_channels * expansion_factor)
        if hidden_dim < 1:
            # Fail here with a clear message instead of deep inside nn.Conv2d
            # (the depthwise conv would otherwise be built with groups=0).
            raise ValueError(
                f"expansion_factor={expansion_factor!r} with in_channels={in_channels} "
                f"gives a hidden dimension of {hidden_dim}; it must be at least 1"
            )

        # Layers are collected in a list and wrapped in nn.Sequential at the end.
        layers = []

        # Phase 1: expansion (narrow -> wide).
        # A 1x1 conv changes the channel count to hidden_dim. When the factor is
        # exactly 1 the hidden width equals in_channels, so this step is skipped.
        if expansion_factor != 1:
            layers.extend([
                nn.Conv2d(in_channels=in_channels, out_channels=hidden_dim, kernel_size=1, bias=False),
                nn.BatchNorm2d(num_features=hidden_dim),
                nn.ReLU6(inplace=True),
            ])

        # Phase 2: depthwise convolution.
        # groups=hidden_dim gives every channel its own spatial filter instead of
        # mixing channels. bias=False because the following BatchNorm has its own shift.
        layers.extend([
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=hidden_dim,
                      kernel_size=kernel_size,
                      stride=stride,
                      padding=kernel_size // 2,
                      groups=hidden_dim,
                      bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(inplace=True),
        ])

        # Phase 3: projection (wide -> narrow), the "linear bottleneck".
        # A 1x1 conv maps hidden_dim back to out_channels. Deliberately NO
        # activation follows the final BatchNorm: the projection stays linear.
        layers.extend([
            nn.Conv2d(in_channels=hidden_dim,
                      out_channels=out_channels,
                      kernel_size=1,
                      bias=False),
            nn.BatchNorm2d(out_channels),
        ])

        self.conv = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Run the block, adding the input back when the skip connection is enabled."""
        if self.use_residual:
            return x + self.conv(x)
        return self.conv(x)
        

# Scenario: transition from 32 to 64 channels while halving the spatial size (stride 2)
block_downsample = MBConvBlock(in_channels=32, out_channels=64, kernel_size=3, stride=2, expansion_factor=6)

x = torch.randn(1, 32, 64, 64)
out = block_downsample(x)

print(f"Input: {x.shape}")
print(f"Output: {out.shape}")

# The success message claims both correct downsampling and a skipped residual,
# so check the full output shape (batch, channels, both spatial dims) and the
# block's residual flag rather than a single spatial dimension.
expected_shape = (x.shape[0], 64, x.shape[2] // 2, x.shape[3] // 2)
if tuple(out.shape) == expected_shape and not block_downsample.use_residual:
    print("SUCCESS: Downsampling handled correctly, residual skipped.")
else:
    print("FAILURE: Dimensions mismatch.")
        
        

Input: torch.Size([1, 32, 64, 64])
Output: torch.Size([1, 64, 32, 32])
SUCCESS: Downsampling handled correctly, residual skipped.


In [10]:
class MobileNetV2(nn.Module):
    """MobileNetV2 classifier: 3x3 stem -> stacked MBConvBlocks -> 1x1 head conv -> pooled linear classifier.

    Args:
        num_classes: number of outputs of the final linear layer.
        width_mult: multiplier applied to the channel counts of the stem, the
            body blocks and (only when greater than 1) the head. Scaled counts
            are rounded to a multiple of 8. The default of 1.0 leaves every
            channel count in the configuration unchanged.

    Raises:
        ValueError: if ``width_mult`` is not positive.
    """

    @staticmethod
    def _make_divisible(value: float, divisor: int = 8) -> int:
        """Round ``value`` to the nearest multiple of ``divisor`` (at least ``divisor``).

        If rounding down would lose more than 10% of ``value``, the next
        multiple up is returned instead.
        """
        rounded = max(divisor, int(value + divisor / 2) // divisor * divisor)
        if rounded < 0.9 * value:
            rounded += divisor
        return rounded

    def __init__(self, num_classes=1000, width_mult=1.0):
        super(MobileNetV2, self).__init__()

        if width_mult <= 0:
            raise ValueError(f"width_mult must be positive, got {width_mult!r}")

        # 1. Configuration (base, unscaled values)
        # t, c, n, s:  t -> expansion_factor, c -> out_channels,
        #              n -> number of times to repeat the block, s -> stride
        self.config = [
            [1, 16, 1, 1],
            [6, 24, 2, 2],
            [6, 32, 3, 2],
            [6, 64, 4, 2],
            [6, 96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # 2. Stem (initial processing): a strided 3x3 conv on the 3-channel input
        input_channel = self._make_divisible(32 * width_mult)
        # The head is never narrowed below 1280 channels; it only grows for width_mult > 1.
        last_channel = self._make_divisible(1280 * max(1.0, width_mult))

        self.stem = nn.Sequential(
            nn.Conv2d(
                in_channels=3,
                out_channels=input_channel,
                kernel_size=3,
                stride=2,
                padding=1,
                bias=False,
            ),
            nn.BatchNorm2d(input_channel),
            nn.ReLU6(inplace=True),
        )

        # 3. Body (stacked MBConvBlocks)
        layers = []
        for t, c, n, s in self.config:
            output_channel = self._make_divisible(c * width_mult)
            for i in range(n):
                # Only the first block of each stage uses the configured stride;
                # every following block in the stage uses stride 1.
                stride = s if i == 0 else 1

                layers.append(
                    MBConvBlock(
                        in_channels=input_channel,
                        out_channels=output_channel,
                        kernel_size=3,
                        stride=stride,
                        expansion_factor=t,
                    )
                )

                # The next block consumes what this one produced.
                input_channel = output_channel

        self.features = nn.Sequential(*layers)

        # 4. Head (classification)
        # 1x1 conv up to last_channel -> global average pool -> dropout + linear
        self.conv_last = nn.Sequential(
            nn.Conv2d(
                in_channels=input_channel,
                out_channels=last_channel,
                kernel_size=1,
                bias=False,
            ),
            nn.BatchNorm2d(last_channel),
            nn.ReLU6(inplace=True),
        )

        self.avg_pool = nn.AdaptiveAvgPool2d((1, 1))

        self.classifier = nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features=last_channel, out_features=num_classes),
        )

    def forward(self, x):
        """Run stem, body and head; returns one row of ``num_classes`` scores per input sample."""
        # 1. Stem
        x = self.stem(x)

        # 2. Body
        x = self.features(x)

        # 3. Head: pool to 1x1, flatten everything after the batch dimension, classify
        x = self.conv_last(x)
        x = self.avg_pool(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)

        return x
    
    


In [11]:
# 1. Build the network with a 1000-way classifier
model = MobileNetV2(num_classes=1000)

# 2. Count parameters (the recorded run reports about 3.5 million)
total_params = sum(param.numel() for param in model.parameters())
print(f"Total Parameters: {total_params/1e6:.2f} Million")

# 3. Push one random 224x224 RGB-shaped tensor through the model
dummy_input = torch.randn(1, 3, 224, 224)
output = model(dummy_input)

print(f"Input: {dummy_input.shape}")
print(f"Output: {output.shape}")

# 4. Final sanity check: one row of 1000 scores is expected
shape_ok = output.shape == (1, 1000)
print("✅ ARCHITECTURE COMPLETED SUCCESSFULLY" if shape_ok else "❌ DIMENSION ERROR")

Total Parameters: 3.50 Million
Input: torch.Size([1, 3, 224, 224])
Output: torch.Size([1, 1000])
✅ ARCHITECTURE COMPLETED SUCCESSFULLY
