# EfficientNet

# Build the EfficientNet

Let's start to build the model!

We first need to build the `EfficientNetB0` baseline model, which is shown in *Table 1* of the EfficientNet paper.

We also set up the hyperparameters used for `B0` through `B7`.

These hyperparameters systematically scale up the depth, width, and input resolution of the model.

In [38]:
# B0 baseline configuration: one row per MBConv stage.
# A stride of 2 means the stage halves the spatial resolution of its input;
# a stride of 1 keeps the resolution unchanged.
baseline_model = [
    # [expand_ratio, channels, repeats (layers), stride, kernel_size]
    [1,  16, 1, 1, 3],  # MBConv1, 3x3 kernel
    [6,  24, 2, 2, 3],  # MBConv6, 3x3 kernel
    [6,  40, 2, 2, 5],  # MBConv6, 5x5 kernel
    [6,  80, 3, 2, 3],  # MBConv6, 3x3 kernel
    [6, 112, 3, 1, 5],  # MBConv6, 5x5 kernel
    [6, 192, 4, 2, 5],  # MBConv6, 5x5 kernel
    [6, 320, 1, 1, 3],  # MBConv6, 3x3 kernel
]

# Compound-scaling settings for each EfficientNet variant.
# Every entry is (phi_value, resolution, drop_rate):
#   phi_value  - exponent applied to the alpha (depth) and beta (width) bases
#   resolution - side length of the square input image
#   drop_rate  - dropout rate used in front of the final linear classifier
phi_values = {
    "b0": (0, 224, 0.2),
    "b1": (0.5, 240, 0.2),
    "b2": (1, 260, 0.3),  # B2 takes 260x260 inputs (was mistyped as 280)
    "b3": (2, 300, 0.3),
    "b4": (3, 380, 0.4),
    "b5": (4, 456, 0.4),
    "b6": (5, 528, 0.5),
    "b7": (6, 600, 0.5),
}

Next, we build a `CNNBlock` that exposes a `groups` argument, so the same block can also implement **depth-wise convolution**.

> *Note:* If we don't specify the `groups` parameter, the default `groups = 1` gives a normal convolution layer, where every output channel is computed from all input channels. If `groups = in_channels = out_channels`, the layer performs **depth-wise convolution**, where each channel is convolved with its own filter.

> *Note 2:* We often set `bias = False` when the convolution is followed by **batch normalisation**, since batch norm already has its own learnable `scale` and `shift` parameters, which make a separate bias term redundant.

In [39]:
class CNNBlock(nn.Module):
    """Convolution followed by batch normalisation and a SiLU activation.

    With ``groups=1`` (the default) the convolution is an ordinary one; with
    ``groups=in_channels`` it becomes a depth-wise convolution.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, groups = 1):
        super().__init__()
        # The convolution carries no bias because the batch norm that follows
        # already provides a learnable shift.
        self.cnn = nn.Conv2d(
            in_channels=in_channels,
            out_channels=out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            groups=groups,
            bias=False,
        )
        self.bn = nn.BatchNorm2d(num_features=out_channels)
        self.silu = nn.SiLU()

    def forward(self, x):
        """Apply conv -> batch norm -> SiLU to ``x`` and return the result."""
        convolved = self.cnn(x)
        normalised = self.bn(convolved)
        return self.silu(normalised)


The other special component is `Squeeze-and-Excitation`. Here we build the layer. (More details about Squeeze-and-Excitation are left to a separate notebook.)

In [40]:
class SqueezeExcitation(nn.Module):
    """Squeeze-and-excitation layer with an additional SiLU nonlinearity.

    The input is globally average-pooled, passed through a 1x1 convolution
    down to ``reduced_dim`` channels, a SiLU, a 1x1 convolution back to
    ``in_channels`` and a sigmoid; the result rescales the input per channel.
    """

    def __init__(self, in_channels, reduced_dim):
        super().__init__()
        gate_layers = [
            nn.AdaptiveAvgPool2d(1),                              # squeeze to 1x1
            nn.Conv2d(in_channels, reduced_dim, kernel_size=1),   # reduce channels
            nn.SiLU(),
            nn.Conv2d(reduced_dim, in_channels, kernel_size=1),   # restore channels
            nn.Sigmoid(),                                         # weights in (0, 1)
        ]
        self.se = nn.Sequential(*gate_layers)

    def forward(self, x):
        """Return ``x`` scaled by its per-channel attention weights."""
        channel_weights = self.se(x)
        return x * channel_weights


Finally, we can integrate CNNBlock and SqueezeExcitation to build `InvertedResidualBlock`.

Several things to be aware of:
1. `use_residual`: The residual connection is only used when `in_channels == out_channels` and `stride == 1`.
2. `stochastic_depth`: Stochastic depth is applied only when the residual connection is used.

In [41]:
from math import ceil

class InvertedResidualBlock(nn.Module):
    """MBConv block: optional 1x1 expansion, depth-wise conv, SE, 1x1 projection.

    Args:
        in_channels: number of channels of the block input.
        out_channels: number of channels of the block output.
        kernel_size: kernel size of the depth-wise convolution.
        stride: stride of the depth-wise convolution.
        padding: padding of the depth-wise convolution.
        expand_ratio: factor by which the channels are expanded inside the block.
        reduction: divisor applied to ``in_channels`` to get the bottleneck
            width of the squeeze-and-excitation layer.
        survival_prob: probability of keeping the block output for a sample
            when stochastic depth is active (training + residual connection).
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, expand_ratio, reduction = 4, survival_prob = 0.8):
        super(InvertedResidualBlock, self).__init__()
        self.survival_prob = survival_prob
        # A skip connection is only possible when input and output shapes match.
        self.use_residual = (in_channels == out_channels) and (stride == 1)
        hidden_dim = in_channels * expand_ratio
        # With expand_ratio == 1 there is nothing to expand, so the 1x1 conv is skipped.
        self.expand = in_channels != hidden_dim
        # Never let the SE bottleneck collapse to zero channels for narrow inputs.
        reduced_dim = max(1, int(in_channels / reduction))

        if self.expand:
            # expand the channel
            self.expand_conv = CNNBlock(in_channels, hidden_dim, kernel_size = 1, stride = 1, padding = 0)

        self.conv = nn.Sequential(
            # Depth-wise convolution: one filter per channel.
            CNNBlock(hidden_dim, hidden_dim, kernel_size, stride, padding, groups = hidden_dim),
            SqueezeExcitation(hidden_dim, reduced_dim),
            # Linear 1x1 projection; no bias because batch norm follows directly.
            nn.Conv2d(hidden_dim, out_channels, 1, bias = False),
            nn.BatchNorm2d(out_channels)
        )

    def stochastic_depth(self, x):
        """
        Randomly drop the block output per sample (not per batch).

        In evaluation mode ``x`` is returned unchanged. In training mode each
        sample is kept with probability ``survival_prob``; kept samples are
        divided by ``survival_prob`` and dropped samples become zero.
        """
        if not self.training:
            return x

        # One keep/drop decision per sample, broadcast over the remaining dims.
        binary_tensor = torch.rand(x.shape[0], 1, 1, 1, device = x.device) < self.survival_prob
        return torch.div(x, self.survival_prob) * binary_tensor

    def forward(self, inputs):
        """
        Run the block; stochastic depth is applied only on the residual path.
        """
        x = self.expand_conv(inputs) if self.expand else inputs

        if self.use_residual:
            return self.stochastic_depth(self.conv(x)) + inputs
        else:
            return self.conv(x)
        


## Build `EfficientNet`

Building the net is not very hard; just make sure the input and output channels of consecutive layers match!

Based on the requested version of `EfficientNet`, we build the net with the corresponding scale.

In [42]:
class EfficientNet(nn.Module):
    """EfficientNet classifier obtained by scaling the B0 baseline.

    Args:
        version: key of ``phi_values`` ("b0" ... "b7") selecting the scaling
            exponent and the dropout rate.
        num_classes: number of outputs of the final linear layer.
    """

    def __init__(self, version, num_classes):
        super(EfficientNet, self).__init__()
        # calculate_factors returns (depth_factor, width_factor, drop_rate);
        # unpack in exactly that order so depth and width are not swapped.
        depth_factor, width_factor, dropout_rate = self.calculate_factors(version)
        self.pool = nn.AdaptiveAvgPool2d(1)
        last_channels = ceil(1280 * width_factor)
        self.features = self.create_features(width_factor, depth_factor, last_channels)
        self.classifier = nn.Sequential(
            nn.Dropout(dropout_rate),
            nn.Linear(last_channels, num_classes)
        )

    def calculate_factors(self, version, alpha = 1.2, beta = 1.1):
        """
        Return ``(depth_factor, width_factor, drop_rate)`` for ``version``.

        alpha: base of the depth multiplier (``alpha ** phi``)
        beta: base of the width multiplier (``beta ** phi``)
        The resolution stored in ``phi_values`` is not used here.
        """
        phi, resolution, drop_rate = phi_values[version]
        depth_factor = alpha ** phi
        width_factor = beta ** phi
        return depth_factor, width_factor, drop_rate

    def create_features(self, width_factor, depth_factor, last_channels):
        """
        Build the convolutional trunk: stem conv, scaled MBConv stages, 1x1 head conv.

        width_factor: multiplier applied to every channel count
        depth_factor: multiplier applied to the number of layers per stage
        last_channels: output channels of the final 1x1 convolution
        """
        # Channel size of the first layer is 32 in the paper; the stem is not
        # part of the baseline_model list.
        stem_channels = int(32 * width_factor)
        features = [CNNBlock(3, stem_channels, kernel_size = 3, stride = 2, padding = 1)]
        in_channels = stem_channels

        for expand_ratio, stage_channels, repeats, stride, kernel_size in baseline_model:
            # Round the scaled width up to a multiple of 4.
            out_channels = 4 * ceil(int(stage_channels * width_factor) / 4)
            layers_repeats = ceil(repeats * depth_factor)

            for layer in range(layers_repeats):
                features.append(
                    InvertedResidualBlock(in_channels, out_channels, expand_ratio = expand_ratio,
                                          # Only the first layer of a stage downsamples.
                                          stride = stride if layer == 0 else 1,
                                          kernel_size = kernel_size,
                                          # "same" padding: k = 1 -> 0, k = 3 -> 1, k = 5 -> 2
                                          padding = kernel_size // 2)
                )

                in_channels = out_channels

        features.append(
            CNNBlock(in_channels, last_channels, kernel_size = 1, stride = 1, padding = 0)
        )
        return nn.Sequential(*features)

    def forward(self, x):
        """Return the classifier output for an image batch ``x``."""
        x = self.pool(self.features(x))
        # Flatten the pooled (N, C, 1, 1) map to (N, C) for the linear layer.
        return self.classifier(x.view(x.shape[0], -1))

Finally, we write a test function.


In [43]:
def test():
    """Smoke test: run a random batch through EfficientNet-B0 and print the output shape."""
    device = "cuda" if torch.cuda.is_available() else "cpu"
    version = "b0"
    # Only the input resolution of the chosen version is needed here.
    _, resolution, _ = phi_values[version]
    batch_size, num_classes = 4, 10

    images = torch.randn((batch_size, 3, resolution, resolution)).to(device)
    model = EfficientNet(version=version, num_classes=num_classes).to(device)

    logits = model(images)
    print(logits.shape)  # (batch_size, num_classes)

In [44]:
# Run the smoke test; it prints the shape of the model output.
test()

torch.Size([4, 10])


### SURPRISE! We have finished building EfficientNet!