[**GoogLeNet (Szegedy et al., 2015)**](https://arxiv.org/abs/1409.4842) is an architecture that uses networks with multi-branch convolutions, introducing the Inception module to capture features at multiple scales efficiently.

![](./imgs/googlenet.png)

![](./imgs/inception_v1.png)

GoogLeNet was designed to improve accuracy while keeping computational cost manageable. It won the ILSVRC-2014 competition with state-of-the-art performance, surpassing AlexNet and VGG by using the Inception module, which allows the network to extract multi-scale features efficiently.

Unlike traditional CNNs, GoogLeNet processes different receptive fields (1×1, 3×3, 5×5 convolutions, and pooling layers) in parallel within each Inception module, significantly improving feature extraction. The architecture is 22 layers deep (excluding pooling layers) but remains computationally efficient due to 1×1 convolutions, which reduce the number of parameters by acting as "bottleneck" layers. Instead of fully connected layers, GoogLeNet uses Global Average Pooling (GAP) to minimize overfitting and reduce computational cost.

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import utils

In [2]:
class Inception(nn.Module):
    def __init__(self, in_channels, ch1x1, ch3x3red, ch3x3, ch5x5red, ch5x5, pool_proj):
        """ Inception module with four parallel branches. """
        super().__init__()

        # Branch 1: 1x1 conv
        self.branch1 = nn.Sequential(
            nn.Conv2d(in_channels, ch1x1, kernel_size=1),
            nn.ReLU(inplace=True)
        )

        # Branch 2: 1x1 conv -> 3x3 conv
        self.branch2 = nn.Sequential(
            nn.Conv2d(in_channels, ch3x3red, kernel_size=1), nn.ReLU(inplace=True),
            nn.Conv2d(ch3x3red, ch3x3, kernel_size=3, padding=1), nn.ReLU(inplace=True)
        )

        # Branch 3: 1x1 conv -> 5x5 conv
        self.branch3 = nn.Sequential(
            nn.Conv2d(in_channels, ch5x5red, kernel_size=1), nn.ReLU(inplace=True),
            nn.Conv2d(ch5x5red, ch5x5, kernel_size=5, padding=2), nn.ReLU(inplace=True)
        )

        # Branch 4: 3x3 max pool -> 1x1 conv
        self.branch4 = nn.Sequential(
            nn.MaxPool2d(kernel_size=3, stride=1, padding=1),
            nn.Conv2d(in_channels, pool_proj, kernel_size=1), nn.ReLU(inplace=True)
        )

    def forward(self, x):
        outputs = [self.branch1(x), self.branch2(x), self.branch3(x), self.branch4(x)]
        return torch.cat(outputs, 1)

In [3]:
class AuxiliaryClassifier(nn.Module):
    def __init__(self, in_channels, num_classes):
        """ Auxiliary classifier used during training to combat vanishing gradients. """
        super().__init__()

        self.avgpool = nn.AvgPool2d(kernel_size=5, stride=3)
        self.conv = nn.Conv2d(in_channels, 128, kernel_size=1)
        self.fc1 = nn.Linear(128 * 4 * 4, 1024)  # Assumes 14x14 input size after pooling
        self.fc2 = nn.Linear(1024, num_classes)
        self.dropout = nn.Dropout(p=0.7)

    def forward(self, x):
        x = self.avgpool(x)
        x = F.relu(self.conv(x), inplace=True)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x), inplace=True)
        x = self.dropout(x)
        x = self.fc2(x)
        return x

In [4]:
class GoogLeNet(nn.Module):
    def __init__(self, num_classes=10):
        """ GoogLeNet architecture with 9 Inception modules and 2 auxiliary classifiers. """
        super().__init__()

        # Stem
        self.stem = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3), nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
            nn.Conv2d(64, 64, kernel_size=1), nn.ReLU(True),
            nn.Conv2d(64, 192, kernel_size=3, padding=1), nn.ReLU(True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )

        # Inception modules (parameters from the original paper)
        self.inception3a = Inception(192, 64, 96, 128, 16, 32, 32)          # Output: 256
        self.inception3b = Inception(256, 128, 128, 192, 32, 96, 64)        # Output: 480
        self.maxpool1 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        self.inception4a = Inception(480, 192, 96, 208, 16, 48, 64)         # Output: 512
        self.inception4b = Inception(512, 160, 112, 224, 24, 64, 64)        # Output: 512
        self.inception4c = Inception(512, 128, 128, 256, 24, 64, 64)        # Output: 512
        self.inception4d = Inception(512, 112, 144, 288, 32, 64, 64)        # Output: 528
        self.inception4e = Inception(528, 256, 160, 320, 32, 128, 128)      # Output: 832
        self.maxpool2 = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        
        self.inception5a = Inception(832, 256, 160, 320, 32, 128, 128)      # Output: 832
        self.inception5b = Inception(832, 384, 192, 384, 48, 128, 128)      # Output: 1024
        
        # Auxiliary classifiers
        self.aux1 = AuxiliaryClassifier(512, num_classes)  # After inception4a
        self.aux2 = AuxiliaryClassifier(528, num_classes)  # After inception4d
        
        # Main classifier
        self.avgpool = nn.AvgPool2d(kernel_size=7, stride=1)
        self.dropout = nn.Dropout(p=0.4)
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x, training=False):
        # Initial layers
        x = self.stem(x)
        
        # Inception modules
        x = self.inception3a(x)
        x = self.inception3b(x)
        x = self.maxpool1(x)

        x = self.inception4a(x)
        if training:
            aux1 = self.aux1(x)

        x = self.inception4b(x)
        x = self.inception4c(x)
        x = self.inception4d(x)
        if training:
            aux2 = self.aux2(x)

        x = self.inception4e(x)
        x = self.maxpool2(x)

        x = self.inception5a(x)
        x = self.inception5b(x)
        
        # Main classifier
        x = self.avgpool(x)
        x = x.view(x.size(0), -1)  # Flatten
        x = self.dropout(x)
        x = self.fc(x)
        
        if training:
            return x, aux1, aux2
        return x

In [5]:
def train_step(
    train_loader: torch.utils.data.DataLoader,
    model: torch.nn.Module,
    criterion: torch.nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device
):
    """
    Performs one full training pass (epoch) over the training dataset for GoogLeNet.

    Args:
        train_loader (DataLoader): DataLoader for the training dataset.
        model (nn.Module): GoogLeNet model with auxiliary classifiers.
        criterion (nn.Module): Loss function (e.g., CrossEntropyLoss).
        optimizer (Optimizer): Optimizer for updating model weights.
        device (torch.device): Device to perform computations on.

    Returns:
        tuple: Average training loss and accuracy (based on main output).
    """
    model.train()
    running_loss, running_corrects = 0.0, 0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        
        # Forward pass: GoogLeNet returns (main_output, aux1_output, aux2_output) during training
        main_output, aux1_output, aux2_output = model(inputs, training=True)
        
        # Compute losses for main and auxiliary outputs
        loss_main = criterion(main_output, labels)
        loss_aux1 = criterion(aux1_output, labels)
        loss_aux2 = criterion(aux2_output, labels)
        
        # Combine losses with weights (as per original GoogLeNet paper: 0.3 for auxiliaries)
        loss = loss_main + 0.3 * loss_aux1 + 0.3 * loss_aux2
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        # Track loss and accuracy (using main output only for accuracy)
        running_loss += loss.item()
        running_corrects += (main_output.argmax(dim=1) == labels).sum().item()

    avg_loss = running_loss / len(train_loader)
    avg_accuracy = running_corrects / len(train_loader.dataset)
    return avg_loss, avg_accuracy

In [6]:
data = utils.CIFAR10DataLoader(batch_size=64, resize=(224, 224))
train_loader = data.get_train_loader()
test_loader = data.get_test_loader()

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GoogLeNet(num_classes=10)
model.apply(utils.init_kaiming).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

epochs = 10
for epoch in range(epochs):
    train_loss, train_acc = utils.train_step(train_loader, model, criterion, optimizer, device)
    test_loss, test_acc = utils.eval_step(test_loader, model, criterion, device)
    print(f"Epoch {epoch + 1:>{len(str(epochs))}}/{epochs} | "
          f"Train Loss: {train_loss:.4f} | "
          f"Test Loss: {test_loss:.4f} | "
          f"Test Acc: {test_acc:.4f}")

Epoch  1/10 | Train Loss: 1.9161 | Test Loss: 1.7627 | Test Acc: 0.3655
Epoch  2/10 | Train Loss: 1.4997 | Test Loss: 1.4119 | Test Acc: 0.4806
Epoch  3/10 | Train Loss: 1.2306 | Test Loss: 1.2342 | Test Acc: 0.5614
Epoch  4/10 | Train Loss: 1.0280 | Test Loss: 0.9727 | Test Acc: 0.6571
Epoch  5/10 | Train Loss: 0.8826 | Test Loss: 0.8258 | Test Acc: 0.7087
Epoch  6/10 | Train Loss: 0.7561 | Test Loss: 0.7240 | Test Acc: 0.7487
Epoch  7/10 | Train Loss: 0.6687 | Test Loss: 0.6983 | Test Acc: 0.7594
Epoch  8/10 | Train Loss: 0.5904 | Test Loss: 0.7302 | Test Acc: 0.7471
Epoch  9/10 | Train Loss: 0.5287 | Test Loss: 0.5844 | Test Acc: 0.7988
Epoch 10/10 | Train Loss: 0.4736 | Test Loss: 0.5735 | Test Acc: 0.7996
