### Section 4.2.3 Ensemble Learning

https://arxiv.org/pdf/1409.4842 - applying the input-data transformations proposed in the given paper and following the methodology for training all models in the ensemble

### Imports


In [None]:
import os
import math
import time
from typing import Tuple

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast

import torchvision
from torchvision import transforms

import matplotlib.pyplot as plt
import numpy as np

# Fix the PyTorch and NumPy random seeds (both to 42) for reproducibility
torch.manual_seed(42)
np.random.seed(42)

# Select the compute device: CUDA when it is available, otherwise the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)


Using device: cuda


### Custom Implementation of Batch Normalization

In [None]:
class CustomBatchNorm2d(nn.Module):
    """Hand-written batch normalization for 4-D ``(N, C, H, W)`` input.

    In training mode each channel is normalized with the mean and (biased)
    variance of the current batch, and exponential running averages of both
    are maintained. In eval mode the stored running averages are used instead.

    Args:
        num_features: number of channels ``C`` of the expected input.
        eps: constant added to the variance before the square root.
        momentum: weight of the current batch statistic in the running
            average (``running = (1 - momentum) * running + momentum * batch``).
    """

    def __init__(self, num_features, eps=1e-5, momentum=0.1):
        super().__init__()
        self.eps = eps
        self.momentum = momentum

        # Learnable per-channel scale and shift.
        self.gamma = nn.Parameter(torch.ones(num_features))
        self.beta = nn.Parameter(torch.zeros(num_features))

        # Running statistics are buffers: saved in the state_dict and moved
        # with the module, but not trained by the optimizer.
        self.register_buffer("running_mean", torch.zeros(num_features))
        self.register_buffer("running_var", torch.ones(num_features))

    def forward(self, x):
        """Normalize ``x`` per channel and apply the learned scale and shift."""
        if self.training:
            # Batch statistics over the batch and both spatial dimensions.
            mean = x.mean(dim=(0, 2, 3))
            var = x.var(dim=(0, 2, 3), unbiased=False)

            # Update the buffers in place and outside autograd. Rebinding them
            # to an expression of `mean`/`var` would attach the autograd graph
            # to the buffers and chain every step onto the previous one.
            with torch.no_grad():
                self.running_mean.mul_(1 - self.momentum).add_(mean, alpha=self.momentum)
                self.running_var.mul_(1 - self.momentum).add_(var, alpha=self.momentum)
        else:
            mean = self.running_mean
            var = self.running_var

        # Broadcast the per-channel statistics over (N, C, H, W).
        x_hat = (x - mean[None, :, None, None]) / torch.sqrt(var[None, :, None, None] + self.eps)
        out = self.gamma[None, :, None, None] * x_hat + self.beta[None, :, None, None]
        return out


### Defining Inception Block (as proposed in research paper)

Please note that we use only 2 inception blocks due to limited compute resources.

In [None]:
class ConvBNAct(nn.Module):
    """A 2-D convolution, an optional batch-norm layer, then an activation.

    Args:
        in_c: input channels of the convolution.
        out_c: output channels of the convolution.
        k: kernel size.
        s: stride.
        p: padding.
        use_bn: insert a ``CustomBatchNorm2d`` between conv and activation.
        activation_fn: zero-argument callable (e.g. a module class) that
            builds the activation layer.
    """

    def __init__(self, in_c, out_c, k, s=1, p=0, use_bn=False, activation_fn=nn.ReLU):
        super().__init__()
        # The conv carries a bias only when no batch norm follows it.
        conv = nn.Conv2d(in_c, out_c, k, s, p, bias=not use_bn)
        norm = [CustomBatchNorm2d(out_c)] if use_bn else []
        # Order inside the Sequential: conv -> (norm) -> activation.
        self.net = nn.Sequential(conv, *norm, activation_fn())

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        return self.net(x)


class InceptionBlock(nn.Module):
    """Four parallel branches over the same input, concatenated on channels.

    The branches produce 8, 16, 16 and 8 channels, so the block always
    outputs 48 channels regardless of ``in_c``.

    Args:
        in_c: number of input channels.
        use_bn: forwarded to every ``ConvBNAct`` in the block.
        activation_fn: forwarded to every ``ConvBNAct`` in the block.
    """

    def __init__(self, in_c, use_bn=False, activation_fn=nn.ReLU):
        super().__init__()

        def conv(cin, cout, k, p=0):
            # Every layer in the block shares the same BN / activation choice.
            return ConvBNAct(cin, cout, k, p=p, use_bn=use_bn, activation_fn=activation_fn)

        # Branch 1: a single 1x1 conv (widths are shrunk throughout).
        self.b1 = conv(in_c, 8, 1)

        # Branch 2: 1x1 reduction followed by one 3x3 conv.
        self.b2 = nn.Sequential(conv(in_c, 8, 1), conv(8, 16, 3, p=1))

        # Branch 3: 1x1 reduction followed by two stacked 3x3 convs.
        self.b3 = nn.Sequential(
            conv(in_c, 8, 1),
            conv(8, 16, 3, p=1),
            conv(16, 16, 3, p=1),
        )

        # Branch 4: 3x3 max-pool (stride 1, padding 1) followed by a 1x1 conv.
        self.b4 = nn.Sequential(nn.MaxPool2d(3, stride=1, padding=1), conv(in_c, 8, 1))

    def forward(self, x):
        """Apply every branch to ``x`` and concatenate along dim 1."""
        branches = (self.b1, self.b2, self.b3, self.b4)
        return torch.cat([branch(x) for branch in branches], dim=1)

### TinyInception (Model definition for training)

In [None]:
class TinyInception(nn.Module):
    """Small Inception-style classifier: stem conv, two inception blocks, linear head.

    Args:
        num_classes: size of the output layer.
        use_bn: enable batch norm inside the stem and both inception blocks.
        dropout_rate: dropout probability applied to the pooled features.
        increase_init: re-initialise every ``nn.Conv2d`` weight from
            ``N(0, 0.2^2)`` after construction.
        final_bn: apply ``nn.BatchNorm1d`` to the pooled features before the
            final linear layer (otherwise an identity).
        activation_fn: activation constructor forwarded to the conv layers.
    """

    def __init__(self, num_classes=10,
                 use_bn=True,
                 dropout_rate=0.05,
                 increase_init=False,
                 final_bn=False,
                 activation_fn=nn.ReLU):
        super().__init__()

        shared = dict(use_bn=use_bn, activation_fn=activation_fn)

        self.stem = ConvBNAct(3, 16, 3, p=1, **shared)
        self.inc1 = InceptionBlock(16, **shared)
        self.inc2 = InceptionBlock(48, **shared)
        self.pool = nn.AdaptiveAvgPool2d(1)

        # Non-convolutional dropout, applied to the pooled feature vector.
        self.drop = nn.Dropout(dropout_rate)

        # Optional batch norm on the last hidden (48-dim) features.
        if final_bn:
            self.final_bn = nn.BatchNorm1d(48)
        else:
            self.final_bn = nn.Identity()

        self.fc = nn.Linear(48, num_classes)

        # Optionally start from larger convolution weights.
        if increase_init:
            self._increase_initial_weights()

    def _increase_initial_weights(self):
        """Redraw every Conv2d weight from a normal with mean 0 and std 0.2."""
        conv_layers = (m for m in self.modules() if isinstance(m, nn.Conv2d))
        for conv in conv_layers:
            nn.init.normal_(conv.weight, mean=0, std=0.2)  # larger variance

    def forward(self, x):
        """Return class logits for a batch of images."""
        features = self.inc2(self.inc1(self.stem(x)))
        pooled = self.pool(features)
        flat = pooled.view(pooled.size(0), -1)

        # Dropout first, then the optional final batch norm, then the head.
        return self.fc(self.final_bn(self.drop(flat)))

### Loading Dataset

#### Transformation on input data

In [None]:
import torchvision.transforms as T

# Training-time augmentation: random crop plus photometric distortion
def cifar10_inception_style_transform():
    """Build the augmentation pipeline applied to training images.

    Returns:
        A ``T.Compose`` that, in order, random-crops to 32x32 with 4 pixels of
        padding, flips horizontally at random, applies a small colour jitter,
        converts to a tensor and normalises each channel with mean 0.5 and
        std 0.5.
    """
    augmentations = [
        # random crop
        T.RandomCrop(32, padding=4),
        # random horizontal flip
        T.RandomHorizontalFlip(),
        # photometric distortions, deliberately small
        T.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.05),
    ]
    to_normalized_tensor = [
        T.ToTensor(),
        T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ]
    return T.Compose(augmentations + to_normalized_tensor)


In [None]:
def load_cifar_10(batch_size=64, num_workers=4, root='./data'):
    """Build the CIFAR-10 training and test DataLoaders.

    The training split uses the augmenting transform returned by
    ``cifar10_inception_style_transform``. The test split is only converted to
    a tensor and normalised with per-channel mean 0.5 and std 0.5. Both
    datasets are requested with ``download=True``.

    Args:
        batch_size: batch size used by both loaders.
        num_workers: number of DataLoader worker processes for both loaders.
        root: directory the dataset is stored in.

    Returns:
        ``(trainloader, testloader)``. The training loader shuffles; the test
        loader keeps a fixed order.
    """
    train_transform = cifar10_inception_style_transform()

    # No augmentation at evaluation time, only tensor conversion + normalise.
    test_transform = T.Compose([
        T.ToTensor(),
        T.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=root, train=True, download=True,
        transform=train_transform
    )

    trainloader = torch.utils.data.DataLoader(
        trainset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    testset = torchvision.datasets.CIFAR10(
        root=root, train=False, download=True,
        transform=test_transform
    )

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=batch_size, shuffle=False, num_workers=num_workers
    )

    return trainloader, testloader


### Model Training function


In [None]:
def train_model(model, trainloader, valloader, epochs=30, lr=0.045,
                resume=True):
    """Train ``model`` with SGD and report accuracy after every epoch.

    Args:
        model: network to optimise; moved to CUDA when available, else CPU.
        trainloader: iterable of ``(inputs, labels)`` training batches.
        valloader: iterable of ``(inputs, labels)`` batches scored after
            each epoch.
        epochs: number of passes over ``trainloader``.
        lr: SGD learning rate (momentum is fixed at 0.9).
        resume: accepted for the caller's convenience but never read here.

    Returns:
        A tuple ``(train_acc, val_acc, epochs)`` of three lists: per-epoch
        training accuracy (measured on the batches as they are trained on),
        per-epoch validation accuracy, and the zero-based epoch indices.
    """
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    loss_fn = nn.CrossEntropyLoss()

    def count_hits(logits, targets):
        # Number of samples whose highest-scoring class equals the label.
        return (logits.max(1)[1] == targets).sum().item()

    train_history, val_history, epoch_ids = [], [], []

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        n_right, n_seen = 0, 0

        for inputs, labels in trainloader:
            inputs = inputs.to(device)
            labels = labels.to(device)

            optimizer.zero_grad()
            logits = model(inputs)
            batch_loss = loss_fn(logits, labels)
            batch_loss.backward()
            optimizer.step()

            n_right += count_hits(logits, labels)
            n_seen += labels.size(0)

        train_acc = n_right / n_seen
        train_history.append(train_acc)
        epoch_ids.append(epoch)

        # ---- validation pass (no gradients, eval-mode layers) ----
        model.eval()
        n_right, n_seen = 0, 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs = inputs.to(device)
                labels = labels.to(device)
                n_right += count_hits(model(inputs), labels)
                n_seen += labels.size(0)

        val_acc = n_right / n_seen
        val_history.append(val_acc)

        print(f"Epoch {epoch+1}: Train Acc={train_acc:.2f} | Val Acc={val_acc:.2f}")

    return train_history, val_history, epoch_ids

## Training

In [None]:
trainloader, valloader = load_cifar_10()

100%|██████████| 170M/170M [00:03<00:00, 48.0MB/s]


## Define models for ensemble learning

Done as per the following description: each model was based on BN-x30, modified via some of the following: increased initial weights in the convolutional layers; using Dropout (with a Dropout probability of 5% or 10%, vs. 40%
for the original Inception); and using non-convolutional, per-activation Batch Normalization with the last hidden layers of the model.

In [None]:
training_results = []

In [None]:
def build_ensemble_models():
    """Instantiate the six ``TinyInception`` variants that form the ensemble.

    Returns:
        A list of dicts, one per member, each with the freshly constructed
        network under ``"model"`` and the settings that distinguish it under
        ``"config"``.
    """
    # Each row is (dropout_rate, increase_init, final_bn).
    variants = [
        # plain models, dropout 5% / 10%
        (0.05, False, False),
        (0.10, False, False),
        # increased convolution weight initialisation
        (0.05, True, False),
        (0.10, True, False),
        # batch norm on the features feeding the final linear layer
        (0.05, False, True),
        (0.10, False, True),
    ]

    ensemble = []
    for dropout_rate, increase_init, final_bn in variants:
        cfg = {
            "dropout_rate": dropout_rate,
            "increase_init": increase_init,
            "final_bn": final_bn,
        }
        network = TinyInception(
            num_classes=10,
            use_bn=True,
            dropout_rate=cfg["dropout_rate"],
            increase_init=cfg["increase_init"],
            final_bn=cfg["final_bn"],
        )
        ensemble.append({"model": network, "config": cfg})

    return ensemble


In [None]:
def ensemble_predict(models, x):
    """Average the class probabilities predicted by every ensemble member.

    Args:
        models: list of dicts whose ``"model"`` entry is a callable network.
        x: input batch passed unchanged to every member.

    Returns:
        The arithmetic mean, over members, of ``softmax(logits, dim=1)``.
    """
    with torch.no_grad():
        member_probs = [entry["model"](x).softmax(dim=1) for entry in models]

    # arithmetic average of the per-member probabilities (as per the paper)
    return torch.stack(member_probs).mean(dim=0)


## Training each model independently now

In [None]:
models = build_ensemble_models()

In [None]:
models[0]["config"]

{'dropout_rate': 0.05, 'increase_init': False, 'final_bn': False}

#### Model - 0

In [None]:
# Train ensemble member 0 for 30 epochs at lr=0.045; keeps the per-epoch
# train/val accuracy lists and the epoch indices. `valloader` is the second
# loader returned by load_cifar_10(), i.e. the CIFAR-10 test split.
acc_train_0, acc_val_0, ep = train_model(models[0]["model"], trainloader, valloader,epochs=30,lr=0.045)

Epoch 1: Train Acc=0.36 | Val Acc=0.44
Epoch 2: Train Acc=0.50 | Val Acc=0.52
Epoch 3: Train Acc=0.55 | Val Acc=0.49
Epoch 4: Train Acc=0.58 | Val Acc=0.57
Epoch 5: Train Acc=0.59 | Val Acc=0.53
Epoch 6: Train Acc=0.60 | Val Acc=0.53
Epoch 7: Train Acc=0.61 | Val Acc=0.60
Epoch 8: Train Acc=0.62 | Val Acc=0.62
Epoch 9: Train Acc=0.63 | Val Acc=0.62
Epoch 10: Train Acc=0.64 | Val Acc=0.65
Epoch 11: Train Acc=0.65 | Val Acc=0.67
Epoch 12: Train Acc=0.65 | Val Acc=0.66
Epoch 13: Train Acc=0.65 | Val Acc=0.68
Epoch 14: Train Acc=0.65 | Val Acc=0.67
Epoch 15: Train Acc=0.66 | Val Acc=0.67
Epoch 16: Train Acc=0.66 | Val Acc=0.65
Epoch 17: Train Acc=0.66 | Val Acc=0.66
Epoch 18: Train Acc=0.67 | Val Acc=0.68
Epoch 19: Train Acc=0.67 | Val Acc=0.69
Epoch 20: Train Acc=0.67 | Val Acc=0.68
Epoch 21: Train Acc=0.68 | Val Acc=0.69
Epoch 22: Train Acc=0.68 | Val Acc=0.63
Epoch 23: Train Acc=0.68 | Val Acc=0.65
Epoch 24: Train Acc=0.68 | Val Acc=0.67
Epoch 25: Train Acc=0.69 | Val Acc=0.69
Epoch 26:

In [None]:
# Persist member 0's weights (state_dict only) to model_0.pth
torch.save(models[0]["model"].state_dict(), 'model_0.pth')
print("Model 'model_0.pth' saved successfully.")

Model 'model_0.pth' saved successfully.


In [None]:
# Record member 0's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[0]["config"],
    "train_acc": acc_train_0,
    "val_acc": acc_val_0,
    "epochs": ep,
})
print(f"Training result information appended: {training_results}")

Training result information appended: [{'model_config': {'dropout_rate': 0.05, 'increase_init': False, 'final_bn': False}, 'train_acc': [0.3596, 0.5016, 0.5503, 0.57606, 0.59178, 0.60388, 0.61448, 0.6223, 0.63086, 0.6366, 0.64596, 0.64786, 0.65202, 0.65424, 0.6608, 0.66348, 0.6646, 0.6714, 0.67212, 0.67368, 0.67572, 0.67894, 0.67914, 0.68196, 0.68674, 0.6867, 0.6878, 0.68814, 0.69244, 0.69424], 'val_acc': [0.4429, 0.5208, 0.4855, 0.5656, 0.5316, 0.5321, 0.6009, 0.6216, 0.6216, 0.6451, 0.6653, 0.6558, 0.6771, 0.6733, 0.6666, 0.6503, 0.6616, 0.6816, 0.6938, 0.6825, 0.6925, 0.6338, 0.6547, 0.6718, 0.6946, 0.683, 0.7078, 0.6954, 0.712, 0.693], 'epochs': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}]


In [None]:
# Plot member 0's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_0.png before showing it.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in ep], acc_val_0, label="Dropout 5% - LR:30x")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_0.png")
plt.show()


#### Model - 1



In [None]:
models[1]["config"]

{'dropout_rate': 0.1, 'increase_init': False, 'final_bn': False}

In [None]:
trainloader, valloader = load_cifar_10()

In [None]:
# Train ensemble member 1 for 30 epochs at lr=0.045; keeps the per-epoch
# train/val accuracy lists and the epoch indices.
acc_train_1,acc_val_1,eps = train_model(models[1]["model"], trainloader, valloader,epochs=30,lr=0.045)

Epoch 1: Train Acc=0.35 | Val Acc=0.43
Epoch 2: Train Acc=0.48 | Val Acc=0.47
Epoch 3: Train Acc=0.53 | Val Acc=0.48
Epoch 4: Train Acc=0.56 | Val Acc=0.57
Epoch 5: Train Acc=0.58 | Val Acc=0.56
Epoch 6: Train Acc=0.59 | Val Acc=0.55
Epoch 7: Train Acc=0.60 | Val Acc=0.63
Epoch 8: Train Acc=0.61 | Val Acc=0.60
Epoch 9: Train Acc=0.62 | Val Acc=0.65
Epoch 10: Train Acc=0.63 | Val Acc=0.65
Epoch 11: Train Acc=0.63 | Val Acc=0.66
Epoch 12: Train Acc=0.63 | Val Acc=0.66
Epoch 13: Train Acc=0.64 | Val Acc=0.65
Epoch 14: Train Acc=0.64 | Val Acc=0.66
Epoch 15: Train Acc=0.64 | Val Acc=0.65
Epoch 16: Train Acc=0.65 | Val Acc=0.63
Epoch 17: Train Acc=0.65 | Val Acc=0.67
Epoch 18: Train Acc=0.66 | Val Acc=0.66
Epoch 19: Train Acc=0.66 | Val Acc=0.68
Epoch 20: Train Acc=0.66 | Val Acc=0.67
Epoch 21: Train Acc=0.66 | Val Acc=0.69
Epoch 22: Train Acc=0.66 | Val Acc=0.68
Epoch 23: Train Acc=0.66 | Val Acc=0.69
Epoch 24: Train Acc=0.67 | Val Acc=0.67
Epoch 25: Train Acc=0.67 | Val Acc=0.68
Epoch 26:

In [None]:
# Persist member 1's weights (state_dict only) to model_1.pth
torch.save(models[1]["model"].state_dict(), 'model_1.pth')
print("Model 'model_1.pth' saved successfully.")

Model 'model_1.pth' saved successfully.


In [None]:
# Record member 1's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[1]["config"],
    "train_acc": acc_train_1,
    "val_acc": acc_val_1,
    "epochs": eps,
})
print(f"Training result information appended: {training_results}")

Training result information appended: [{'model_config': {'dropout_rate': 0.1, 'increase_init': False, 'final_bn': False}, 'train_acc': [0.34944, 0.47666, 0.5278, 0.5554, 0.57642, 0.58994, 0.60132, 0.60948, 0.61838, 0.6265, 0.62814, 0.63212, 0.63944, 0.64028, 0.64384, 0.64826, 0.6484, 0.65816, 0.65702, 0.65736, 0.66018, 0.66204, 0.66082, 0.66532, 0.66706, 0.66936, 0.6692, 0.67186, 0.6719, 0.67488], 'val_acc': [0.4292, 0.4748, 0.4783, 0.5671, 0.565, 0.5474, 0.6268, 0.5981, 0.6518, 0.651, 0.6575, 0.6648, 0.6517, 0.661, 0.6482, 0.6267, 0.6677, 0.6631, 0.6836, 0.6745, 0.6882, 0.6845, 0.6944, 0.672, 0.6848, 0.6229, 0.6928, 0.6926, 0.6885, 0.677], 'epochs': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29]}]


In [None]:
# Plot member 1's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_10.png before showing it.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in eps], acc_val_1, label="Dropout 10% - LR:30x")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_10.png")
plt.show()

#### Model 2


In [None]:
models[2]["config"]

{'dropout_rate': 0.05, 'increase_init': True, 'final_bn': False}

In [None]:
trainloader, valloader = load_cifar_10()

In [None]:
# Train ensemble member 2 (the increased-init variant with 5% dropout) for
# 30 epochs at lr=0.045; keeps the accuracy lists and epoch indices.
acc_train_2,acc_val_2,eps = train_model(models[2]["model"], trainloader, valloader,epochs=30,lr=0.045)

In [None]:
# Persist member 2's weights (state_dict only) to model_2.pth
torch.save(models[2]["model"].state_dict(), 'model_2.pth')
print("Model 'model_2.pth' saved successfully.")

In [None]:
# Record member 2's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[2]["config"],
    "train_acc": acc_train_2,
    "val_acc": acc_val_2,
    "epochs": eps,
})
print(f"Training result information appended: {training_results}")

In [None]:
# Plot member 2's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_5_increase_init.png.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in eps], acc_val_2, label="Dropout 5% - LR:30x - Increase Init")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_5_increase_init.png")
plt.show()

#### Model 3


In [None]:
models[3]["config"]

In [None]:
trainloader, valloader = load_cifar_10()

In [None]:
# Train ensemble member 3 (the increased-init variant with 10% dropout) for
# 30 epochs at lr=0.045; keeps the accuracy lists and epoch indices.
acc_train_3,acc_val_3,eps = train_model(models[3]["model"], trainloader, valloader,epochs=30,lr=0.045)

In [None]:
# Persist member 3's weights (state_dict only) to model_3.pth
torch.save(models[3]["model"].state_dict(), 'model_3.pth')
print("Model 'model_3.pth' saved successfully.")

In [None]:
# Record member 3's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[3]["config"],
    "train_acc": acc_train_3,
    "val_acc": acc_val_3,
    "epochs": eps,
})
print(f"Training result information appended: {training_results}")

In [None]:
# Plot member 3's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_10_increase_init.png.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in eps], acc_val_3, label="Dropout 10% - LR:30x - Increase Init")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_10_increase_init.png")
plt.show()

#### Model 4

In [None]:
models[4]["config"]

In [None]:
trainloader, valloader = load_cifar_10()

In [None]:
# Train ensemble member 4 (the final-BN variant with 5% dropout) for
# 30 epochs at lr=0.045; keeps the accuracy lists and epoch indices.
acc_train_4,acc_val_4,eps = train_model(models[4]["model"], trainloader, valloader,epochs=30,lr=0.045)

In [None]:
# Persist member 4's weights (state_dict only) to model_4.pth
torch.save(models[4]["model"].state_dict(), 'model_4.pth')
print("Model 'model_4.pth' saved successfully.")

In [None]:
# Record member 4's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[4]["config"],
    "train_acc": acc_train_4,
    "val_acc": acc_val_4,
    "epochs": eps,
})
print(f"Training result information appended: {training_results}")

In [None]:
# Plot member 4's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_5_final_bn.png.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in eps], acc_val_4, label="Dropout 5% - LR:30x - Final BN")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_5_final_bn.png")
plt.show()

#### Model 5

In [None]:
models[5]["config"]

In [None]:
trainloader, valloader = load_cifar_10()

In [None]:
# Train ensemble member 5 (the final-BN variant with 10% dropout) for
# 30 epochs at lr=0.045; keeps the accuracy lists and epoch indices.
acc_train_5,acc_val_5,eps = train_model(models[5]["model"], trainloader, valloader,epochs=30,lr=0.045)

In [None]:
# Persist member 5's weights (state_dict only) to model_5.pth
torch.save(models[5]["model"].state_dict(), 'model_5.pth')
print("Model 'model_5.pth' saved successfully.")


In [None]:
# Record member 5's config and accuracy curves in the shared
# training_results list, then echo the whole list.
training_results.append({
    "model_config": models[5]["config"],
    "train_acc": acc_train_5,
    "val_acc": acc_val_5,
    "epochs": eps,
})
print(f"Training result information appended: {training_results}")

In [None]:
# Plot member 5's validation accuracy per epoch (epoch indices shifted to
# start at 1) and save the figure to model_dropout_10_final_bn.png.
plt.figure(figsize=(10,5))
plt.plot([i+1 for i in eps], acc_val_5, label="Dropout 10% - LR:30x - Final BN")
plt.title("Validation Accuracy vs Epoch (CIFAR-10)")
plt.xlabel("Epoch")
plt.ylabel("Accuracy")
plt.legend()
plt.grid()
plt.savefig("model_dropout_10_final_bn.png")
plt.show()

### Saving results

In [None]:
import json

# Prepare the training results for JSON serialization. Only plain lists and
# dicts are copied over, so no model objects end up in the file.
serializable_results = []
for result in training_results:
    serializable_result = {
        "train_acc": result["train_acc"],
        "val_acc": result["val_acc"],
        "epochs": result["epochs"],
        # Entries appended in this notebook carry "model_config"; reading a
        # "description" key unconditionally raised KeyError for all of them.
        "model_config": result.get("model_config"),
    }
    # Keep a free-text description only for entries that actually have one.
    if "description" in result:
        serializable_result["description"] = result["description"]
    serializable_results.append(serializable_result)

# Save the serializable results to a JSON file
with open('training_results.json', 'w') as f:
    json.dump(serializable_results, f, indent=4)

print("Training results saved to 'training_results.json' successfully.")

### Predicting results using the ensemble of models trained

In [None]:
def load_ensemble_models(skip_indices=(2,)):
    """Rebuild the ensemble architectures and load their saved weights.

    Member ``i`` is restored from ``model_{i}.pth`` (loaded onto the CPU) and
    put into eval mode. Members listed in ``skip_indices`` are not loaded and
    are left out of the returned list: previously a skipped member was still
    returned with random, train-mode weights and therefore took part in the
    ensemble average.

    Args:
        skip_indices: positions in ``build_ensemble_models()`` to exclude.
            The default keeps the notebook's choice of skipping index 2
            (reason not visible here; presumably no usable checkpoint,
            TODO confirm).

    Returns:
        List of ``{"model": ..., "config": ...}`` dicts for the loaded members.
    """
    candidates = build_ensemble_models()  # recreates architecture with same configs
    skipped = set(skip_indices)

    loaded = []
    for i, item in enumerate(candidates):
        if i in skipped:
            continue
        path = f"model_{i}.pth"
        # NOTE: torch.load unpickles the file; only load checkpoints you trust.
        state_dict = torch.load(path, map_location="cpu")
        item["model"].load_state_dict(state_dict)
        item["model"].eval()
        loaded.append(item)

    return loaded


In [None]:
def ensemble_predict(models, inputs, average_probs=False):
    """Combine the predictions of all ensemble members by simple averaging.

    This definition replaces the earlier ``ensemble_predict`` in this file,
    which averaged softmax probabilities. By default it averages raw logits;
    pass ``average_probs=True`` to average class probabilities instead.

    Args:
        models: list of dicts whose ``"model"`` entry is a callable network.
        inputs: input batch passed unchanged to every member.
        average_probs: when true, apply ``softmax(dim=1)`` to each member's
            output before averaging.

    Returns:
        Tensor with the per-member outputs averaged over the ensemble:
        logits by default, probabilities when ``average_probs`` is true.
    """
    member_outputs = []
    with torch.no_grad():
        for item in models:
            logits = item["model"](inputs)
            if average_probs:
                member_outputs.append(torch.softmax(logits, dim=1))
            else:
                member_outputs.append(logits)

    # arithmetic mean over the ensemble dimension
    return torch.mean(torch.stack(member_outputs), dim=0)


In [None]:
def evaluate_ensemble(models, val_loader, device="cuda"):
    """Compute the ensemble's top-1 accuracy over ``val_loader``.

    Args:
        models: list of dicts whose ``"model"`` entry is a network. The
            networks must already live on ``device``; they are not moved here.
        val_loader: iterable of ``(inputs, labels)`` batches.
        device: device each batch is moved to before prediction.

    Returns:
        Fraction of samples whose arg-max ensemble prediction equals the label.
    """
    # Score every member in inference mode: a model left in train mode would
    # run with dropout active and batch statistics in its batch-norm layers.
    for item in models:
        item["model"].eval()

    correct = 0
    total = 0

    for inputs, labels in val_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        avg_logits = ensemble_predict(models, inputs)
        preds = avg_logits.argmax(dim=1)

        correct += (preds == labels).sum().item()
        total += labels.size(0)

    return correct / total


In [None]:
# Pick CUDA when available, otherwise the CPU (rebinds the global `device`
# to a plain string)
device = "cuda" if torch.cuda.is_available() else "cpu"

# load trained ensemble models (weights are loaded onto the CPU first)
models = load_ensemble_models()

# move every ensemble member to the selected device
for item in models:
    item["model"].to(device)

# compute accuracy using simple averaging; `valloader` is the second loader
# returned by load_cifar_10(), i.e. the CIFAR-10 test split
acc = evaluate_ensemble(models, valloader, device)
print(f"Ensemble Validation Accuracy = {acc * 100:.2f}%")


Ensemble Validation Accuracy = 73.64%
