In [1]:
import torch
from torchvision.datasets import MNIST
import torchvision.transforms as transforms

import torch.nn as nn
from torch.optim import Adam
import numpy as np

from torchsummary import summary


In [2]:
# Switch for the optional BatchNorm2d layers; passed to the CNN constructors in later cells.
BATCH_NORM = True

# Use the Apple MPS backend when PyTorch reports it as available, otherwise the CPU.
device_name = 'mps' if torch.backends.mps.is_available() else 'cpu'
device = torch.device(device_name)

# Fix the PyTorch and NumPy random streams to the same seed.
torch.manual_seed(42)
np.random.seed(42)

print(device)

mps


In [3]:
# Preprocessing: convert each image to a tensor, then normalise its single channel
# with mean 0.5 and standard deviation 0.5.
preprocessing_steps = [
    transforms.ToTensor(),
    transforms.Normalize((0.5,), (0.5,)),
]
transform = transforms.Compose(preprocessing_steps)

# Training and test splits share the same root folder and preprocessing;
# the files are downloaded into 'data' when they are not already present.
mnist_options = dict(root='data', transform=transform, download=True)
train = MNIST(train=True, **mnist_options)
test = MNIST(train=False, **mnist_options)

In [4]:
# Both loaders use the same batch size and worker count; only the training data is shuffled.
loader_options = dict(batch_size=128, num_workers=2)
train_loader = torch.utils.data.DataLoader(train, shuffle=True, **loader_options)
test_loader = torch.utils.data.DataLoader(test, shuffle=False, **loader_options)

In [5]:
class CNN1(nn.Module):
    """Baseline MNIST classifier: two convolution stages and a linear decision layer.

    Each stage is ``Conv2d(kernel 5, stride 1, padding 2) -> ReLU -> MaxPool2d(2, stride 2)``,
    optionally followed by ``BatchNorm2d``. The first stage produces 16 channels and the
    second 32. The flattened output of the second stage feeds a 10-way linear layer.

    Args:
        batch_norm: When True, a ``BatchNorm2d`` layer is appended to each stage.
    """

    def __init__(self, batch_norm=True):
        super().__init__()

        # Tensor shapes (batch dimension included) captured on the first forward pass;
        # they stay None until the model has been called once.
        self.input_dim = None
        self.conv1_dim = None
        self.conv2_dim = None

        # This is a pretty common set up for MNIST
        size1 = 16
        size2 = 32

        conv1 = [
            nn.Conv2d(in_channels=1, out_channels=size1, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        ]
        if batch_norm:
            conv1.append(nn.BatchNorm2d(size1))
        self.conv1 = nn.Sequential(*conv1)

        conv2 = [
            nn.Conv2d(in_channels=size1, out_channels=size2, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
        ]
        if batch_norm:
            conv2.append(nn.BatchNorm2d(size2))
        self.conv2 = nn.Sequential(*conv2)

        # Decision layer with 10 output classes. The 7 * 7 factor is the spatial size
        # left after the two 2x2 poolings of a 28x28 input (28 -> 14 -> 7), so this
        # layer only fits 28x28 inputs.
        self.out = nn.Linear(size2 * 7 * 7, 10)

    def forward(self, x):
        """Run the network and record intermediate shapes on the first call.

        Args:
            x: Input batch; the decision layer expects 1-channel 28x28 images.

        Returns:
            A tuple ``(output, features)``: the 10 class scores per sample and the
            flattened activations that were fed to the decision layer.
        """
        if self.input_dim is None:
            self.input_dim = x.shape

        x = self.conv1(x)
        if self.conv1_dim is None:
            self.conv1_dim = x.shape

        x = self.conv2(x)
        if self.conv2_dim is None:
            self.conv2_dim = x.shape

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output, x
    
# Instantiate the baseline network on the selected device, together with the
# loss function and optimizer used by the training cell.
cnn1 = CNN1(batch_norm=BATCH_NORM).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=cnn1.parameters(), lr=0.01)

# The layer-by-layer summary is printed in a later cell, once the model is on the CPU.

In [6]:
num_epochs = 5

# Select training mode explicitly. The evaluation helper in a later cell switches the
# model to eval mode, so re-running this cell afterwards would otherwise train with
# BatchNorm using its running statistics instead of batch statistics.
cnn1.train()

# NOTE: `criterion` and `optimizer` are notebook-level names that a later cell rebinds
# for the second model; re-run the cell that creates `cnn1` before re-running this one.
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)

        # Forward pass; the second return value (flattened features) is not needed here.
        outputs, _ = cnn1(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current mini-batch every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item()}')

print('Finished Training')


Epoch [1/5], Step [100/469], Loss: 0.11753280460834503
Epoch [1/5], Step [200/469], Loss: 0.031483471393585205
Epoch [1/5], Step [300/469], Loss: 0.05704391375184059
Epoch [1/5], Step [400/469], Loss: 0.05861607566475868
Epoch [2/5], Step [100/469], Loss: 0.011963937431573868
Epoch [2/5], Step [200/469], Loss: 0.10138247162103653
Epoch [2/5], Step [300/469], Loss: 0.028207922354340553
Epoch [2/5], Step [400/469], Loss: 0.040423691272735596
Epoch [3/5], Step [100/469], Loss: 0.09230667352676392
Epoch [3/5], Step [200/469], Loss: 0.05819518491625786
Epoch [3/5], Step [300/469], Loss: 0.06692968308925629
Epoch [3/5], Step [400/469], Loss: 0.053229257464408875
Epoch [4/5], Step [100/469], Loss: 0.02532869577407837
Epoch [4/5], Step [200/469], Loss: 0.010937328450381756
Epoch [4/5], Step [300/469], Loss: 0.0075055635534226894
Epoch [4/5], Step [400/469], Loss: 0.011470272205770016
Epoch [5/5], Step [100/469], Loss: 0.02070842683315277
Epoch [5/5], Step [200/469], Loss: 0.021890871226787567


In [7]:
# Shapes recorded by the model during its first forward pass.
cnn1_shapes = (
    ("Input tensor shape", cnn1.input_dim),
    ("Shape after conv1:", cnn1.conv1_dim),
    ("Shape after conv2:", cnn1.conv2_dim),
)
for caption, recorded_shape in cnn1_shapes:
    print(caption, recorded_shape)

Input tensor shape torch.Size([128, 1, 28, 28])
Shape after conv1: torch.Size([128, 16, 14, 14])
Shape after conv2: torch.Size([128, 32, 7, 7])


In [8]:
def eval(cnn, loader=None, eval_device=None):
    """Compute and print the classification accuracy of ``cnn``, in percent.

    NOTE: this name shadows Python's built-in ``eval``; it is kept unchanged because
    other cells call it.

    The model is switched to evaluation mode and is left in that mode on return.

    Args:
        cnn: Module whose forward pass returns a tuple with the class scores first.
        loader: Iterable of ``(images, labels)`` batches. Defaults to the
            notebook-level ``test_loader``.
        eval_device: Device each batch is moved to. Defaults to the notebook-level
            ``device``.

    Returns:
        The accuracy as a float percentage.

    Raises:
        ValueError: If ``loader`` yields no samples.
    """
    if loader is None:
        loader = test_loader
    if eval_device is None:
        eval_device = device

    cnn.eval()  # Set model evaluation mode
    correct = 0
    total = 0
    with torch.no_grad():  # Disable gradient computation
        for images, labels in loader:
            images, labels = images.to(eval_device), labels.to(eval_device)

            # The model returns (scores, flattened features); only the scores are needed.
            outputs = cnn(images)[0]
            _, predicted = torch.max(outputs, 1)

            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    if total == 0:
        raise ValueError("Cannot compute accuracy: the loader produced no samples")

    accuracy = correct / total * 100
    print(f'Accuracy on all {total} test images: {accuracy}%')
    return accuracy

# Score the trained baseline model; the value is reused in the final comparison cell.
accuracy = eval(cnn1)



Accuracy on all 10000 test images: 98.6%


### Now to use the measurements learned in the book


Need to know:
1. MEC of fully connected (decision) layer -> use NN calc
2. The amount of bits of information arriving at that decision layer

In [9]:
# MEC of the decision layer: every output neuron holds one weight per input feature
# plus one bias term.
decision_layer = cnn1.out
decision_layer_in_features = decision_layer.in_features
decision_layer_out_features = decision_layer.out_features
parameters_per_neuron = decision_layer_in_features + 1
MEC_decision_cnn1 = parameters_per_neuron * decision_layer_out_features

print(f"MEC_decision = ({decision_layer_in_features} in features, + 1 bias) * {decision_layer_out_features} neurons = {MEC_decision_cnn1} bits")

MEC_decision = (1568 in features, + 1 bias) * 10 neurons = 15690 bits


#### 1. Manual Calculation of MEC of fully connected (decision) layer

It takes in 1568 features as input, which comes from flattening the 32 channels of 7x7 feature maps output from the conv2 layer (32 * 7 * 7 = 1568).

(out): Linear(in_features=1568, out_features=10, bias=True)

10 neurons, each with 1568 inputs, plus a bias term

MEC = (1568 + 1 bias) * 10 = **15690 bits**


**2. The amount of bits of information arriving at that decision layer**  
 
![alt text](def9-1.jpg "Definition 9.1")  
![alt text](cor9.jpg "Corollary 9.1")

In [10]:
# Compression of a stage = number of elements entering it / number of elements leaving it.
# The recorded shapes include the batch dimension, which cancels out in each ratio.
g1 = np.prod(cnn1.input_dim)/np.prod(cnn1.conv1_dim)
print('G1:', g1)

g2 = np.prod(cnn1.conv1_dim)/np.prod(cnn1.conv2_dim)
print('G2:', g2)

# Total compression is the product of the per-stage ratios.
g_total_cnn1 = g1 * g2

print(f'Total compression G_total = {g_total_cnn1:.2f}\n')

# torchsummary defaults to device="cuda" and creates its probe input on the GPU whenever
# CUDA is available, which would not match the CPU-resident model; request CPU explicitly.
summary(cnn1.to('cpu'), (1, 28, 28), device='cpu')

# The model is no longer needed after this point; only the numbers computed above are
# used later. Re-run the earlier cells to recreate it.
del cnn1

G1: 0.25
G2: 2.0
Total compression G_total = 0.50

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 16, 28, 28]             416
              ReLU-2           [-1, 16, 28, 28]               0
         MaxPool2d-3           [-1, 16, 14, 14]               0
       BatchNorm2d-4           [-1, 16, 14, 14]              32
            Conv2d-5           [-1, 32, 14, 14]          12,832
              ReLU-6           [-1, 32, 14, 14]               0
         MaxPool2d-7             [-1, 32, 7, 7]               0
       BatchNorm2d-8             [-1, 32, 7, 7]              64
            Linear-9                   [-1, 10]          15,690
Total params: 29,034
Trainable params: 29,034
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.36
Params size (MB): 0.11
Estimated Total Size (MB):

#### 2. Manual Calculation of Convolution Compression
Input tensor shape: torch.Size([128, 1, 28, 28])  
Shape after conv1: torch.Size([128, 16, 14, 14])  
Shape after conv2: torch.Size([128, 32, 7, 7])  

Input tensor shape: torch.Size([128, 1, 28, 28])
* (# inputs) = 128 {batch size}
* (input channels) = 1 {MNIST images are grayscale, 1 input channel}  
* (input height) = 28 {The height of the MNIST images, 28 pixels}  
* (input width) = 28 {The width of the MNIST images, 28 pixels}  

Output tensor shape:
Shape after conv1: torch.Size([128, 16, 14, 14])

The batch size appears in both numerator and denominator, so it cancels out of each ratio:

G1 = (128 * 1 * 28 * 28)/(128 * 16 * 14 * 14) = **0.25**  
G2 = (128 * 16 * 14 * 14)/(128 * 32 * 7 * 7) = **2**

G_total = 0.25 * 2 = **0.5** 

### 2nd Attempt, using MEC and Compression (G) to tune Hyperparameters  
I tuned the hyperparameters to keep the accuracy at the same level while raising the total compression G above 2.

In [11]:

# Reset both random streams to the same seed used at the top of the notebook
# before the second model is built.
np.random.seed(42)
torch.manual_seed(42)

class CNN2(nn.Module):
    """Smaller MNIST classifier tuned for higher compression before the decision layer.

    Each stage is ``Conv2d(kernel 4, stride 2, no padding) -> ReLU -> MaxPool2d(2, stride 1)``,
    optionally followed by ``BatchNorm2d``. The first stage produces 4 channels (its pooling
    uses padding 1) and the second 15. The flattened result feeds a 10-way linear layer.

    Args:
        batch_norm: When True, a ``BatchNorm2d`` layer is appended to each stage.
        conv1_only: When True, the second stage is skipped in ``forward`` and the
            decision layer is sized for the output of the first stage.
    """

    def __init__(self, batch_norm=True, conv1_only=False):
        super().__init__()
        self.conv1_only = conv1_only

        # Tensor shapes (batch dimension included) captured on the first forward pass;
        # they stay None until the model has been called once. conv2_dim stays None
        # when conv1_only is True.
        self.input_dim = None
        self.conv1_dim = None
        self.conv2_dim = None

        # Channel counts chosen during tuning, aiming for a total compression G above 2
        # without losing accuracy.
        size1 = 4
        size2 = 15

        conv1 = [
            nn.Conv2d(in_channels=1, out_channels=size1, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=1, padding=1),
        ]
        if batch_norm:
            conv1.append(nn.BatchNorm2d(size1))
        self.conv1 = nn.Sequential(*conv1)

        # The second stage is built even when conv1_only is True (forward just skips it),
        # so its parameters are still registered on the module.
        conv2 = [
            nn.Conv2d(in_channels=size1, out_channels=size2, kernel_size=4, stride=2, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=1, padding=0),
        ]
        if batch_norm:
            conv2.append(nn.BatchNorm2d(size2))
        self.conv2 = nn.Sequential(*conv2)

        # Decision layer with 10 output classes. For a 28x28 input the first stage
        # yields 14x14 maps (28 -> 13 -> 14) and the second 5x5 maps (14 -> 6 -> 5).
        if self.conv1_only:
            self.out = nn.Linear(size1 * 14 * 14, 10)
        else:
            self.out = nn.Linear(size2 * 5 * 5, 10)

    def forward(self, x):
        """Run the network and record intermediate shapes on the first call.

        Args:
            x: Input batch; the decision layer expects 1-channel 28x28 images.

        Returns:
            A tuple ``(output, features)``: the 10 class scores per sample and the
            flattened activations that were fed to the decision layer.
        """
        if self.input_dim is None:
            self.input_dim = x.shape

        x = self.conv1(x)
        if self.conv1_dim is None:
            self.conv1_dim = x.shape

        if not self.conv1_only:
            x = self.conv2(x)
            if self.conv2_dim is None:
                self.conv2_dim = x.shape

        # Flatten everything except the batch dimension.
        x = x.view(x.size(0), -1)
        output = self.out(x)
        return output, x
    
# Instantiate the tuned network on the selected device with a fresh loss and optimizer.
# Set conv1_only=True to test with only the first convolution -> results in lower accuracy.
cnn2 = CNN2(batch_norm=BATCH_NORM, conv1_only=False).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=cnn2.parameters(), lr=0.01)

In [12]:
num_epochs = 5

# Select training mode explicitly. The evaluation helper switches the model to eval
# mode, so re-running this cell after evaluating would otherwise train with BatchNorm
# using its running statistics instead of batch statistics.
cnn2.train()

# Train the model
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device) # Move to device

        # Forward pass; the second return value (flattened features) is not needed here.
        outputs, _ = cnn2(images)
        loss = criterion(outputs, labels)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Report the loss of the current mini-batch every 100 steps.
        if (i+1) % 100 == 0:
            print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{len(train_loader)}], Loss: {loss.item()}')

print('Finished Training')

Epoch [1/5], Step [100/469], Loss: 0.07444819808006287
Epoch [1/5], Step [200/469], Loss: 0.16685014963150024
Epoch [1/5], Step [300/469], Loss: 0.1369251012802124
Epoch [1/5], Step [400/469], Loss: 0.1766398847103119
Epoch [2/5], Step [100/469], Loss: 0.10923264175653458
Epoch [2/5], Step [200/469], Loss: 0.017553744837641716
Epoch [2/5], Step [300/469], Loss: 0.053314197808504105
Epoch [2/5], Step [400/469], Loss: 0.03503665700554848
Epoch [3/5], Step [100/469], Loss: 0.049106352031230927
Epoch [3/5], Step [200/469], Loss: 0.011438676156103611
Epoch [3/5], Step [300/469], Loss: 0.07855498790740967
Epoch [3/5], Step [400/469], Loss: 0.035285383462905884
Epoch [4/5], Step [100/469], Loss: 0.031362779438495636
Epoch [4/5], Step [200/469], Loss: 0.058974526822566986
Epoch [4/5], Step [300/469], Loss: 0.019044429063796997
Epoch [4/5], Step [400/469], Loss: 0.03395812213420868
Epoch [5/5], Step [100/469], Loss: 0.057196199893951416
Epoch [5/5], Step [200/469], Loss: 0.027254633605480194
Ep

In [13]:
# Shapes recorded by the model during its first forward pass.
cnn2_shapes = (
    ("Input tensor shape", cnn2.input_dim),
    ("Shape after conv1:", cnn2.conv1_dim),
    ("Shape after conv2:", cnn2.conv2_dim),
)
for caption, recorded_shape in cnn2_shapes:
    print(caption, recorded_shape)



Input tensor shape torch.Size([128, 1, 28, 28])
Shape after conv1: torch.Size([128, 4, 14, 14])
Shape after conv2: torch.Size([128, 15, 5, 5])


In [14]:
# Score the trained second model; the value is reused in the final comparison cell.
accuracy_cnn2 = eval(cnn2)

Accuracy on all 10000 test images: 98.8%


In [15]:
# MEC of the decision layer: every output neuron holds one weight per input feature
# plus one bias term.
decision_layer = cnn2.out
decision_layer_in_features = decision_layer.in_features
decision_layer_out_features = decision_layer.out_features
parameters_per_neuron = decision_layer_in_features + 1
MEC_decision_cnn2 = parameters_per_neuron * decision_layer_out_features

print(f"MEC_decision = ({decision_layer_in_features} in features, + 1 bias) * {decision_layer_out_features} neurons = {MEC_decision_cnn2} bits")

MEC_decision = (375 in features, + 1 bias) * 10 neurons = 3760 bits


In [16]:
# Compression of a stage = number of elements entering it / number of elements leaving it.
# The recorded shapes include the batch dimension, which cancels out in each ratio.
g1 = np.prod(cnn2.input_dim)/np.prod(cnn2.conv1_dim)
print('G1:', g1)

if not cnn2.conv1_only:
    g2 = np.prod(cnn2.conv1_dim)/np.prod(cnn2.conv2_dim)
    print('G2:', g2)

else:
    # The second stage was skipped, so it contributes no compression.
    g2 = 1

# Total compression is the product of the per-stage ratios.
g_total_cnn2 = g1 * g2

print(f'Total compression G_total = {g_total_cnn2:.2f}\n')


# torchsummary defaults to device="cuda" and creates its probe input on the GPU whenever
# CUDA is available, which would not match the CPU-resident model; request CPU explicitly.
summary(cnn2.to('cpu'), (1, 28, 28), device='cpu')
# cnn2 is intentionally kept (now on the CPU) after the summary.

G1: 1.0
G2: 2.0906666666666665
Total compression G_total = 2.09

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1            [-1, 4, 13, 13]              68
              ReLU-2            [-1, 4, 13, 13]               0
         MaxPool2d-3            [-1, 4, 14, 14]               0
       BatchNorm2d-4            [-1, 4, 14, 14]               8
            Conv2d-5             [-1, 15, 6, 6]             975
              ReLU-6             [-1, 15, 6, 6]               0
         MaxPool2d-7             [-1, 15, 5, 5]               0
       BatchNorm2d-8             [-1, 15, 5, 5]              30
            Linear-9                   [-1, 10]           3,760
Total params: 4,841
Trainable params: 4,841
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.04
Params size (MB): 0.02
Estimated Tota

In [17]:
# Side-by-side comparison of the two models: decision-layer MEC, total compression
# and test accuracy.
comparison_lines = (
    f"Reduced MEC_Decision from {MEC_decision_cnn1} to {MEC_decision_cnn2}: Decreased by a factor of {MEC_decision_cnn1/MEC_decision_cnn2:.2f}",
    f"Increased G_total from {g_total_cnn1} to {g_total_cnn2:.2f}: Increased by a factor of {g_total_cnn2/g_total_cnn1:.2f}\n",
    f"Accuracy: CNN1 = {accuracy}%, CNN2 = {accuracy_cnn2}%",
)
for comparison_line in comparison_lines:
    print(comparison_line)


Reduced MEC_Decision from 15690 to 3760: Decreased by a factor of 4.17
Increased G_total from 0.5 to 2.09: Increased by a factor of 4.18

Accuracy: CNN1 = 98.6%, CNN2 = 98.8%


### Conclusion
First, I showed mathematically that the first attempt, CNN1 (designed without the measurements from the book), was grossly overfitting and not even compressing the data (G_total < 1):
* **CNN1**
  * MEC_Decision = 15690
  * Compression G_total = 0.5
  * Accuracy = 98.6%

Then, I proved that the model was overfitting by creating a new model (CNN2) that was 4x smaller but with the same accuracy:
* **CNN2**  
  * CNN2 MEC_Decision = 3760
  * CNN2 Compression G_total = 2.09
  * CNN2 Accuracy = 98.8%

Additional Finding:
The first convolution (conv1) on CNN2 may just be memorizing the data, as G1 = 1. You can test with only conv1 by setting `conv1_only=True` in the CNN2 cell above.  
Removing the second convolution on CNN2 (`conv1_only=True`) gives the following:  
* **CNN2 First Convolution Only**
  * MEC_Decision = 7850
  * Compression G_total = 1.00
  * Accuracy = 97.69%

Finally, the main takeaway is that by using the measurements in the book I achieved the following:  
1. **Decreased Decision MEC by a factor of 4.17**:
   * Reduced MEC_Decision from 15690 to 3760
2. **Increased Compression by a factor of 4.18**:
   * Increased G_total from 0.5 to 2.09
3. Maintained Testing Accuracy while being **≈4x smaller**:
   * **Accuracy: CNN1 = 98.6%, CNN2 = 98.8%**


#### Main Takeaway
Using Compression (G) to guide Hyperparameter tuning resulted in a CNN model **7.8x smaller** with **Same Accuracy**:
  * PyTorch Estimated Total Size: **CNN1 = 0.47MB vs CNN2 = 0.06MB**
  * Testing Accuracy: **CNN1 = 98.6% vs CNN2 = 98.8%**




P.S. With batch normalization turned off (batch_norm=False), the two models reach the **exact same** accuracy as each other: CNN1_no_batch = 97.91%, CNN2_no_batch = 97.91%.  
Also, with batch normalization turned off, CNN2 is faster to train than CNN1 (by 1-2 seconds).
