# AlexNet

In [1]:
import time
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms, models
from torch.utils.data import DataLoader

# Set device
device = device = torch.device('mps')
print(device)

# Ensure reproducibility
torch.manual_seed(1)
if torch.cuda.is_available():
    torch.backends.cudnn.deterministic = True

# Hyperparameters
learning_rate = 0.001
batch_size = 128
num_epochs = 10
num_classes = 10

# Data preprocessing
transform = transforms.Compose([
    transforms.Resize((227, 227)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# Dataset
train_dataset = datasets.CIFAR10(root='data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='data', train=False, download=True, transform=transform)

# Data loader
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

class AlexNetRelu(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetRelu, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x
        
class AlexNetSigmoid(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetSigmoid, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)
    
    def forward(self, x):
            x = self.features(x)
            x = torch.flatten(x, 1)
            x = self.classifier(x)
            return x
      
class AlexNetTanh(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetTanh, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)
        
    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    
modelSigmoid = AlexNetSigmoid(num_classes=num_classes).to(device)
modelRelu = AlexNetRelu(num_classes=num_classes).to(device)
modelTanh = AlexNetTanh(num_classes=num_classes).to(device)
# Loss and optimizer



mps
Files already downloaded and verified
Files already downloaded and verified


In [2]:

def compute_accuracy(model, data_loader):
    model.eval()
    correct_pred, num_examples = 0, 0
    for i, (features, targets) in enumerate(data_loader):
            
        features = features.to(device)
        targets = targets.to(device)


        logits = model(features)
        probas = F.softmax(logits, dim=1)
        
        _, predicted_labels = torch.max(probas, 1)
        num_examples += targets.size(0)
        correct_pred += (predicted_labels == targets).sum()
    return correct_pred.float()/num_examples * 100


def compute_epoch_loss(model, data_loader):
    model.eval()
    curr_loss, num_examples = 0., 0
    with torch.no_grad():
        for features, targets in data_loader:
            features = features.to(device)
            targets = targets.to(device)
            logits = model(features)
            probas = F.softmax(logits, dim=1)
            loss = F.cross_entropy(logits, targets, reduction='sum')
            num_examples += targets.size(0)
            curr_loss += loss

        curr_loss = curr_loss / num_examples
        return curr_loss
    
def forward(self, x):
        x = self.features(x)
        ##print("Before flatten:", x.size())
        x = torch.flatten(x, 1)
        ##print("After flatten:", x.size())
        x = self.classifier(x)
        return x
    
def eval(model):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    start_time = time.time()
    
    for epoch in range(num_epochs):

        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):

            features = features.to(device)
            targets = targets.to(device)

            ### FORWARD AND BACK PROP
            logits = model(features)
            probas = F.softmax(logits, dim=1)
            cost = F.cross_entropy(logits, targets)
            optimizer.zero_grad()

            cost.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            ### LOGGING
            if not batch_idx % 50:
                print ('Epoch: %03d/%03d | Batch %04d/%04d | Cost: %.4f' 
                       %(epoch+1, num_epochs, batch_idx, 
                         len(train_loader), cost))

        model.eval()
        with torch.set_grad_enabled(False): # save memory during inference
            print('Epoch: %03d/%03d | Train: %.3f%% |  Loss: %.3f' % (
                  epoch+1, num_epochs, 
                  compute_accuracy(model, train_loader),
                  compute_epoch_loss(model, train_loader)))


        print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))

    print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))
    with torch.set_grad_enabled(False): # save memory during inference
        print('Test accuracy: %.2f%%' % (compute_accuracy(model, test_loader)))


In [3]:
eval(modelRelu)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3015
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3018
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.2980
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3004
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.2928
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.2940
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.2438
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.1784
Epoch: 001/010 | Train: 25.468% |  Loss: 2.071
Time elapsed: 2.06 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.0876
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.0069
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.9521
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.0061
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.9575
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.8306
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.7768
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.7122
Epoch: 002/010 | Train: 38.664% |  Loss: 1.719
Time elapsed: 4.01 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.8211
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.7

In [4]:
eval(modelSigmoid) 

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3431
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3584
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.3350
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3070
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.2833
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3252
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.3666
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.3628
Epoch: 001/010 | Train: 10.000% |  Loss: 2.327
Time elapsed: 2.02 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.3033
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.3467
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.2835
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.3366
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.3365
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3716
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.3034
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.3088
Epoch: 002/010 | Train: 10.000% |  Loss: 2.331
Time elapsed: 4.04 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 2.3639
Epoch: 003/010 | Batch 0050/0391 | Cost: 2.3

In [5]:
eval(modelTanh)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3024
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.2687
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.0921
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.0106
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.8714
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.8325
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.8225
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.7375
Epoch: 001/010 | Train: 37.052% |  Loss: 1.774
Time elapsed: 1.90 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.8730
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.6398
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.7554
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.5894
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.5888
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.4689
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.5169
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.5826
Epoch: 002/010 | Train: 46.202% |  Loss: 1.494
Time elapsed: 3.76 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.4190
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.3

In [8]:
def evalAdam(model):
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    start_time = time.time()
    
    for epoch in range(num_epochs):

        model.train()
        for batch_idx, (features, targets) in enumerate(train_loader):

            features = features.to(device)
            targets = targets.to(device)

            ### FORWARD AND BACK PROP
            logits = model(features)
            probas = F.softmax(logits, dim=1)
            cost = F.cross_entropy(logits, targets)
            optimizer.zero_grad()

            cost.backward()

            ### UPDATE MODEL PARAMETERS
            optimizer.step()

            ### LOGGING
            if not batch_idx % 50:
                print ('Epoch: %03d/%03d | Batch %04d/%04d | Cost: %.4f' 
                       %(epoch+1, num_epochs, batch_idx, 
                         len(train_loader), cost))

        model.eval()
        with torch.set_grad_enabled(False): # save memory during inference
            print('Epoch: %03d/%03d | Train: %.3f%% |  Loss: %.3f' % (
                  epoch+1, num_epochs, 
                  compute_accuracy(model, train_loader),
                  compute_epoch_loss(model, train_loader)))


        print('Time elapsed: %.2f min' % ((time.time() - start_time)/60))

    print('Total Training Time: %.2f min' % ((time.time() - start_time)/60))
    with torch.set_grad_enabled(False): # save memory during inference
        print('Test accuracy: %.2f%%' % (compute_accuracy(model, test_loader)))

In [9]:
evalAdam(modelRelu)

Epoch: 001/010 | Batch 0000/0391 | Cost: 0.8890
Epoch: 001/010 | Batch 0050/0391 | Cost: 1.8792
Epoch: 001/010 | Batch 0100/0391 | Cost: 1.5043
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.5438
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.4740
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.3168
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.2551
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.2964
Epoch: 001/010 | Train: 55.344% |  Loss: 1.246
Time elapsed: 2.29 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.2581
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.2982
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.4488
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.0405
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.1611
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.0967
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.0256
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.1020
Epoch: 002/010 | Train: 66.298% |  Loss: 0.968
Time elapsed: 4.26 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.1085
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.1

In [10]:
evalAdam(modelSigmoid)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3242
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3015
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.3022
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3033
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.3018
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3033
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.3016
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.3029
Epoch: 001/010 | Train: 10.000% |  Loss: 2.303
Time elapsed: 1.93 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.3030
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.3045
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.3025
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.3029
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.3041
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3019
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.3027
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.3034
Epoch: 002/010 | Train: 10.000% |  Loss: 2.303
Time elapsed: 4.09 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 2.3025
Epoch: 003/010 | Batch 0050/0391 | Cost: 2.3

In [11]:
evalAdam(modelTanh)

Epoch: 001/010 | Batch 0000/0391 | Cost: 0.9472
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.0687
Epoch: 001/010 | Batch 0100/0391 | Cost: 1.6920
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.7916
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.5357
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.4201
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.7329
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.7121
Epoch: 001/010 | Train: 53.588% |  Loss: 1.368
Time elapsed: 1.89 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.3462
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.5113
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.3508
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.3032
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.4003
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.2646
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.4495
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.2735
Epoch: 002/010 | Train: 58.660% |  Loss: 1.319
Time elapsed: 3.82 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.4839
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.1

In [17]:
class AlexNetRelu3(models.AlexNet):
    def __init__(self, num_classes=10):
        super(AlexNetRelu3, self).__init__(num_classes=num_classes)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(256 * 6 * 6, 4096), # Adjusted for the new feature map size
            nn.ReLU(inplace=True),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

class AlexNetSigmoid3(models.AlexNet):
    def __init__(self, num_classes=10):
        super(AlexNetSigmoid3, self).__init__(num_classes=num_classes)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(256 * 6 * 6, 4096), # Adjusted for the new feature map size
            nn.Sigmoid(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

class AlexNetTanh3(models.AlexNet):
    def __init__(self, num_classes=10):
        super(AlexNetTanh3, self).__init__(num_classes=num_classes)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Linear(256 * 6 * 6, 4096), # Adjusted for the new feature map size
            nn.Tanh(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

    
modelSigmoid3 = AlexNetSigmoid3(num_classes=num_classes).to(device)
modelRelu3 = AlexNetRelu3(num_classes=num_classes).to(device)
modelTanh3 = AlexNetTanh3(num_classes=num_classes).to(device)

In [18]:
eval(modelRelu3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3054
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.2941
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.2801
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.2468
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.2037
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.9062
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.9982
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.9240
Epoch: 001/010 | Train: 34.644% |  Loss: 1.838
Time elapsed: 2.14 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.8127
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.7773
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.8048
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.8110
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.4912
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.6845
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.5083
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.5099
Epoch: 002/010 | Train: 44.582% |  Loss: 1.553
Time elapsed: 4.37 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.5678
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.6

In [19]:
eval(modelSigmoid3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3680
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3368
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.2838
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.4259
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.3594
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3480
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.3475
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.4009
Epoch: 001/010 | Train: 10.000% |  Loss: 2.310
Time elapsed: 2.01 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.3472
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.3228
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.3210
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.3439
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.3871
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3384
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.3055
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.3715
Epoch: 002/010 | Train: 10.000% |  Loss: 2.321
Time elapsed: 4.12 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 2.3730
Epoch: 003/010 | Batch 0050/0391 | Cost: 2.3

In [20]:
eval(modelTanh3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3053
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.1677
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.1139
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.0365
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.8459
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.8306
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.7228
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.7928
Epoch: 001/010 | Train: 39.318% |  Loss: 1.739
Time elapsed: 2.04 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.7662
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.6521
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.6442
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.6619
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.6018
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.4843
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.3804
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.5466
Epoch: 002/010 | Train: 47.696% |  Loss: 1.474
Time elapsed: 3.98 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.3679
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.5

In [21]:
evalAdam(modelRelu3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 0.8940
Epoch: 001/010 | Batch 0050/0391 | Cost: 1.9560
Epoch: 001/010 | Batch 0100/0391 | Cost: 1.7054
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.6646
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.5616
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.5437
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.7146
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.6170
Epoch: 001/010 | Train: 47.372% |  Loss: 1.454
Time elapsed: 1.92 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.6764
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.4033
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.5083
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.4032
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.5131
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.4282
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.4173
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.2690
Epoch: 002/010 | Train: 49.496% |  Loss: 1.389
Time elapsed: 3.84 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.3348
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.0

In [22]:
evalAdam(modelSigmoid3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3573
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.4746
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.4147
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3974
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.3778
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3816
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.4113
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.3469
Epoch: 001/010 | Train: 10.000% |  Loss: 2.358
Time elapsed: 1.88 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.4272
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.4495
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.4335
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.4093
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.2826
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3196
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.1366
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.0566
Epoch: 002/010 | Train: 24.780% |  Loss: 2.038
Time elapsed: 3.77 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.9932
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.9

In [23]:
evalAdam(modelTanh3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 0.6787
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.1645
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.0785
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.9582
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.9908
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.5546
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.5170
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.5151
Epoch: 001/010 | Train: 46.118% |  Loss: 1.595
Time elapsed: 1.85 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.7041
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.5838
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.2711
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.4301
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.3687
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.2557
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.4340
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.4515
Epoch: 002/010 | Train: 58.716% |  Loss: 1.209
Time elapsed: 3.71 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.4961
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.5

In [31]:
import torch
import torch.nn as nn
device = device = torch.device('mps')
class AlexNetRelu7(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetRelu7, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

class AlexNetSigmoid7(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetSigmoid7, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Sigmoid(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x

class AlexNetTanh7(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNetTanh7, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.Tanh(),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Linear(256 * 6 * 6, num_classes)

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)
        x = self.classifier(x)
        return x



modelSigmoid7 = AlexNetSigmoid7(num_classes=num_classes).to(device)
modelRelu7 = AlexNetRelu7(num_classes=num_classes).to(device)
modelTanh7 = AlexNetTanh7(num_classes=num_classes).to(device)


In [32]:
eval(modelRelu7)


Epoch: 001/010 | Batch 0000/0391 | Cost: 2.3021
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3014
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.2973
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.2972
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.2909
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.2852
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.2583
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.1752
Epoch: 001/010 | Train: 25.554% |  Loss: 2.053
Time elapsed: 1.92 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.1098
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.0275
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.9134
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.0014
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.7741
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.8885
Epoch: 002/010 | Batch 0300/0391 | Cost: 1.7573
Epoch: 002/010 | Batch 0350/0391 | Cost: 1.6383
Epoch: 002/010 | Train: 39.612% |  Loss: 1.671
Time elapsed: 3.83 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.6484
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.7

In [33]:
eval(modelSigmoid7)


Epoch: 001/010 | Batch 0000/0391 | Cost: 2.4099
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3247
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.3556
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3624
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.3576
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3188
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.3976
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.3273
Epoch: 001/010 | Train: 10.000% |  Loss: 2.330
Time elapsed: 2.04 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.3384
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.3717
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.2946
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.3236
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.2852
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3580
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.3570
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.3346
Epoch: 002/010 | Train: 10.000% |  Loss: 2.333
Time elapsed: 4.10 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 2.3270
Epoch: 003/010 | Batch 0050/0391 | Cost: 2.3

In [34]:
eval(modelTanh3)


Epoch: 001/010 | Batch 0000/0391 | Cost: 0.9859
Epoch: 001/010 | Batch 0050/0391 | Cost: 1.1368
Epoch: 001/010 | Batch 0100/0391 | Cost: 0.9076
Epoch: 001/010 | Batch 0150/0391 | Cost: 0.9843
Epoch: 001/010 | Batch 0200/0391 | Cost: 0.7018
Epoch: 001/010 | Batch 0250/0391 | Cost: 0.6648
Epoch: 001/010 | Batch 0300/0391 | Cost: 0.9376
Epoch: 001/010 | Batch 0350/0391 | Cost: 0.6469
Epoch: 001/010 | Train: 77.512% |  Loss: 0.640
Time elapsed: 2.09 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 0.6750
Epoch: 002/010 | Batch 0050/0391 | Cost: 0.7617
Epoch: 002/010 | Batch 0100/0391 | Cost: 0.7171
Epoch: 002/010 | Batch 0150/0391 | Cost: 0.8301
Epoch: 002/010 | Batch 0200/0391 | Cost: 0.7827
Epoch: 002/010 | Batch 0250/0391 | Cost: 0.8380
Epoch: 002/010 | Batch 0300/0391 | Cost: 0.8389
Epoch: 002/010 | Batch 0350/0391 | Cost: 0.6566
Epoch: 002/010 | Train: 78.136% |  Loss: 0.615
Time elapsed: 4.00 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 0.7245
Epoch: 003/010 | Batch 0050/0391 | Cost: 0.7

In [35]:
evalAdam(modelRelu7)


Epoch: 001/010 | Batch 0000/0391 | Cost: 1.0177
Epoch: 001/010 | Batch 0050/0391 | Cost: 1.7759
Epoch: 001/010 | Batch 0100/0391 | Cost: 1.5127
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.4187
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.4691
Epoch: 001/010 | Batch 0250/0391 | Cost: 1.3392
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.1345
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.1880
Epoch: 001/010 | Train: 58.782% |  Loss: 1.155
Time elapsed: 1.93 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.1452
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.0703
Epoch: 002/010 | Batch 0100/0391 | Cost: 1.1536
Epoch: 002/010 | Batch 0150/0391 | Cost: 0.9929
Epoch: 002/010 | Batch 0200/0391 | Cost: 1.2124
Epoch: 002/010 | Batch 0250/0391 | Cost: 1.1725
Epoch: 002/010 | Batch 0300/0391 | Cost: 0.9862
Epoch: 002/010 | Batch 0350/0391 | Cost: 0.9723
Epoch: 002/010 | Train: 67.322% |  Loss: 0.931
Time elapsed: 4.17 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.0065
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.0

In [36]:
evalAdam(modelSigmoid7)


Epoch: 001/010 | Batch 0000/0391 | Cost: 2.2954
Epoch: 001/010 | Batch 0050/0391 | Cost: 2.3033
Epoch: 001/010 | Batch 0100/0391 | Cost: 2.3041
Epoch: 001/010 | Batch 0150/0391 | Cost: 2.3038
Epoch: 001/010 | Batch 0200/0391 | Cost: 2.3031
Epoch: 001/010 | Batch 0250/0391 | Cost: 2.3029
Epoch: 001/010 | Batch 0300/0391 | Cost: 2.3023
Epoch: 001/010 | Batch 0350/0391 | Cost: 2.3019
Epoch: 001/010 | Train: 10.000% |  Loss: 2.303
Time elapsed: 2.19 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 2.3027
Epoch: 002/010 | Batch 0050/0391 | Cost: 2.3026
Epoch: 002/010 | Batch 0100/0391 | Cost: 2.3034
Epoch: 002/010 | Batch 0150/0391 | Cost: 2.3028
Epoch: 002/010 | Batch 0200/0391 | Cost: 2.3027
Epoch: 002/010 | Batch 0250/0391 | Cost: 2.3035
Epoch: 002/010 | Batch 0300/0391 | Cost: 2.3035
Epoch: 002/010 | Batch 0350/0391 | Cost: 2.3022
Epoch: 002/010 | Train: 10.000% |  Loss: 2.303
Time elapsed: 4.43 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 2.3027
Epoch: 003/010 | Batch 0050/0391 | Cost: 2.3

In [37]:
evalAdam(modelTanh3)

Epoch: 001/010 | Batch 0000/0391 | Cost: 0.7708
Epoch: 001/010 | Batch 0050/0391 | Cost: 0.8933
Epoch: 001/010 | Batch 0100/0391 | Cost: 1.0275
Epoch: 001/010 | Batch 0150/0391 | Cost: 1.3089
Epoch: 001/010 | Batch 0200/0391 | Cost: 1.0265
Epoch: 001/010 | Batch 0250/0391 | Cost: 0.9178
Epoch: 001/010 | Batch 0300/0391 | Cost: 1.2055
Epoch: 001/010 | Batch 0350/0391 | Cost: 1.3093
Epoch: 001/010 | Train: 71.300% |  Loss: 0.852
Time elapsed: 2.12 min
Epoch: 002/010 | Batch 0000/0391 | Cost: 1.2242
Epoch: 002/010 | Batch 0050/0391 | Cost: 1.1395
Epoch: 002/010 | Batch 0100/0391 | Cost: 0.8627
Epoch: 002/010 | Batch 0150/0391 | Cost: 1.3960
Epoch: 002/010 | Batch 0200/0391 | Cost: 0.8891
Epoch: 002/010 | Batch 0250/0391 | Cost: 0.9814
Epoch: 002/010 | Batch 0300/0391 | Cost: 0.8970
Epoch: 002/010 | Batch 0350/0391 | Cost: 0.8781
Epoch: 002/010 | Train: 72.448% |  Loss: 0.801
Time elapsed: 4.29 min
Epoch: 003/010 | Batch 0000/0391 | Cost: 1.1750
Epoch: 003/010 | Batch 0050/0391 | Cost: 1.1