## MNIST via CNN

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision

# Define the convolutional neural network (CNN)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)  # chnl-in, out, krnl
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(1024, 512)   # [64*4*4, x]
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)     # 10 classes
        self.pool1 = nn.MaxPool2d(2, stride=2)
        self.pool2 = nn.MaxPool2d(2, stride=2)
        self.drop1 = nn.Dropout(0.25)
        self.drop2 = nn.Dropout(0.50)

    def forward(self, x):
        # convolution phase
        z = F.relu(self.conv1(x))   # Size([bs, 32, 24, 24])
        z = self.pool1(z)           # Size([bs, 32, 12, 12])
        z = self.drop1(z)
        z = F.relu(self.conv2(z))   # Size([bs, 64, 8, 8])
        z = self.pool2(z)           # Size([bs, 64, 4, 4])

        # neural network phase
        z = z.reshape(-1, 1024)     # Size([bs, 1024])
        z = F.relu(self.fc1(z))     # Size([bs, 512])
        z = self.drop2(z)
        z = F.relu(self.fc2(z))     # Size([bs, 256])
        z = self.fc3(z)             # Size([bs, 10])
        return z

# Load MNIST dataset
batch_size = 64
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=torchvision.transforms.ToTensor(), download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Initialize network and optimizer
model = Net()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
for epoch in range(10):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Forward pass
        output = model(data)
        loss = F.cross_entropy(output, target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 199954364.75it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 76329989.81it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 55367438.67it/s]

Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 6096169.21it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw

Epoch 1, Loss: 0.3991948068141937
Epoch 2, Loss: 0.46641403436660767
Epoch 3, Loss: 0.19204159080982208
Epoch 4, Loss: 0.15419679880142212
Epoch 5, Loss: 0.24739806354045868
Epoch 6, Loss: 0.17561829090118408
Epoch 7, Loss: 0.03065049648284912
Epoch 8, Loss: 0.10982850939035416
Epoch 9, Loss: 0.10394237190485
Epoch 10, Loss: 0.023665310814976692


## SGHMC algorithm

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
class SGHMC(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-2, scale_grad=1., gamma=1., beta=1.):
        defaults = dict(lr=lr, scale_grad=scale_grad, gamma=gamma, beta=beta)
        super(SGHMC, self).__init__(params, defaults)

    def step(self, closure=None):
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['previous_grad'] = torch.zeros_like(p.data)
                    state['velocity'] = torch.zeros_like(p.data)
                
                previous_grad = state['previous_grad']
                velocity = state['velocity']

                # Noise term
                noise = torch.normal(mean=0., std=torch.sqrt(torch.tensor(2.*group['gamma']*group['beta'] - 1))).to(p.data.device)

                # Update the velocity and the previous gradient
                velocity = velocity * -group['gamma'] + d_p + noise
                previous_grad = d_p.clone()

                # Update the parameters
                p.data.add_(-group['lr'], velocity)


# Define the convolutional neural network (CNN)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)  # chnl-in, out, krnl
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(1024, 512)   # [64*4*4, x]
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)     # 10 classes
        self.pool1 = nn.MaxPool2d(2, stride=2)
        self.pool2 = nn.MaxPool2d(2, stride=2)
        self.drop1 = nn.Dropout(0.25)
        self.drop2 = nn.Dropout(0.50)

    def forward(self, x):
        # convolution phase
        z = F.relu(self.conv1(x))   # Size([bs, 32, 24, 24])
        z = self.pool1(z)           # Size([bs, 32, 12, 12])
        z = self.drop1(z)
        z = F.relu(self.conv2(z))   # Size([bs, 64, 8, 8])
        z = self.pool2(z)           # Size([bs, 64, 4, 4])

        # neural network phase
        z = z.reshape(-1, 1024)     # Size([bs, 1024])
        z = F.relu(self.fc1(z))     # Size([bs, 512])
        z = self.drop2(z)
        z = F.relu(self.fc2(z))     # Size([bs, 256])
        z = self.fc3(z)             # Size([bs, 10])
        return z

# Load MNIST dataset
batch_size = 64
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=torchvision.transforms.ToTensor(), download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# Initialize network and optimizer
model = Net()
optimizer = SGHMC(model.parameters(), lr=0.01)

# Training loop
for epoch in range(10):
    for batch_idx, (data, target) in enumerate(train_loader):
        # Forward pass
        output = model(data)
        loss = F.cross_entropy(output, target)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch+1}, Loss: {loss.item()}')


Epoch 1, Loss: 2.3077194690704346
Epoch 2, Loss: 2.298581600189209
Epoch 3, Loss: 2.3072409629821777
Epoch 4, Loss: 2.2922768592834473
Epoch 5, Loss: 2.533958673477173
Epoch 6, Loss: 2.307716131210327
Epoch 7, Loss: 2.2900078296661377
Epoch 8, Loss: 2.302164316177368
Epoch 9, Loss: 2.301670789718628
Epoch 10, Loss: 2.3042824268341064


## Compare with SGD and Adam

In [None]:
# List of optimizers to compare
optimizers = ["SGD", "Adam", "SGHMC"]

# Store the results
results = {}

# Training loop
for opt_name in optimizers:
    # Initialize network for each optimizer
    model = Net()

    if opt_name == "SGD":
        optimizer = optim.SGD(model.parameters(), lr=0.01)
    elif opt_name == "Adam":
        optimizer = optim.Adam(model.parameters(), lr=0.01)
    elif opt_name == "SGHMC":
        optimizer = SGHMC(model.parameters(), lr=0.01)

    print(f"\nTraining with {opt_name} optimizer:")
    for epoch in range(10):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Forward pass
            output = model(data)
            loss = F.cross_entropy(output, target)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

    # Evaluation
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    accuracy = 100 * correct / total
    results[opt_name] = accuracy
    print(f'Accuracy of the network on the test images using {opt_name}: {accuracy}%')

# Print the final results
print("\nFinal Results:")
for opt_name, accuracy in results.items():
    print(f'{opt_name} Accuracy: {accuracy}%')



Training with SGD optimizer:
Epoch 1, Loss: 0.5428701639175415
Epoch 2, Loss: 0.33463892340660095
Epoch 3, Loss: 0.2645123302936554
Epoch 4, Loss: 0.23696470260620117
Epoch 5, Loss: 0.2525932192802429
Epoch 6, Loss: 0.19202767312526703
Epoch 7, Loss: 0.10083306580781937
Epoch 8, Loss: 0.19862957298755646
Epoch 9, Loss: 0.21516868472099304
Epoch 10, Loss: 0.043656159192323685
Accuracy of the network on the test images using SGD: 97.67%

Training with Adam optimizer:
Epoch 1, Loss: 0.2932542860507965
Epoch 2, Loss: 0.24368619918823242
Epoch 3, Loss: 0.336516797542572
Epoch 4, Loss: 0.017983315512537956
Epoch 5, Loss: 0.1276865154504776
Epoch 6, Loss: 0.024428585544228554
Epoch 7, Loss: 0.19916220009326935
Epoch 8, Loss: 0.18910767138004303
Epoch 9, Loss: 0.34055426716804504
Epoch 10, Loss: 0.019781511276960373
Accuracy of the network on the test images using Adam: 96.54%

Training with SGHMC optimizer:
Epoch 1, Loss: 2.315648317337036
Epoch 2, Loss: 2.296440362930298
Epoch 3, Loss: 2.32

## Training SGHMC

In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 5)  # chnl-in, out, krnl
        self.conv2 = nn.Conv2d(32, 64, 5)
        self.fc1 = nn.Linear(1024, 512)   # [64*4*4, x]
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 10)     # 10 classes
        self.pool1 = nn.MaxPool2d(2, stride=2)
        self.pool2 = nn.MaxPool2d(2, stride=2)
        self.drop1 = nn.Dropout(0.25)
        self.drop2 = nn.Dropout(0.50)

    def forward(self, x):
        # convolution phase
        z = F.relu(self.conv1(x))   # Size([bs, 32, 24, 24])
        z = self.pool1(z)           # Size([bs, 32, 12, 12])
        z = self.drop1(z)
        z = F.relu(self.conv2(z))   # Size([bs, 64, 8, 8])
        z = self.pool2(z)           # Size([bs, 64, 4, 4])

        # neural network phase
        z = z.reshape(-1, 1024)     # Size([bs, 1024])
        z = F.relu(self.fc1(z))     # Size([bs, 512])
        z = self.drop2(z)
        z = F.relu(self.fc2(z))     # Size([bs, 256])
        z = self.fc3(z)             # Size([bs, 10])
        return z

class SGHMC(torch.optim.Optimizer):
    def __init__(self, params, lr=1e-2, scale_grad=1., gamma=1., beta=1.):
        defaults = dict(lr=lr, scale_grad=scale_grad, gamma=gamma, beta=beta)
        super(SGHMC, self).__init__(params, defaults)

    def step(self, closure=None):
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                d_p = p.grad.data

                state = self.state[p]

                # State initialization
                if len(state) == 0:
                    state['previous_grad'] = torch.zeros_like(p.data)
                    state['velocity'] = torch.zeros_like(p.data)
                
                previous_grad = state['previous_grad']
                velocity = state['velocity']

                # Noise term
                noise = torch.normal(mean=0., std=torch.sqrt(torch.tensor(2.*group['gamma']*group['beta'] - 1))).to(p.data.device)

                # Update the velocity and the previous gradient
                velocity = velocity * -group['gamma'] + d_p + noise
                previous_grad = d_p.clone()

                # Update the parameters
                p.data.add_(-group['lr'], velocity)

# Load MNIST dataset
batch_size = 64
train_dataset = torchvision.datasets.MNIST(root='./data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
test_dataset = torchvision.datasets.MNIST(root='./data', train=False, transform=torchvision.transforms.ToTensor(), download=True)
train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# List of optimizers to compare
optimizers = ["SGHMC_gamma1"]

# Store the results
results = {}

# Training loop
for opt_name in optimizers:
    # Initialize network for each optimizer
    model = Net()

    if opt_name == "SGHMC_gamma1":
        optimizer = SGHMC(model.parameters(), lr=0.01, gamma=20.)
    # elif opt_name == "SGHMC_gamma2":
    #     optimizer = SGHMC(model.parameters(), lr=0.01, gamma=2.)
    # elif opt_name == "SGHMC_gamma3":
    #     optimizer = SGHMC(model.parameters(), lr=0.01, gamma=3.)
    # elif opt_name == "SGHMC_beta1":
    #     optimizer = SGHMC(model.parameters(), lr=0.01, beta=1.)
    # elif opt_name == "SGHMC_beta2":
    #     optimizer = SGHMC(model.parameters(), lr=0.01, beta=2.)
    # elif opt_name == "SGHMC_beta3":
    #     optimizer = SGHMC(model.parameters(), lr=0.01, beta=3.)

    print(f"\nTraining with {opt_name} optimizer:")
    for epoch in range(20):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Forward pass
            output = model(data)
            loss = F.cross_entropy(output, target)

            # Backward pass and optimization
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print(f'Epoch {epoch+1}, Loss: {loss.item()}')

    # Evaluation
    correct = 0
    total = 0
    with torch.no_grad():
        for data, target in test_loader:
            outputs = model(data)
            _, predicted = torch.max(outputs.data, 1)
            total += target.size(0)
            correct += (predicted == target).sum().item()

    accuracy = 100 * correct / total
    results[opt_name] = accuracy
    print(f'Accuracy of the network on the test images using {opt_name}: {accuracy}%')

# Print the final results
print("\nFinal Results:")
for opt_name, accuracy in results.items():
    print(f'{opt_name} Accuracy: {accuracy}%')


Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to ./data/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9912422/9912422 [00:00<00:00, 131256763.97it/s]

Extracting ./data/MNIST/raw/train-images-idx3-ubyte.gz to ./data/MNIST/raw






Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to ./data/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28881/28881 [00:00<00:00, 86095020.49it/s]


Extracting ./data/MNIST/raw/train-labels-idx1-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1648877/1648877 [00:00<00:00, 34860959.23it/s]


Extracting ./data/MNIST/raw/t10k-images-idx3-ubyte.gz to ./data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4542/4542 [00:00<00:00, 15901943.88it/s]


Extracting ./data/MNIST/raw/t10k-labels-idx1-ubyte.gz to ./data/MNIST/raw


Training with SGHMC_gamma1 optimizer:


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at ../torch/csrc/utils/python_arg_parser.cpp:1485.)
  p.data.add_(-group['lr'], velocity)


Epoch 1, Loss: 2.304826498031616
Epoch 2, Loss: 2.2983460426330566
Epoch 3, Loss: 2.303752899169922
Epoch 4, Loss: 2.3017473220825195
Epoch 5, Loss: 2.289079189300537
Epoch 6, Loss: 2.305964708328247
Epoch 7, Loss: 2.3190977573394775
Epoch 8, Loss: 2.3076488971710205
Epoch 9, Loss: 2.28902006149292
Epoch 10, Loss: 2.2949905395507812
Epoch 11, Loss: 2.2826132774353027
Epoch 12, Loss: 2.307840347290039
Epoch 13, Loss: 2.3168587684631348
Epoch 14, Loss: 2.2804901599884033
Epoch 15, Loss: 2.2970571517944336
Epoch 16, Loss: 2.280909299850464
Epoch 17, Loss: 2.3005664348602295
Epoch 18, Loss: 2.3042876720428467
Epoch 19, Loss: 2.3109662532806396
Epoch 20, Loss: 2.2855658531188965
Accuracy of the network on the test images using SGHMC_gamma1: 11.35%

Final Results:
SGHMC_gamma1 Accuracy: 11.35%
