## Load Data

In [1]:
import torch
from torch import nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torchvision
from torchvision.datasets import FashionMNIST
import torchvision.transforms as transforms
import numpy as np
import random

# One shared seed for every random number generator the notebook touches,
# so a fresh "Restart & Run All" starts from the same RNG state each time.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Use the first CUDA device when one is available, otherwise stay on the CPU.
has_gpu = torch.cuda.is_available()
device = torch.device('cuda:0') if has_gpu else torch.device('cpu')
print(device)

cuda:0


In [2]:
# Training hyperparameters read by the cells below.
num_epochs = 15
batch_size = 512

# FashionMNIST training split, stored under ./data (downloaded if requested
# files are missing); the ToTensor transform is applied to every image.
to_tensor = transforms.ToTensor()
train_dataset = FashionMNIST(root='./data', train=True, transform=to_tensor, download=True)

# Reshuffle the training samples every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

### Update Skip Connection

In [3]:
class MLP(nn.Module):
    """Three-hidden-layer perceptron with batch norm and one skip connection.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU. The batch-normalised
    output of the first stage (taken before its ReLU) is added to the
    activated output of the third stage, and that sum feeds the output layer.

    Args:
        input_dims: Number of features per sample once the input is flattened.
        hidden_dims: Width shared by all three hidden layers.
        output_dims: Number of values produced per sample.
    """

    def __init__(self, input_dims, hidden_dims, output_dims):
        super().__init__()
        self.layer1 = nn.Linear(input_dims, hidden_dims)
        self.bn1 = nn.BatchNorm1d(hidden_dims)

        self.layer2 = nn.Linear(hidden_dims, hidden_dims)
        self.bn2 = nn.BatchNorm1d(hidden_dims)

        self.layer3 = nn.Linear(hidden_dims, hidden_dims)
        self.bn3 = nn.BatchNorm1d(hidden_dims)

        self.output = nn.Linear(hidden_dims, output_dims)

        # He (Kaiming) normal initialisation for every linear layer, with
        # biases set to zero. Layers are visited in registration order.
        linear_layers = [m for m in self.modules() if isinstance(m, nn.Linear)]
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight, nonlinearity='relu')
            nn.init.constant_(linear.bias, 0.0)

    def forward(self, x):
        """Flatten ``x`` from dim 1 onwards and return the output-layer result."""
        flat = torch.flatten(x, start_dim=1)

        # Stage 1: keep the pre-activation tensor as the skip branch.
        skip = self.bn1(self.layer1(flat))
        hidden = torch.relu(skip)

        # Stages 2 and 3.
        hidden = torch.relu(self.bn2(self.layer2(hidden)))
        hidden = torch.relu(self.bn3(self.layer3(hidden)))

        # Skip connection: add the stage-1 branch to the final hidden layer.
        return self.output(hidden + skip)

In [4]:
# Build the network and move it to the selected device.
# 784 input features presumably correspond to flattened 28x28 images — confirm against the dataset.
model = MLP(input_dims=784, hidden_dims=128, output_dims=10)
model = model.to(device)

# Classification loss and the Adam optimiser over all model parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [5]:
# Train for `num_epochs` passes over `train_loader`, printing the per-sample
# mean loss and the accuracy measured on the training batches of each epoch.
for epoch in range(num_epochs):
    # Explicitly (re-)enter training mode so the loop still trains correctly
    # if an earlier cell left the model in eval mode.
    model.train()

    t_loss = 0.0  # sum of per-sample losses seen this epoch
    t_acc = 0     # number of correct predictions seen this epoch
    cnt = 0       # number of samples seen this epoch
    for X, y in train_loader:
        X, y = X.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(X)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()

        batch_n = len(y)
        # Assumes `criterion` returns the batch MEAN (the nn.CrossEntropyLoss
        # default). Weight it by the batch size so that a smaller final batch
        # does not count as much as a full one in the epoch average.
        t_loss += loss.item() * batch_n
        t_acc += (torch.argmax(outputs, 1) == y).sum().item()
        cnt += batch_n

    # Both metrics are now per-sample averages over the same denominator.
    t_loss /= cnt
    t_acc /= cnt
    print(f"Epoch {epoch+1}/{num_epochs}, Train_Loss: {t_loss:.4f}, Train_Acc: {t_acc:.4f}")

Epoch 1/15, Train_Loss: 0.5250, Train_Acc: 0.8178
Epoch 2/15, Train_Loss: 0.3543, Train_Acc: 0.8744
Epoch 3/15, Train_Loss: 0.3125, Train_Acc: 0.8881
Epoch 4/15, Train_Loss: 0.2833, Train_Acc: 0.8986
Epoch 5/15, Train_Loss: 0.2587, Train_Acc: 0.9066
Epoch 6/15, Train_Loss: 0.2384, Train_Acc: 0.9128
Epoch 7/15, Train_Loss: 0.2236, Train_Acc: 0.9187
Epoch 8/15, Train_Loss: 0.2061, Train_Acc: 0.9253
Epoch 9/15, Train_Loss: 0.1922, Train_Acc: 0.9300
Epoch 10/15, Train_Loss: 0.1796, Train_Acc: 0.9352
Epoch 11/15, Train_Loss: 0.1702, Train_Acc: 0.9382
Epoch 12/15, Train_Loss: 0.1599, Train_Acc: 0.9431
Epoch 13/15, Train_Loss: 0.1520, Train_Acc: 0.9446
Epoch 14/15, Train_Loss: 0.1373, Train_Acc: 0.9506
Epoch 15/15, Train_Loss: 0.1311, Train_Acc: 0.9528
