In [1]:
import torch
import torch.nn as nn

In [2]:
device = "cuda"

In [3]:
import torchvision.datasets as dsets
import torchvision.transforms as transform

mnist_train = dsets.MNIST(root="MNIST_data/", download=True, train=True, transform=transform.ToTensor())
mnist_test = dsets.MNIST(root="MNIST_data/", download=True, train=False, transform=transform.ToTensor())

from torch.utils.data import DataLoader

train_dataloader = DataLoader(mnist_train, batch_size=64, shuffle=True)
test_dataloader = DataLoader(mnist_test, batch_size=64, shuffle=True)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz to MNIST_data/MNIST/raw/train-images-idx3-ubyte.gz


  0%|          | 0/9912422 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/train-images-idx3-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw/train-labels-idx1-ubyte.gz


  0%|          | 0/28881 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/train-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz to MNIST_data/MNIST/raw/t10k-images-idx3-ubyte.gz


  0%|          | 0/1648877 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/t10k-images-idx3-ubyte.gz to MNIST_data/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw/t10k-labels-idx1-ubyte.gz


  0%|          | 0/4542 [00:00<?, ?it/s]

Extracting MNIST_data/MNIST/raw/t10k-labels-idx1-ubyte.gz to MNIST_data/MNIST/raw



In [4]:
import matplotlib.pyplot as plt

def test_model(model, dataloader, is_plot=False):
  count = 0
  dataloader_size = len(dataloader) if len(dataloader) <= 20 else 20
  if is_plot == True: 
    plt.figure(figsize=(8, 8 * dataloader_size))
  model.eval()
  for i, (X, y) in enumerate(dataloader):
    X = X.view(-1, 28 * 28).to(device)
    y = y.to(device)
    with torch.no_grad():
      pred = model(X)
      pred_ = torch.argmax(pred, dim=1)
      count += (pred_ == y).sum().item()

    if i < 20 and is_plot == True:
      plt.subplot(dataloader_size, 1, i+1)
      plt.title(f"Answer : {y[0]} / Prediction : {pred_[0]}")
      plt.imshow(X[0].to("cpu").view(28, 28))
  return count, len(dataloader.dataset)

# 1. ReLU

In [36]:
drop_prob = 0.7
model = nn.Sequential(
    nn.Linear(784, 64),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.Dropout(p=drop_prob),
    nn.Linear(64, 64),
    nn.BatchNorm1d(64),
    nn.ReLU(),
    nn.ReLU(),
    nn.Dropout(p=drop_prob),
    nn.Linear(64, 10),
).to(device)
model

Sequential(
  (0): Linear(in_features=784, out_features=64, bias=True)
  (1): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
  (3): Dropout(p=0.7, inplace=False)
  (4): Linear(in_features=64, out_features=64, bias=True)
  (5): BatchNorm1d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (6): ReLU()
  (7): ReLU()
  (8): Dropout(p=0.7, inplace=False)
  (9): Linear(in_features=64, out_features=10, bias=True)
)

# 2. Weight Initialization

In [37]:
torch.nn.init.kaiming_normal_(model[0].weight)
torch.nn.init.kaiming_normal_(model[4].weight)
torch.nn.init.kaiming_normal_(model[9].weight)

Parameter containing:
tensor([[-2.5539e-02, -9.9717e-02, -1.1992e-01, -4.4291e-02,  3.3809e-02,
         -1.5490e-01,  1.4299e-01,  1.3804e-02, -2.8507e-02,  7.1522e-02,
          8.9104e-02, -7.0011e-02, -1.8521e-01,  1.4837e-02,  2.9404e-01,
         -1.4725e-01,  1.8745e-01, -1.2621e-01,  2.9198e-01, -1.3677e-03,
         -6.5634e-02, -6.2897e-02, -7.2332e-02, -2.6741e-03,  2.3747e-02,
          3.2049e-01,  5.9601e-02, -8.8824e-02,  7.9368e-02,  1.1847e-01,
         -8.9695e-02, -1.0707e-01,  1.8951e-01,  1.3867e-01,  2.4380e-01,
         -2.3711e-01, -3.6928e-01,  1.8247e-01, -2.1555e-01, -8.5596e-02,
          1.0665e-03,  1.1216e-01,  6.7593e-03,  2.5052e-01,  1.1915e-01,
          2.4078e-01,  6.8416e-02, -1.3727e-02, -8.2008e-02, -2.1521e-01,
         -5.3721e-01,  3.7423e-02,  7.0222e-02, -5.0535e-02,  8.0575e-02,
          1.6422e-01,  2.5668e-01,  1.4765e-01,  2.4442e-01,  1.0234e-01,
         -1.6986e-01,  8.8085e-02, -3.4842e-01, -1.8148e-01],
        [ 1.6292e-02, -2.537

In [40]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lambda epoch: 0.99 ** epoch)
loss_fn = torch.nn.CrossEntropyLoss()#.to(device)

In [41]:
epochs = 100 

for epoch in range(epochs):
  avg_cost = 0.
  total_batch = len(train_dataloader)
  for X, y in train_dataloader:
    model.train()
    X = X.view(-1, 28 * 28).to(device)
    y = y.to(device)
    pred = model(X)
    loss = loss_fn(pred, y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    avg_cost += loss / total_batch
  print(f"Epoch: {epoch+1} - Cost: {avg_cost:.9f}")

Epoch: 1 - Cost: 0.573443949
Epoch: 2 - Cost: 0.574099422
Epoch: 3 - Cost: 0.566139758
Epoch: 4 - Cost: 0.564816773
Epoch: 5 - Cost: 0.561017275
Epoch: 6 - Cost: 0.564903080
Epoch: 7 - Cost: 0.563955724
Epoch: 8 - Cost: 0.567482173
Epoch: 9 - Cost: 0.566916764
Epoch: 10 - Cost: 0.562906504
Epoch: 11 - Cost: 0.560734332
Epoch: 12 - Cost: 0.560138524
Epoch: 13 - Cost: 0.556279242
Epoch: 14 - Cost: 0.557636857
Epoch: 15 - Cost: 0.554516256
Epoch: 16 - Cost: 0.553520262
Epoch: 17 - Cost: 0.561273456
Epoch: 18 - Cost: 0.549941957
Epoch: 19 - Cost: 0.552753985
Epoch: 20 - Cost: 0.557072103
Epoch: 21 - Cost: 0.549146295
Epoch: 22 - Cost: 0.553338945
Epoch: 23 - Cost: 0.554265618
Epoch: 24 - Cost: 0.556270361
Epoch: 25 - Cost: 0.551348686
Epoch: 26 - Cost: 0.555679321
Epoch: 27 - Cost: 0.553032219
Epoch: 28 - Cost: 0.555779696
Epoch: 29 - Cost: 0.551447928
Epoch: 30 - Cost: 0.548355579
Epoch: 31 - Cost: 0.557434559
Epoch: 32 - Cost: 0.547376335
Epoch: 33 - Cost: 0.547629595
Epoch: 34 - Cost: 0

In [42]:
correct_num, total_num = test_model(model, test_dataloader)
print(f"{correct_num} / {total_num} => Accuracy: {correct_num / total_num * 100}")

9527 / 10000 => Accuracy: 95.27
