In [1]:
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets, transforms

# Get Device for Training

In [2]:
# Pick the compute backend used by the rest of the notebook.
# Preference order: CUDA GPU, then MPS (macOS), then CPU.
# cuda :
# https://pytorch.org/docs/stable/cuda.html
# https://pytorch.org/docs/stable/notes/cuda.html
# mps :
# https://pytorch.org/docs/stable/notes/mps.html#mps-backend

if torch.cuda.is_available():
    device = "cuda"
# The MPS check lives under torch.backends; it is only reached when CUDA is
# unavailable, so a misspelled attribute here goes unnoticed on CUDA machines.
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Using {device} device")

Using cuda device


In [3]:
# Check that MPS is available and, if it is, run a tiny end-to-end example on it.
if not torch.backends.mps.is_available():
    # Distinguish "PyTorch was built without MPS" from "this machine cannot use MPS".
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")

else:
    mps_device = torch.device("mps")

    # Create a Tensor directly on the mps device
    x = torch.ones(5, device=mps_device)
    # Or
    x = torch.ones(5, device="mps")

    # Any operation happens on the GPU
    y = x * 2

    # Move a model to mps just like any other device.
    # A small linear layer stands in for a real network so this branch actually
    # runs; it is bound to its own name so it never replaces the notebook's `model`.
    mps_model = nn.Linear(5, 1)
    mps_model.to(mps_device)

    # Now every call runs on the GPU
    pred = mps_model(x)

MPS not available because the current PyTorch install was not built with MPS enabled.


# Define the Class

In [4]:
class NeuralNetwork(nn.Module):
    """Fully connected classifier: 28x28 inputs -> 512 -> 512 -> 10 logits."""

    def __init__(self):
        super().__init__()
        # Collapses every dimension after the batch dimension into one.
        self.flatten = nn.Flatten()
        # Layers are listed in execution order and then wrapped in a Sequential,
        # so their indices (0..4) match the order below.
        stack_layers = [
            nn.Linear(28 * 28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10),
        ]
        self.linear_relu_stack = nn.Sequential(*stack_layers)

    def forward(self, x):
        """Flatten `x` and return the raw (un-normalized) class scores."""
        return self.linear_relu_stack(self.flatten(x))

In [5]:
# Build the network, then move its parameters onto the selected device.
model = NeuralNetwork()
model = model.to(device)
# Printing a module lists its sub-modules and their configuration.
print(model)

NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)


Calling the model on the input returns a 2-dimensional tensor: dim=0 indexes the examples in the batch (one row of 10 raw predicted values, or logits, per example), <br>
and dim=1 indexes the individual class values within each row. <br>
We get the prediction probabilities by passing the logits through an instance of the nn.Softmax module.<br>

-> model()'s input : $(m, 1, 28, 28)$ images, flattened inside forward() to $(m, N_x)$ = (#examples, #features)<br>
-> model()'s output : $(m, 10)$ = (#examples, #logits)

In [6]:
# Three random (1, 28, 28) images, created directly on the selected device.
X = torch.rand((3, 1, 28, 28), device=device)
logits = model(X)
print(f"logits.shape : {logits.shape}")

# Softmax over dim=1 turns each row of logits into probabilities.
pred_prob = nn.Softmax(dim=1)(logits)
print(f"pred_prob : {pred_prob}")

# Index of the most probable class per row; keepdim=True keeps a column shape.
# The Tensor method is used here; torch.argmax(pred_prob, dim=1, keepdim=True)
# is the equivalent function form.
y_pred = pred_prob.argmax(dim=1, keepdim=True)
print(f"Predicted class : {y_pred}")

logits.shape : torch.Size([3, 10])
pred_prob : tensor([[0.1030, 0.0886, 0.0940, 0.0942, 0.0957, 0.1068, 0.1031, 0.1016, 0.1073,
         0.1056],
        [0.1000, 0.0878, 0.0871, 0.0916, 0.1063, 0.1094, 0.1017, 0.1076, 0.1067,
         0.1019],
        [0.1043, 0.0866, 0.0940, 0.0990, 0.0971, 0.1084, 0.1030, 0.1038, 0.1017,
         0.1021]], device='cuda:0', grad_fn=<SoftmaxBackward0>)
Predicted class : tensor([[8],
        [5],
        [5]], device='cuda:0')


# Model Layers

In [7]:
# A mini-batch of three random 28x28 images.
input_image = torch.rand(size=(3, 28, 28))
# .size() and .shape report the same torch.Size.
print(f"input_image.size : {input_image.size()}")
print(f"input_image.shape : {input_image.shape}")

input_image.size : torch.Size([3, 28, 28])
input_image.shape : torch.Size([3, 28, 28])


# nn.Flatten

In [8]:
# nn.Flatten keeps dim 0 (the batch) and merges the remaining dimensions:
# (3, 28, 28) --> (3, 784)
flatten = nn.Flatten()
flat_image = flatten(input_image)
print(f"flat_image.size : {flat_image.size()}")

flat_image.size : torch.Size([3, 784])


# nn.Linear

In [9]:
# Affine layer mapping the 784 flattened pixels to 20 hidden features.
layer1 = nn.Linear(28 * 28, 20)
print(f"layer1.weight.size : {layer1.weight.size()}")

affine1 = layer1(flat_image)
print(f"affine1.size : {affine1.size()}")
print(f"affine1 : {affine1}")

# ReLU zeroes the negative entries and leaves the shape unchanged.
activation1 = nn.functional.relu(affine1)
print(f"activation1.size with ReLU: {activation1.size()}")
print(f"activation1 : {activation1}")

# Shapes at each step:
# flat_image    = A0 : (3, 784)
# layer1.weight = W1 : (20, 784)   stored as (out_features, in_features)
# affine1       = Z1 = A0 @ W1.T + b1 : (3, 20)
# activation1   = A1 = ReLU(Z1)       : (3, 20)

layer1.weight.size : torch.Size([20, 784])
affine1.size : torch.Size([3, 20])
affine1 : tensor([[-0.1451,  0.1046,  0.4040,  0.1877,  0.1190, -0.2734, -0.4865, -0.5338,
          0.2051, -0.1349,  0.1995,  0.2797, -0.6617,  0.6059,  0.6286,  0.6489,
         -0.0919, -0.3810,  0.1206, -0.2253],
        [-0.2602,  0.1511,  0.3721,  0.1184, -0.0568,  0.1026, -0.3421, -0.3563,
          0.0222, -0.1032,  0.2224,  0.2366, -0.1367,  0.5039,  0.2181,  0.1348,
         -0.1633, -0.1460,  0.1541, -0.2740],
        [-0.5460,  0.2400,  0.2226, -0.3415,  0.3030, -0.1392, -0.8176, -0.6040,
         -0.0797, -0.1238,  0.1129,  0.1245, -0.1638,  0.4336,  0.1582,  0.3988,
          0.3445, -0.1881,  0.0723, -0.2274]], grad_fn=<AddmmBackward0>)
activation1.size with ReLU: torch.Size([3, 20])
activation1 : tensor([[0.0000, 0.1046, 0.4040, 0.1877, 0.1190, 0.0000, 0.0000, 0.0000, 0.2051,
         0.0000, 0.1995, 0.2797, 0.0000, 0.6059, 0.6286, 0.6489, 0.0000, 0.0000,
         0.1206, 0.0000],
        [0.

# nn.Sequential

In [22]:
# Chain the modules in execution order. `flatten` and `layer1` are the instances
# created in the earlier cells; shapes per stage for a (3, 28, 28) input:
#   flatten           : (3, 28, 28) --> (3, 784)
#   layer1            : (3, 784)    --> (3, 20)
#   nn.ReLU()         : (3, 20)     --> (3, 20)
#   nn.Linear(20, 10) : (3, 20)     --> (3, 10)
seq_modules = nn.Sequential(flatten, layer1, nn.ReLU(), nn.Linear(20, 10))

input_image = torch.rand(size=(3, 28, 28))
logits = seq_modules(input_image)
# 3 : batch size, 10 : number of classes
print(f"logits.size : {logits.size()}")

logits.size : torch.Size([3, 10])


# nn.Softmax

In [10]:
# Normalize across dim=1: every row sums to 1.
softmax = nn.Softmax(dim=1)
pred_prob = softmax(logits)
print(f"pred_prob.size : {pred_prob.size()}")
print(f"pred_prob : {pred_prob}")

# Blank line between the two results.
print()

# Normalize across dim=0 instead: every column sums to 1.
softmax = nn.Softmax(dim=0)
pred_prob = softmax(logits)
print(f"pred_prob.size : {pred_prob.size()}")
print(f"pred_prob : {pred_prob}")

pred_prob.size : torch.Size([3, 10])
pred_prob : tensor([[0.1030, 0.0886, 0.0940, 0.0942, 0.0957, 0.1068, 0.1031, 0.1016, 0.1073,
         0.1056],
        [0.1000, 0.0878, 0.0871, 0.0916, 0.1063, 0.1094, 0.1017, 0.1076, 0.1067,
         0.1019],
        [0.1043, 0.0866, 0.0940, 0.0990, 0.0971, 0.1084, 0.1030, 0.1038, 0.1017,
         0.1021]], device='cuda:0', grad_fn=<SoftmaxBackward0>)

pred_prob.size : torch.Size([3, 10])
pred_prob : tensor([[0.3326, 0.3344, 0.3390, 0.3282, 0.3175, 0.3265, 0.3323, 0.3221, 0.3372,
         0.3385],
        [0.3255, 0.3339, 0.3167, 0.3218, 0.3555, 0.3371, 0.3306, 0.3439, 0.3384,
         0.3292],
        [0.3419, 0.3317, 0.3443, 0.3500, 0.3270, 0.3364, 0.3371, 0.3340, 0.3245,
         0.3323]], device='cuda:0', grad_fn=<SoftmaxBackward0>)


# Model Parameters

In [28]:
# Show the module tree first.
print(f"Model structure : {model}")

# named_parameters() yields (name, tensor) pairs for every weight and bias;
# only the first two entries along dim 0 are printed to keep the output short.
named_params = model.named_parameters()
for name, param in named_params:
    print(f"Layer : {name} | Size : {param.size()} | Values : {param[:2]} \n")

Model structure : NeuralNetwork(
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear_relu_stack): Sequential(
    (0): Linear(in_features=784, out_features=512, bias=True)
    (1): ReLU()
    (2): Linear(in_features=512, out_features=512, bias=True)
    (3): ReLU()
    (4): Linear(in_features=512, out_features=10, bias=True)
  )
)
Layer : linear_relu_stack.0.weight | Size : torch.Size([512, 784]) | Values : tensor([[ 0.0090,  0.0008,  0.0309,  ..., -0.0298,  0.0097, -0.0284],
        [-0.0215,  0.0191,  0.0116,  ...,  0.0250,  0.0236, -0.0118]],
       device='cuda:0', grad_fn=<SliceBackward0>) 

Layer : linear_relu_stack.0.bias | Size : torch.Size([512]) | Values : tensor([ 0.0065, -0.0335], device='cuda:0', grad_fn=<SliceBackward0>) 

Layer : linear_relu_stack.2.weight | Size : torch.Size([512, 512]) | Values : tensor([[ 0.0412, -0.0058, -0.0211,  ..., -0.0380, -0.0230, -0.0007],
        [-0.0231,  0.0331, -0.0067,  ...,  0.0153,  0.0271,  0.0223]],
       device='cuda:0', grad_f

# Further Reading, torch.nn : https://pytorch.org/docs/stable/nn.html