In [None]:
%matplotlib inline

In [None]:
import torch
import torch.nn as nn # Use torch.nn as nn
import torch.nn.functional as F # Use torch.nn.functional as F

class Net(nn.Module):
    """Small LeNet-style CNN for single-channel 32 x 32 images.

    Tensor shapes through the network (N = batch size; the convolutions use
    3 x 3 kernels with stride 1 and no padding, the pools use 2 x 2 windows):

        input                : (N, 1, 32, 32)
        conv1 + ReLU         : (N, 6, 30, 30)
        max-pool             : (N, 6, 15, 15)
        conv2 + ReLU         : (N, 16, 13, 13)
        max-pool             : (N, 16, 6, 6)
        flatten              : (N, 576)
        fc1 + ReLU           : (N, 120)
        fc2 + ReLU           : (N, 84)
        fc3 (raw scores)     : (N, 10)
    """

    def __init__(self):
        super().__init__()
        # Two convolution layers.
        self.conv1 = nn.Conv2d(1, 6, 3)   # 1 input channel -> 6 feature maps, 3 x 3 kernel
        self.conv2 = nn.Conv2d(6, 16, 3)  # 6 feature maps -> 16 feature maps, 3 x 3 kernel
        # Two fully connected hidden layers. 16 * 6 * 6 is the flattened size of
        # the second pooling output when the input image is 32 x 32.
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        # Output layer: 10 raw scores, one per class.
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: image batch of shape (N, 1, 32, 32).

        Returns:
            Tensor of shape (N, 10) holding un-normalised class scores
            (no activation is applied to the last layer).
        """
        # Conv -> ReLU -> 2 x 2 max-pooling, first stage.
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # Second stage must go through conv2 (6 -> 16 channels); a scalar
        # window size of 2 is equivalent to (2, 2).
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension for the linear layers.
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # no activation on the output layer
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample (product of all dims but the first)."""
        size = x.size()[1:]  # drop the batch dimension
        num_features = 1
        for s in size:
            num_features *= s
        return num_features

# Instantiate the network and print its module summary (the registered layers).
net = Net()
print(net)

In [None]:
# Two-layer neural network implemented with plain NumPy.
# Goal: show a network learning to map random inputs to random outputs,
# with the backward pass written out by hand.
import numpy as np

# N = batch size, D_in = input dimension, H = hidden dimension, D_out = output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets; randn draws from a normal distribution (mean 0, std 1).
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Random initial weights for the two layers.
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear.
    h = x @ w1
    h_relu = np.maximum(h, 0)
    y_pred = h_relu @ w2

    # Sum-of-squared-errors loss, printed on every iteration.
    loss = np.square(y_pred - y).sum()
    print(t, loss)

    # Backward pass (chain rule by hand).
    grad_y_pred = 2.0 * (y_pred - y)          # derivative of the squared error
    grad_w2 = h_relu.T @ grad_y_pred          # gradient for the output-layer weights
    grad_h_relu = grad_y_pred @ w2.T          # propagate from output layer to hidden layer
    # ReLU passes no gradient where its input was negative.
    grad_h = np.where(h < 0, 0.0, grad_h_relu)
    grad_w1 = x.T @ grad_h                    # gradient for the hidden-layer weights

    # Gradient-descent step, updating the weights in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
import torch

# Same two-layer network as a Tensor program, still with a hand-written backward pass.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random inputs and targets.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: mm is the matrix product (the counterpart of NumPy's dot
    # for 2-D arrays); clamp(min=0) zeroes negative values, i.e. ReLU.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Sum-of-squared-errors loss as a Python float; report every 100th iteration.
    loss = (y_pred - y).pow(2).sum().item()
    if (t + 1) % 100 == 0:
        print(t, loss)

    # Backward pass: gradients of the loss with respect to w1 and w2.
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # No gradient flows through ReLU where its input was negative.
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step, in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

In [None]:
# -*- coding: utf-8 -*-
import torch

# Same two-layer network again, this time letting autograd build the backward pass.
dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N = batch size, D_in = input dimension,
# H = hidden dimension, D_out = output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Inputs and targets. requires_grad is left at its default (False) because no
# gradients with respect to the data are needed.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Weights. requires_grad=True asks autograd to accumulate d(loss)/d(weight)
# into w1.grad / w2.grad on every backward() call.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: linear -> ReLU -> linear. Intermediate results need no
    # names because the backward pass is not written by hand.
    y_pred = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum-of-squared-errors loss, kept as a Tensor so autograd can
    # differentiate it; .item() extracts the Python scalar for printing.
    loss = ((y_pred - y) ** 2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Populate w1.grad and w2.grad with the gradient of the loss.
    loss.backward()

    # Plain gradient-descent step. It runs under no_grad() so the in-place
    # weight update itself is not recorded by autograd.
    # (torch.optim.SGD packages the same update.)
    with torch.no_grad():
        w1.sub_(learning_rate * w1.grad)
        w2.sub_(learning_rate * w2.grad)

        # backward() accumulates into .grad, so clear it for the next iteration.
        w1.grad.zero_()
        w2.grad.zero_()