In [1]:
# 1000/100/10 neural network
import torch

# Problem sizes: number of samples, input features, hidden units, output features
n, d_in, h, d_out = 64, 1000, 100, 10
epochs = 501 # Number of gradient-descent iterations to run

# Random inputs and targets drawn once from a standard normal distribution.
# They are fixed data (not trainable), so no gradient tracking is requested
# (torch.randn returns tensors with requires_grad=False by default).
x = torch.randn(n, d_in, dtype=torch.float32) # Input tensor, n x d_in
y = torch.randn(n, d_out, dtype=torch.float32) # Target tensor, n x d_out

In [4]:
# Two-layer fully connected network trained with hand-derived gradients (no autograd).
# w1 connects the d_in input features to the h hidden units;
# w2 connects the h hidden units to the d_out outputs.
w1 = torch.randn(d_in, h) # d_in x h weights (input layer -> hidden layer)
w2 = torch.randn(h, d_out) # h x d_out weights (hidden layer -> output layer)

learning_rate = 1e-6

for epoch in range(epochs):
    # Forward pass to compute a predicted y
    hidden = x.mm(w1) # Pre-activation values of the hidden layer (n x h)
    hidden_relu = hidden.clamp(min = 0) # ReLU activation (everything < 0 becomes 0)
    y_predict = hidden_relu.mm(w2) # Network output (n x d_out)

    loss = (y_predict - y).pow(2).sum() # Sum of squared errors (summed, not averaged)
    if not epoch % 50:
        print("Iteration: %4d - Loss: %0.2e" % (epoch, loss.item()))

    # Backward pass: apply the chain rule by hand, from the loss back to each weight matrix
    grad_y_predict = 2.0 * (y_predict - y) # d(loss)/d(y_predict)
    w2_grad = hidden_relu.t().mm(grad_y_predict) # d(loss)/d(w2)
    grad_h_relu = grad_y_predict.mm(w2.t()) # d(loss)/d(hidden_relu)
    grad_h = grad_h_relu.clone() # Copy so the masking below does not modify grad_h_relu
    # ReLU derivative: no gradient flows through units whose pre-activation was negative.
    # The mask must be built from the tensor `hidden`; `h` is the integer layer width,
    # so `h < 0` is just the Python bool False and would zero nothing.
    grad_h[hidden < 0] = 0
    w1_grad = x.t().mm(grad_h) # d(loss)/d(w1)

    # Adjust the weights with gradient descent
    w1 -= learning_rate * w1_grad
    w2 -= learning_rate * w2_grad

Iteration:    0 - Loss: 3.43e+07
Iteration:   50 - Loss: 1.38e+04
Iteration:  100 - Loss: 4.88e+02
Iteration:  150 - Loss: 3.09e+01
Iteration:  200 - Loss: 2.45e+00
Iteration:  250 - Loss: 2.18e-01
Iteration:  300 - Loss: 2.07e-02
Iteration:  350 - Loss: 2.14e-03
Iteration:  400 - Loss: 3.11e-04
Iteration:  450 - Loss: 7.98e-05
Iteration:  500 - Loss: 3.17e-05


In [6]:
# Same two-layer network, but the gradients now come from autograd.
# requires_grad=True makes autograd record operations on the weights so that
# loss.backward() can populate w1.grad and w2.grad.
w1 = torch.randn(d_in, h, dtype=torch.float32, requires_grad=True)
w2 = torch.randn(h, d_out, dtype=torch.float32, requires_grad=True)

for epoch in range(epochs):
    # Forward pass: linear layer -> ReLU (clamp at zero) -> linear layer
    y_predict = torch.mm(torch.clamp(torch.mm(x, w1), min=0), w2)

    # Sum of squared errors between prediction and target
    loss = torch.sum((y_predict - y) ** 2)
    if epoch % 50 == 0:
        print("Iteration: %4d - Loss: %0.2e" % (epoch, loss.item()))

    # Backward pass: autograd accumulates d(loss)/d(w) into each weight's .grad
    loss.backward()

    # The update itself must not be recorded by autograd, so it runs under no_grad.
    with torch.no_grad():
        for param in (w1, w2):
            param.sub_(learning_rate * param.grad) # In-place gradient-descent step
        for param in (w1, w2):
            param.grad.zero_() # Reset; backward() adds to .grad rather than overwriting it

Iteration:    0 - Loss: 3.74e+07
Iteration:   50 - Loss: 1.07e+04
Iteration:  100 - Loss: 2.18e+02
Iteration:  150 - Loss: 7.44e+00
Iteration:  200 - Loss: 3.12e-01
Iteration:  250 - Loss: 1.46e-02
Iteration:  300 - Loss: 9.63e-04
Iteration:  350 - Loss: 1.56e-04
Iteration:  400 - Loss: 5.18e-05
Iteration:  450 - Loss: 2.52e-05
Iteration:  500 - Loss: 1.57e-05
