We will use a fully-connected ReLU network with a single hidden layer. The network is trained with gradient descent to fit random data by minimizing the squared Euclidean distance (i.e. the sum of squared errors) between the network output and the true output.

# NumPy Manual Implementation

In [0]:
import numpy as np

# Problem dimensions:
#   N     - batch size (number of samples)
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Seed a dedicated generator so that a fresh "Restart & Run All" reproduces
# the same data, the same initial weights and therefore the same loss curve.
# A local Generator is used (rather than np.random.seed) so the global NumPy
# RNG state is left untouched. Change SEED to explore other random draws.
SEED = 0
rng = np.random.default_rng(SEED)

# Random input and target data, drawn from a standard normal distribution.
x = rng.standard_normal((N, D_in))
y = rng.standard_normal((N, D_out))

# Random initial weights for the input->hidden and hidden->output layers.
w1 = rng.standard_normal((D_in, H))
w2 = rng.standard_normal((H, D_out))

In [2]:
# Step size applied to every gradient-descent update.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activations, ReLU, then the linear output layer.
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Loss is the sum of squared errors over every element of the batch;
    # report it together with the iteration index.
    residual = y_pred - y
    loss = (residual * residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU blocks the gradient wherever its input was negative.
    relu_off = h < 0
    grad_h = grad_h_relu.copy()
    grad_h[relu_off] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step; the weights are modified in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 37464306.02665963
1 35205035.88998476
2 35178056.68145451
3 31590972.093677323
4 23364013.263457127
5 14116720.418241685
6 7535728.961410831
7 4001389.1329726903
8 2338694.7305772104
9 1554502.0511118
10 1149074.2404032873
11 908982.891279147
12 746970.137394632
13 627225.1011647236
14 533736.7789093328
15 458335.89020568156
16 396377.224333078
17 344671.5538670771
18 301102.11588113033
19 264172.9595740306
20 232646.593176866
21 205589.25362193314
22 182278.0221673398
23 162078.34955425008
24 144508.19797399573
25 129186.50015730213
26 115765.85144995837
27 103969.27185476618
28 93571.28497818905
29 84378.67079372508
30 76231.93295938306
31 68997.28895926574
32 62558.03203825449
33 56809.8278573272
34 51671.64086402253
35 47062.54499654175
36 42921.386067465144
37 39195.44061923259
38 35833.13656502163
39 32795.732647969315
40 30051.414215805493
41 27569.168187154977
42 25315.911530696787
43 23270.05713479766
44 21408.85970635403
45 19715.61160638468
46 18175.509700241353
47 16770.0

# PyTorch Manual Implementation

In [0]:
import torch

dtype = torch.float

# Use the first CUDA GPU when one is available and fall back to the CPU
# otherwise, so this cell also runs on machines without CUDA.
# To force CPU execution, set: device = torch.device("cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Seed PyTorch's random number generators so that re-running the notebook on
# the same device reproduces the same data and initial weights.
SEED = 0
torch.manual_seed(SEED)

# Problem dimensions:
#   N     - batch size (number of samples)
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target data, created directly on the selected device.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Random initial weights for the input->hidden and hidden->output layers.
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

In [4]:
# Step size applied to every gradient-descent update.
learning_rate = 1e-6
for t in range(500):
    # Forward pass: hidden pre-activations, ReLU, then the linear output layer.
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # Loss is the sum of squared errors over every element of the batch;
    # .item() converts the resulting one-element tensor to a Python number
    # before it is printed.
    residual = y_pred - y
    loss = residual.pow(2).sum().item()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to
    # each weight matrix.
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU blocks the gradient wherever its input was negative.
    relu_off = h < 0
    grad_h = grad_h_relu.clone()
    grad_h[relu_off] = 0
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent step; the weight tensors are modified in place.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 31291236.0
1 30283598.0
2 38954632.0
3 51016556.0
4 54652256.0
5 40618124.0
6 19306376.0
7 6751145.0
8 2498025.75
9 1309041.5
10 918597.6875
11 733604.3125
12 613222.0625
13 521659.0625
14 447726.28125
15 386740.0625
16 335766.3125
17 292828.0625
18 256488.390625
19 225504.28125
20 198932.46875
21 176058.34375
22 156285.53125
23 139107.78125
24 124129.15625
25 111036.8359375
26 99566.671875
27 89476.921875
28 80579.5390625
29 72719.34375
30 65746.875
31 59549.6328125
32 54024.8515625
33 49082.27734375
34 44672.11328125
35 40721.9921875
36 37172.28125
37 33979.6796875
38 31104.7734375
39 28516.23828125
40 26175.013671875
41 24055.19921875
42 22133.197265625
43 20387.9140625
44 18802.728515625
45 17361.716796875
46 16047.4296875
47 14847.98828125
48 13751.037109375
49 12746.955078125
50 11827.32421875
51 10983.3515625
52 10208.322265625
53 9496.5478515625
54 8841.53515625
55 8238.177734375
56 7681.8095703125
57 7168.37890625
58 6694.189453125
59 6256.0283203125
60 5850.7880859375
61 54