In [142]:
import torch
device = torch.device('cpu')

In [143]:
learning_rate = 1e-6
x = torch.randn(64, 1000, device=device)
y = torch.randn(64, 10, device=device)

w1 = torch.randn(1000, 100, device=device)
w2 = torch.randn(100, 10, device=device)

In [144]:
for t in range(30):
    h = x.mm(w1) # 64 x 100
    h_relu = h.clamp(min=0) # 64 x 100
    y_pred = h_relu.mm(w2) # 64 x 10
    loss = (y_pred - y) # 64 x 10
    
    grad_y_pred = 2.0 * loss # 64 x 10
    grad_w2 = h_relu.t().mm(grad_y_pred) # 100 x 10
    grad_h_relu = grad_y_pred.mm(w2.t()) # 64 x 100
    grad_h = grad_h_relu.clone()
    grad_h[h<0] = 0
    grad_w1 = x.t().mm(grad_h)
    
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2
    
    print(loss.pow(2).sum())

tensor(30570998.)
tensor(25449520.)
tensor(25784770.)
tensor(27259042.)
tensor(26666554.)
tensor(22452662.)
tensor(15865979.)
tensor(9667770.)
tensor(5438052.)
tensor(3067293.)
tensor(1848839.)
tensor(1229853.5000)
tensor(899446.5000)
tensor(706773.8125)
tensor(582078.7500)
tensor(493572.6250)
tensor(425997.1875)
tensor(371837.8125)
tensor(327192.9375)
tensor(289580.9375)
tensor(257480.5312)
tensor(229841.4219)
tensor(205885.2188)
tensor(184994.1562)
tensor(166698.5938)
tensor(150604.0625)
tensor(136395.0312)
tensor(123810.7031)
tensor(112622.1094)
tensor(102660.9453)


## Implement our own deep learning model by using PyTorch

In [145]:
import torch
from torch.utils.data import TensorDataset, DataLoader

device = torch.device('cpu')
learning_rate = 1e-2

In [146]:
x = torch.randn(64, 1000, device=device)
y = torch.randn(64, 10, device=device)
loader = DataLoader(TensorDataset(x,y), batch_size=8)

In [147]:
class TwoLayerNet(torch.nn.Module):
    def __init__(self, D_in, H, D_out):
        super(TwoLayerNet, self).__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)
        
    def forward(self, x):
        h = self.linear1(x)
        h_relu = torch.nn.functional.relu(h)
        y_pred = self.linear2(h_relu)
        return y_pred

In [148]:
model = TwoLayerNet(D_in=1000, H=100, D_out=10)
model = model.to(device=device)

optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

In [149]:
model.named_parameters()

<generator object Module.named_parameters at 0x7f5eb0ef2180>

In [150]:
for epoch in range(1):
    for x_batch, y_batch in loader:
        y_pred = model(x_batch)
        loss = torch.nn.functional.mse_loss(y_pred, y_batch)
        
        print(loss.item())
        
        loss.backward()
        for name, param in model.named_parameters():
            if param.grad is not None:
                print(f'Parameter: {name}, Gradient norm: {param.grad.norm().item()}')
        optimizer.step()
        optimizer.zero_grad()

0.941053032875061
Parameter: linear1.weight, Gradient norm: 2.621346950531006
Parameter: linear1.weight, Gradient norm: 2.621346950531006
Parameter: linear1.bias, Gradient norm: 0.07075899094343185
Parameter: linear1.bias, Gradient norm: 0.07075899094343185
Parameter: linear2.weight, Gradient norm: 0.8423654437065125
Parameter: linear2.weight, Gradient norm: 0.8423654437065125
Parameter: linear2.bias, Gradient norm: 0.18831156194210052
Parameter: linear2.bias, Gradient norm: 0.18831156194210052
0.8698580861091614
Parameter: linear1.weight, Gradient norm: 2.808884620666504
Parameter: linear1.weight, Gradient norm: 2.808884620666504
Parameter: linear1.bias, Gradient norm: 0.08466572314500809
Parameter: linear1.bias, Gradient norm: 0.08466572314500809
Parameter: linear2.weight, Gradient norm: 0.8893697261810303
Parameter: linear2.weight, Gradient norm: 0.8893697261810303
Parameter: linear2.bias, Gradient norm: 0.1852453052997589
Parameter: linear2.bias, Gradient norm: 0.1852453052997589
1