In [2]:
# -*- coding: utf-8 -*-
import torch
import math


# Sample 2000 evenly spaced points on [-pi, pi]; the regression target is sin(x).
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# Build the feature matrix with columns (x, x^2, x^3): x gains a trailing axis
# and is raised to every exponent in p via broadcasting.
p = torch.tensor([1, 2, 3])
xx = torch.pow(x[:, None], p)

# One linear layer maps the three polynomial features to a single output;
# Flatten(0, 1) then merges the first two dimensions so the prediction is 1-D
# like y.
model = torch.nn.Sequential(torch.nn.Linear(3, 1), torch.nn.Flatten(0, 1))
loss_fn = torch.nn.MSELoss(reduction='sum')

# The optimizer owns the weight updates. RMSprop is used here (torch.optim
# offers many other algorithms); its first argument lists the tensors to update.
learning_rate = 1e-3
optimizer = torch.optim.RMSprop(params=model.parameters(), lr=learning_rate)

for t in range(2000):
    # Gradients are accumulated (not overwritten) by .backward(), so reset
    # them before this iteration's backward pass.
    optimizer.zero_grad()

    # Forward pass and summed squared error against the targets.
    y_pred = model(xx)
    loss = loss_fn(y_pred, y)

    # Report progress on every 100th step (t = 99, 199, ...).
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass fills the parameter gradients; step() applies the update.
    loss.backward()
    optimizer.step()


# The fitted cubic lives in the first (and only trainable) layer.
linear_layer = model[0]
print(
    f'Result: y = {linear_layer.bias.item()}'
    f' + {linear_layer.weight[:, 0].item()} x'
    f' + {linear_layer.weight[:, 1].item()} x^2'
    f' + {linear_layer.weight[:, 2].item()} x^3'
)

99 3502.250244140625
199 1666.557861328125
299 1427.4190673828125
399 1292.9361572265625
499 1131.565673828125
599 958.1602783203125
699 790.1309814453125
799 636.689453125
899 500.6023864746094
999 382.4841003417969
1099 282.303955078125
1199 199.62237548828125
1299 133.82302856445312
1399 83.97315979003906
1499 48.941253662109375
1599 26.49860954284668
1699 14.60747241973877
1799 10.011432647705078
1899 9.011791229248047
1999 8.927379608154297
Result: y = 0.0005028784507885575 + 0.8550434112548828 x + 0.0005029808962717652 x^2 + -0.09366913884878159 x^3


In [129]:
import torch
from torch.autograd import Variable
from torch.nn import functional as F

# Squared-error loss; reduction='sum' adds up the per-element squared errors
# instead of averaging them.
loss_fn = torch.nn.MSELoss(reduction='sum')

class MLP(torch.nn.Module):
    """Small fully connected network: 2 -> 50 -> 16 -> 1 with ReLU in between."""

    def __init__(self):
        super().__init__()
        self.linear1 = torch.nn.Linear(2, 50)
        self.linear2 = torch.nn.Linear(50, 16)
        self.linear3 = torch.nn.Linear(16, 1)

    def forward(self, x):
        """Run the network and also expose both hidden activations.

        Returns a 3-tuple ``(out, layer1_out, layer2_out)``: the final linear
        output followed by the ReLU activations of the first and second layers.
        """
        hidden1 = self.linear1(x).relu()
        hidden2 = self.linear2(hidden1).relu()
        return self.linear3(hidden2), hidden1, hidden2

# NOTE: batchsize is not referenced anywhere in this cell; the batch built
# below actually holds 500 samples. Kept in case other cells read it.
batchsize = 1000
# Weights of the L1 penalty (on linear1) and the L2 penalty (on linear2).
lambda1, lambda2 = 0.1, 0.1

model = MLP()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

# Usually the following code is looped over all batches,
# but let's just use a single dummy batch for brevity.

# 1000 uniform random numbers reshaped to two columns -> 500 samples of
# 2 features; each target is the sum of the sample's two features.
inputs = torch.rand(1000).reshape(-1, 2)
targets = inputs.sum(dim=1).reshape(-1, 1)

# Per-iteration history of the (unregularised) summed squared error.
mse = []
for t in range(2000):
    optimizer.zero_grad()
    outputs, layer1_out, layer2_out = model(inputs)
    mse_loss = loss_fn(outputs, targets)

    # Flatten weight and bias of each layer into one vector so a single norm
    # covers all of that layer's parameters.
    all_linear1_params = torch.cat([param.view(-1) for param in model.linear1.parameters()])
    all_linear2_params = torch.cat([param.view(-1) for param in model.linear2.parameters()])

    l1_regularization = lambda1 * torch.norm(all_linear1_params, 1)
    l2_regularization = lambda2 * torch.norm(all_linear2_params, 2)

    # Store a detached copy: appending the live tensor would keep every
    # iteration's autograd graph reachable and grow memory with each step.
    mse.append(mse_loss.detach())

    loss = mse_loss + l1_regularization + l2_regularization
    loss.backward()
    optimizer.step()

In [120]:
# Column slices of linear2's weight matrix (the first tensor yielded by
# linear2.parameters()). Only the last expression is displayed by the notebook.
model.linear2.weight[:, 0]
model.linear2.weight[:, 1:]
model.linear2.weight[:, :3]

tensor([ 0.0593,  0.1378,  0.0661, -0.1313,  0.0464, -0.0451, -0.0467,  0.0934,
        -0.0864,  0.0827,  0.1078,  0.0044, -0.0883, -0.1276, -0.0286, -0.0314],
       grad_fn=<SelectBackward0>)

In [126]:
# First column of linear1's weight matrix, i.e. the weights every hidden unit
# applies to input feature 0.
model.linear1.weight[:, 0]

tensor([-0.1681,  0.2755, -0.1224, -0.2722,  0.6572,  0.1530,  0.1879, -0.2204,
         0.6531, -0.3459,  0.0753, -0.3451,  0.4316, -0.1994,  0.1465,  0.3100,
         0.0160, -0.3059,  0.4430,  0.1239, -0.5761,  0.0114, -0.2714,  0.5075,
        -0.3019, -0.5241,  0.1346, -0.3772,  0.3538,  0.1654,  0.0055,  0.2951,
         0.3105,  0.6046, -0.2313,  0.0493,  0.1271,  0.4579, -0.5893, -0.4037,
         0.2082, -0.5880,  0.3097,  0.6044,  0.5200, -0.1111, -0.5425,  0.6961,
        -0.6421,  0.5669], grad_fn=<SelectBackward0>)

In [130]:
# Fresh random batch: 1000 uniform values viewed as two columns, with each
# target equal to the sum of its row's two features.
test_inputs = torch.rand(1000).view(-1, 2)
test_targets = test_inputs.sum(dim=1, keepdim=True)

# Mean squared error of the model's prediction (first element of the tuple
# returned by the forward pass) on the new batch.
F.mse_loss(input=model(test_inputs)[0], target=test_targets)

tensor(3.8447e-05, grad_fn=<MseLossBackward0>)