In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [14]:
# Dummy batch of one single-channel 32x32 image, plus the two conv layers.
input = torch.randn(1, 1, 32, 32)
conv1 = nn.Conv2d(1, 6, 5)
conv2 = nn.Conv2d(6, 16, 5)

# Each stage is a 5x5 convolution followed by a 2x2 max-pool.
pooled1 = F.max_pool2d(conv1(input), (2, 2))
conv_out = F.max_pool2d(conv2(pooled1), 2)
conv_out.shape

torch.Size([1, 16, 5, 5])

In [15]:
# First fully connected layer: flatten everything but the batch dimension
# (16 channels * 5 * 5 spatial positions) and project to 120 features.
fc1 = nn.Linear(in_features=16 * 5 * 5, out_features=120)
flat_features = torch.flatten(conv_out, start_dim=1)
out1 = fc1(flat_features)
out1.shape

torch.Size([1, 120])

In [16]:
# Second fully connected layer (120 -> 10) followed by a ReLU.
fc2 = nn.Linear(in_features=120, out_features=10)
pre_activation = fc2(out1)
out2 = F.relu(pre_activation)
out2.shape

torch.Size([1, 10])

In [17]:
# Dummy target: 10 random values, reshaped to a single row so that it has the
# same (1, 10) layout as the network output.
target = torch.randn(10).view(1, -1)

# Mean-squared error between the network output and the dummy target.
criterion = nn.MSELoss()
loss = criterion(out2, target)
loss

tensor(0.6889, grad_fn=<MseLossBackward0>)

In [21]:
# Walk the autograd graph backwards from the loss, following the first input
# edge of each node, and print the three nodes visited.
loss_node = loss.grad_fn
print(loss_node)
first_parent = loss_node.next_functions[0][0]
print(first_parent)
second_parent = first_parent.next_functions[0][0]
print(second_parent)

<MseLossBackward0 object at 0x7f8d013158d0>
<ReluBackward0 object at 0x7f8d01317cd0>
<AddmmBackward0 object at 0x7f8d013142b0>


In [32]:
class Net(nn.Module):
    """Single fully connected layer (120 -> 50, float64 parameters) plus ReLU."""

    def __init__(self):
        super().__init__()
        # float64 parameters: inputs are expected to be float64 as well.
        self.fc = nn.Linear(120, 50, dtype=torch.float64)

    def forward(self, x):
        """Apply the linear layer followed by ReLU.

        Args:
            x: Tensor whose last dimension is 120.

        Returns:
            Tensor with the last dimension replaced by 50 and no negative
            entries.
        """
        return F.relu(self.fc(x))

    # The method was originally misspelled ``foward``, which meant calling the
    # module (``net(x)``) never reached it. Keep the old name as an alias so
    # any existing direct calls continue to work.
    foward = forward

In [33]:
# Instantiate the module and list the shape and dtype of every parameter.
net = Net()
for param in net.parameters():
    print(param.shape, param.dtype)

torch.Size([50, 120]) torch.float64
torch.Size([50]) torch.float64


In [None]:
import nltk

In [1]:
import numpy as np
import math

In [None]:
# 2000 evenly spaced sample points covering [-pi, pi] (both ends included).
x = np.linspace(start=-math.pi, stop=math.pi, num=2000)
(x.shape, x[0], x[-1])

((2000,), -3.141592653589793, 3.141592653589793)

In [11]:
# Target curve: sine evaluated at every sample point.
y = np.sin(x)
(y.shape, y[0], y[500])

((2000,), -1.2246467991473532e-16, -0.9999996912662218)

In [13]:
# Draw the four polynomial coefficients, in order a, b, c, d, from a standard
# normal distribution (one scalar draw per coefficient).
a, b, c, d = (np.random.randn() for _ in range(4))
a

0.0574660160049302

In [17]:
# Prediction of the cubic polynomial with the current coefficients, together
# with its summed squared error against the sine targets.
y_pred = a + b * x + c * x ** 2 + d * x ** 3
squared_error = np.square(y_pred - y).sum()
(y_pred.shape, squared_error)

((2000,), 198434.87185622856)

In [24]:
lr = 1e-6
for t in range(5000):
    # Forward pass: cubic polynomial prediction and summed squared error.
    y_pred = a + b * x + c * x ** 2 + d * x ** 3
    residual = y_pred - y
    loss = np.square(residual).sum()

    # Report the loss on every 1000th iteration.
    if (t + 1) % 1000 == 0:
        print(t, loss)

    # Backward pass: derivative of the loss with respect to each coefficient,
    # obtained by hand via the chain rule.
    grad_y_pred = 2.0 * residual
    grad_a = grad_y_pred.sum()
    grad_b = (grad_y_pred * x).sum()
    grad_c = (grad_y_pred * x ** 2).sum()
    grad_d = (grad_y_pred * x ** 3).sum()

    # Gradient-descent step on all four coefficients.
    a, b, c, d = (
        a - lr * grad_a,
        b - lr * grad_b,
        c - lr * grad_c,
        d - lr * grad_d,
    )

print(f'Result: y = {a:.3} + {b:.3} x + {c:.3} x^2 + {d:.3} x^3')

999 8.817165410007025
1999 8.817165410007025
2999 8.817165410007025
3999 8.817165410007025
4999 8.817165410007025
Result: y = 2.77e-17 + 0.857 x + -2.11e-17 x^2 + -0.0933 x^3


In [29]:
# Scalar (0-dim) tensor holding -1.0 that autograd will track.
# The dtype is spelled out as torch.float: the variable `dtype` is only
# assigned further down the notebook, so referencing it here fails with a
# NameError when the notebook is run top to bottom on a fresh kernel.
a = torch.full((), -1.0, device=torch.device("cpu"), dtype=torch.float, requires_grad=True)
a

tensor(-1., requires_grad=True)

In [30]:
import torch
import math


class LegendrePolynomial3(torch.autograd.Function):
    """
    Custom autograd Function evaluating 0.5 * (5 * x^3 - 3 * x) with a
    hand-written backward pass. Use it through ``LegendrePolynomial3.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Compute the polynomial element-wise on ``input`` and return the result.

        ``input`` is stored on ``ctx`` with ``save_for_backward`` so that
        ``backward`` can read it back when computing the gradient.
        """
        ctx.save_for_backward(input)
        cubic_term = 5 * input ** 3
        linear_term = 3 * input
        return 0.5 * (cubic_term - linear_term)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Given ``grad_output`` (gradient of the loss with respect to the
        output), return the gradient of the loss with respect to the input:
        ``grad_output * 1.5 * (5 * x^2 - 1)``.
        """
        (saved_input,) = ctx.saved_tensors
        scaled_upstream = grad_output * 1.5
        return scaled_upstream * (5 * saved_input ** 2 - 1)


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0")  # Uncomment this to run on GPU

# Sample grid and sine targets. These tensors keep the default
# requires_grad=False, so no gradients are computed with respect to them.
x = torch.linspace(-math.pi, math.pi, 2000, device=device, dtype=dtype)
y = torch.sin(x)

# Four scalar weights of the model y = a + b * P3(c + d * x). They start at
# fixed values rather than random ones; the starting point needs to be not too
# far from the correct result for the fit to converge. requires_grad=True
# makes autograd compute gradients with respect to each of them.
a = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
b = torch.full((), -1.0, device=device, dtype=dtype, requires_grad=True)
c = torch.full((), 0.0, device=device, dtype=dtype, requires_grad=True)
d = torch.full((), 0.3, device=device, dtype=dtype, requires_grad=True)

# Custom Functions are invoked through their .apply method; alias it as P3.
P3 = LegendrePolynomial3.apply

learning_rate = 5e-6
for t in range(2000):
    # Forward pass through the custom autograd operation.
    y_pred = a + b * P3(c + d * x)

    # Summed squared error, reported on every 100th iteration.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: fills .grad on a, b, c and d.
    loss.backward()

    # Plain gradient-descent step. The update itself must not be tracked by
    # autograd, and each gradient is cleared afterwards so that it does not
    # accumulate into the next iteration.
    with torch.no_grad():
        for weight in (a, b, c, d):
            weight -= learning_rate * weight.grad
            weight.grad = None

print(f'Result: y = {a.item()} + {b.item()} * P3({c.item()} + {d.item()} x)')

99 209.95834350585938
199 144.66018676757812
299 100.70249938964844
399 71.03519439697266
499 50.97850799560547
599 37.403133392333984
699 28.206867218017578
799 21.973188400268555
899 17.7457275390625
999 14.877889633178711
1099 12.93176555633545
1199 11.610918045043945
1299 10.714258193969727
1399 10.10548210144043
1499 9.692106246948242
1599 9.411375999450684
1699 9.220745086669922
1799 9.091286659240723
1899 9.003362655639648
1999 8.943641662597656
Result: y = -3.3247032904526463e-10 + -2.208526849746704 * P3(-2.395617870742939e-10 + 0.2554861009120941 x)


In [31]:
# -*- coding: utf-8 -*-
import torch
import math


class Polynomial3(torch.nn.Module):
    """Cubic polynomial y = a + b x + c x^2 + d x^3 with learnable coefficients."""

    def __init__(self):
        """
        Create the four scalar coefficients a, b, c, d as randomly initialised
        torch.nn.Parameter attributes, so that they are registered with the
        module and returned by ``parameters()``.
        """
        super().__init__()
        for coefficient in ("a", "b", "c", "d"):
            setattr(self, coefficient, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate the polynomial element-wise on the Tensor ``x`` and return
        the resulting Tensor.
        """
        linear_term = self.b * x
        quadratic_term = self.c * x ** 2
        cubic_term = self.d * x ** 3
        return self.a + linear_term + quadratic_term + cubic_term

    def string(self):
        """
        Return the fitted polynomial as a human-readable equation, using the
        current coefficient values.
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3'


# Sample grid over [-pi, pi] and the sine values the model should fit.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# The model under training.
model = Polynomial3()

# Summed squared error as the loss, minimised with plain SGD. Passing
# model.parameters() hands the optimizer the learnable members of the model
# (those defined with torch.nn.Parameter).
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-6)
for t in range(2000):
    # Forward pass and loss; report the loss on every 100th iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

99 1805.1796875
199 1207.095458984375
299 808.5635375976562
399 542.875244140625
499 365.6612854003906
599 247.39735412597656
699 168.4302215576172
799 115.67200469970703
899 80.4023666381836
999 56.809165954589844
1099 41.01624298095703
1199 30.437484741210938
1299 23.346179962158203
1399 18.589094161987305
1499 15.395310401916504
1599 13.249396324157715
1699 11.806288719177246
1799 10.835009574890137
1899 10.180669784545898
1999 9.73942756652832
Result: y = 0.018791040405631065 + 0.8327891826629639 x + -0.003241765545681119 x^2 + -0.08992347121238708 x^3


In [34]:
# A freshly created Parameter has data but no gradient yet (.grad is None).
a = torch.nn.Parameter(data=torch.randn(()))
(a.data, a.grad)

(tensor(0.3259), None)

In [42]:
import random
# Scratch check of the call used as the range() upper bound in
# DynamicNet.forward: random.randint(a, b) includes both endpoints, so this
# evaluates to 4, 5 or 6.
random.randint(4, 6)

5

In [43]:
# -*- coding: utf-8 -*-
import random
import torch
import math


class DynamicNet(torch.nn.Module):
    """
    Cubic polynomial with a randomly varying number of extra higher-order
    terms that all share the single coefficient ``e``.
    """

    def __init__(self):
        """
        Create the five scalar coefficients a, b, c, d, e as randomly
        initialised torch.nn.Parameter attributes.
        """
        super().__init__()
        for coefficient in ("a", "b", "c", "d", "e"):
            setattr(self, coefficient, torch.nn.Parameter(torch.randn(())))

    def forward(self, x):
        """
        Evaluate a + b x + c x^2 + d x^3 and then add ``e * x^k`` for every
        order k in ``range(4, random.randint(4, 6))``. Depending on the draw
        that means no extra term, the order-4 term only, or the order-4 and
        order-5 terms.

        The computation graph is rebuilt on every forward pass, so ordinary
        Python control flow such as this loop can be used here, and the same
        parameter (``e``) can appear several times in one graph.
        """
        cubic = self.a + self.b * x + self.c * x ** 2 + self.d * x ** 3
        stop_order = random.randint(4, 6)
        y = cubic
        for order in range(4, stop_order):
            y = y + self.e * x ** order
        return y

    def string(self):
        """
        Return the polynomial as a human-readable equation using the current
        coefficient values; the optional order-4 and order-5 terms are marked
        with a question mark.
        """
        return f'y = {self.a.item()} + {self.b.item()} x + {self.c.item()} x^2 + {self.d.item()} x^3 + {self.e.item()} x^4 ? + {self.e.item()} x^5 ?'


# Sample grid over [-pi, pi] and the sine values the model should fit.
x = torch.linspace(-math.pi, math.pi, 2000)
y = torch.sin(x)

# The model under training.
model = DynamicNet()

# Summed squared error as the loss. Training this strange model with vanilla
# stochastic gradient descent is tough, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(model.parameters(), lr=1e-8, momentum=0.9)
for t in range(30000):
    # Forward pass and loss; report the loss on every 2000th iteration.
    y_pred = model(x)
    loss = criterion(y_pred, y)
    if (t + 1) % 2000 == 0:
        print(t, loss.item())

    # Clear old gradients, backpropagate, then take one optimizer step.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

print(f'Result: {model.string()}')

1999 4138.115234375
3999 1923.0311279296875
5999 902.533203125
7999 429.1354675292969
9999 207.27027893066406
11999 102.66796875
13999 53.71826171875
15999 29.76695442199707
17999 19.074792861938477
19999 13.861791610717773
21999 11.00837516784668
23999 9.752821922302246
25999 9.414834976196289
27999 9.126008987426758
29999 8.98507308959961
Result: y = 0.012064521200954914 + 0.8532969355583191 x + -0.0027639928739517927 x^2 + -0.09325013309717178 x^3 + 0.00013554545876104385 x^4 ? + 0.00013554545876104385 x^5 ?


In [47]:
import math

# 784x10 weight matrix: standard-normal draws scaled by 1/sqrt(784), then
# flagged in place so autograd tracks gradients with respect to it.
scaled_init = torch.randn(784, 10) / math.sqrt(784)
weights = scaled_init.requires_grad_()
weights

tensor([[ 0.0126, -0.0498, -0.0186,  ..., -0.0418,  0.0047, -0.0299],
        [ 0.0378,  0.0375,  0.0543,  ..., -0.0771, -0.0421,  0.0119],
        [ 0.0012,  0.0659, -0.0649,  ..., -0.0226, -0.0375,  0.0742],
        ...,
        [ 0.0400, -0.0204, -0.0317,  ..., -0.0120, -0.0054,  0.0050],
        [-0.0429, -0.0169,  0.0157,  ...,  0.0318,  0.0110, -0.0339],
        [ 0.0092, -0.0273,  0.0347,  ...,  0.0091, -0.0041,  0.0499]],
       requires_grad=True)