# PyTorch tutorials
Taken from the official website.

In [18]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

## Essentials
### Creating empty tensors

In [2]:
# Three factory functions, each taking the desired shape.
# torch.empty allocates without initialising, so the values it prints are arbitrary.
vuoto1 = torch.empty((5, 3))
# torch.rand fills the 5-by-3 tensor with random values.
random1 = torch.rand((5, 3))
# torch.zeros fills the 10-by-2 tensor with zeros.
zeros1 = torch.zeros((10, 2))

print(vuoto1, random1, zeros1, sep="\n")

tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 3.5831e-19,  1.4013e-45,  2.1390e-19],
        [ 4.5740e-41,  0.0000e+00,  0.0000e+00],
        [ 4.2803e-28, -1.5849e+29,  4.2804e-28]])
tensor([[0.7789, 0.7275, 0.5827],
        [0.5068, 0.0809, 0.3517],
        [0.4566, 0.9399, 0.5542],
        [0.1592, 0.8336, 0.0607],
        [0.4987, 0.9373, 0.2256]])
tensor([[0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.],
        [0., 0.]])


### Constructing a tensor from data

In [3]:
# Build tensors straight from a Python list; the dtype is inferred unless given.
raw_values = [10, 20.3, 2]

from_data1 = torch.tensor(raw_values)  # mixed int/float input -> floating-point tensor
from_data2 = torch.tensor(raw_values, dtype=torch.long)  # integer dtype: 20.3 is stored as 20
from_data3 = torch.tensor(raw_values, dtype=torch.double)  # 64-bit floating point

for built in (from_data1, from_data2, from_data3):
    print(built)

tensor([10.0000, 20.3000,  2.0000])
tensor([10, 20,  2])
tensor([10.0000, 20.3000,  2.0000], dtype=torch.float64)


### Creating tensors from existing tensors

In [4]:
# New tensors can also be derived from existing ones, reusing their properties.
# new_empty: uninitialised 10-by-2 tensor; the recorded output shows it keeps from_data3's float64 dtype.
from_data4 = from_data3.new_empty(10, 2)
print(from_data4)
# randn_like: random tensor with the same size as from_data1; the dtype is given explicitly here.
from_data5 = torch.randn_like(from_data1, dtype=torch.float)
print(from_data5)

# To obtain tensor size:
print(from_data5.size())
print(from_data4.size())

tensor([[-2.6816e+154, -2.6816e+154],
        [ 9.1503e+199,   2.6355e+92],
        [ 6.2920e+233,   6.0934e-13],
        [ 4.9428e+160,  4.8256e+276],
        [ 1.1132e+171,  2.6469e+180],
        [ 7.6876e+170,  2.5172e+180],
        [ 4.6568e+164,  4.9775e+151],
        [  2.3352e+30,  3.4351e+228],
        [ 1.0258e+200,  3.4521e+175],
        [ 7.0636e-308,  4.9407e-324]], dtype=torch.float64)
tensor([ 0.5048,  1.2861, -1.5151])
torch.Size([3])
torch.Size([10, 2])


## Operations on tensors

In [5]:
# Three equivalent ways of adding two tensors; all three prints show the same values.
random2 = torch.randn_like(random1)
print(random1 + random2)  # Sum of tensors, operator form
print(torch.add(random1, random2))  # Another way, explicit function call
risultato_somma = torch.empty_like(random1)  # Another way: pre-allocate the tensor that will store the result
torch.add(random1, random2, out=risultato_somma)
print(risultato_somma)
random3 = torch.randn_like(random1)
random3.add_(random2)  # In-place addition: adds random2 into random3 itself (the result is not printed here)

# Tensors support all the NumPy-like indexing facilities:
print(random1[:, 1])

# To reshape, it's possible to use the tensor's view() method:
x1 = torch.randn(4, 4)  # 4 by 4 random tensor, 16 elements
y1 = x1.view(16)  # the same elements arranged as a 1-D tensor of 16 elements
z1 = x1.view(-1, 8)  # 8 elements on the second dimension; -1 lets torch infer the first dimension (2)
z2 = x1.view(-1, 2, 2)  # 3-D tensor with 2 elements on the second and third dimensions; torch infers the first dimension (4)
print(x1.size(), y1.size(), z1.size())
print()
print(z2, z2.size())

tensor([[ 1.9989,  0.2214,  1.2303],
        [-0.8960, -0.3022, -1.1373],
        [-0.1791,  1.3076,  0.3348],
        [-0.0956,  0.3738,  0.4948],
        [ 0.6427,  1.1976,  0.3401]])
tensor([[ 1.9989,  0.2214,  1.2303],
        [-0.8960, -0.3022, -1.1373],
        [-0.1791,  1.3076,  0.3348],
        [-0.0956,  0.3738,  0.4948],
        [ 0.6427,  1.1976,  0.3401]])
tensor([[ 1.9989,  0.2214,  1.2303],
        [-0.8960, -0.3022, -1.1373],
        [-0.1791,  1.3076,  0.3348],
        [-0.0956,  0.3738,  0.4948],
        [ 0.6427,  1.1976,  0.3401]])
tensor([0.7275, 0.0809, 0.9399, 0.8336, 0.9373])
torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])

tensor([[[ 0.0109, -0.9613],
         [ 3.1196,  0.3278]],

        [[ 0.7942, -0.9987],
         [ 0.0810,  0.4879]],

        [[ 0.2968, -0.1284],
         [ 0.2168,  1.2693]],

        [[ 0.6893, -0.8886],
         [ 0.0332, -1.3138]]]) torch.Size([4, 2, 2])


The ```Tensor.view()``` method can be used to reshape a tensor: it returns the same elements arranged in the requested 
size/dimensions, and a dimension given as ```-1``` is inferred from the others.

In [6]:
# A one-element tensor can be unwrapped into a plain Python number with .item().
singleton1 = torch.randn(1)
plain_value = singleton1.item()
print(singleton1, plain_value, sep="\n")

tensor([0.9488])
0.9488030672073364


We can use the ```.item()``` method of a single-element tensor to extract its value as a plain Python number, as seen above.

## Numpy integration

In [7]:
# Tensor -> NumPy: convert a tensor of ones and show both objects.
a1 = torch.ones(10)
a1_numpy = a1.numpy()
print(a1, a1_numpy, sep="\n")

# Mutate only the tensor, in place, then show both again:
# the NumPy array reflects the change as well.
a1.add_(1)
print(a1, a1_numpy, sep="\n")

tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
tensor([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.])
[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]


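Any method that mutates its tensor in place (like ```add_``` in the example above) is post-fixed with a "_" symbol. The example also shows that the tensor and the NumPy array obtained from it share their data: changing one changes the other.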

In [8]:
# NumPy -> tensor: the opposite conversion goes through torch.from_numpy.
numpy_a1 = np.ones(10)
from_numpy_a1 = torch.from_numpy(numpy_a1)

# Increment the array in place; the tensor created before the change shows the new values too.
numpy_a1 += 1

print(numpy_a1, from_numpy_a1, sep="\n")

[2. 2. 2. 2. 2. 2. 2. 2. 2. 2.]
tensor([2., 2., 2., 2., 2., 2., 2., 2., 2., 2.], dtype=torch.float64)


## CUDA tensors
To take advantage of a GPU with CUDA support, we should create the tensors on, or move them to, the correct device, as shown below. 

In [9]:
# let us run this cell only if CUDA is available
# We will use ``torch.device`` objects to move tensors in and out of GPU
# NOTE: this cell relies on x1 from the "Operations on tensors" cell and rebinds it,
# so x1 refers to the tensor returned by .to(device) once the CUDA branch has run.
if torch.cuda.is_available():
    device = torch.device("cuda")          # a CUDA device object
    y = torch.ones_like(x1, device=device)  # directly create a tensor on GPU
    x1 = x1.to(device)                       # or just use strings ``.to("cuda")``
    z = x1 + y
    print(z)
    print(z.to("cpu", torch.double))       # ``.to`` can also change dtype together!
else:
    # Fallback so the notebook still runs end to end without a GPU.
    print("CUDA not available on this machine/setup")

CUDA not available on this machine/setup


# Autograd: automatic differentiation
By setting a tensor's attribute ```.requires_grad``` to ```True```, the operations done on the tensor will be tracked 
automatically. To halt the tracking, we can call the ```.detach()``` method and future operations on the tensor will not
be tracked. To temporarily perform untracked operations on a tensor that is being tracked, we can wrap the computations
within a ```with torch.no_grad():``` block.
Every tensor has a reference to the function that created it, ```.grad_fn``` (except for user-made tensors, 
whose ```grad_fn``` is ```None```).
All of this has to do with the fact that we are trying to find the optimal weights for several layers of neurons. In order
to efficiently do this, **the cost function needs to be properly back-propagated from the output layer all the way to the
input**. Automatic differentiation is an efficient algorithm to perform this task, and keeping track of every operation
performed on our tensors is essential for it to work properly.
Let's try this.

In [10]:
# requires_grad=True asks autograd to track the operations applied to x.
x = torch.ones(2, 2, requires_grad=True)
print(x)
# Let's perform an operation on this tensor.
y = x + 2
print(y)
# Given that the tensor y was created through an operation, it has a grad_fn attached.
print(y.grad_fn)
print(x.grad_fn)  # But a user-made tensor, on the other hand, has None as its grad_fn
# Chain more operations and reduce to a single value; the printed out carries a grad_fn of its own.
z = y * y * 3
out = z.mean()
print(x, out)

tensor([[1., 1.],
        [1., 1.]], requires_grad=True)
tensor([[3., 3.],
        [3., 3.]], grad_fn=<AddBackward0>)
<AddBackward0 object at 0x127cb3b90>
None
tensor([[1., 1.],
        [1., 1.]], requires_grad=True) tensor(27., grad_fn=<MeanBackward0>)


# Neural Networks
The ```nn``` module depends on ```autograd``` to define models and differentiate them. This module's purpose is to contain
layers and a method ```forward(input)``` that returns the ```output```.
Let's now define an example neural network: a small convolutional network (convnet) of the kind used to classify images of handwritten digits.

In [11]:
class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages followed by three linear layers.

    With the layer sizes below it expects input of shape (batch, 1, 32, 32)
    and returns a (batch, 10) tensor; no activation is applied after the last layer.
    """

    def __init__(self):
        super().__init__()
        # 1 input image channel -> 6 output channels, 3x3 square convolution kernel
        self.conv1 = nn.Conv2d(1, 6, 3)
        # 6 input channels -> 16 output channels, 3x3 square convolution kernel
        self.conv2 = nn.Conv2d(6, 16, 3)
        # 16 * 6 * 6 = 576 is the flattened size after the second pooling stage for a
        # 32x32 input: 32 -(conv 3x3)-> 30 -(pool 2)-> 15 -(conv 3x3)-> 13 -(pool 2)-> 6,
        # with 16 channels coming out of conv2.
        self.fc1 = nn.Linear(16 * 6 * 6, 120)
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the forward pass and return the output of the last linear layer."""
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # A square window can be specified with a single number
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten everything except the batch dimension before the linear layers
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the product of all dimensions of ``x`` except the first (batch) one.

        Returns 1 when ``x`` has no dimensions beyond the first.
        """
        # torch.Size.numel() multiplies the remaining dimensions for us.
        return x.size()[1:].numel()

net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


This defines the architecture of the net. We have to define the ```forward``` function, while the ```backward``` function will be 
automatically defined by ```autograd```.
We can then obtain the list of learnable parameters using the following syntax:

In [12]:
# Collect the learnable tensors of the network into a list,
# then show how many there are and the size of the first one.
params = [param for param in net.parameters()]
print(len(params), params[0].size(), sep="\n")

10
torch.Size([6, 1, 3, 3])


Now let's try a random 32x32 input. This net expects an input size of 32x32.

In [13]:
# Feed one random single-channel 32x32 input through the network.
my_input = torch.randn(1, 1, 32, 32)
out = net(my_input)
print(out)

net.zero_grad()  # Let's zero the gradient buffers
out.backward(torch.randn(1, 10))  # backpropagate, passing a random (1, 10) tensor as the gradient of the output

tensor([[-0.1405,  0.0683,  0.0055, -0.0666,  0.0289, -0.1163, -0.0818,  0.0975,
         -0.1403,  0.0360]], grad_fn=<AddmmBackward>)


## Recap
* ```torch.Tensor``` is a multi-dimensional array supporting ```autograd``` operations like ```backward()```; it also 
**holds the gradient** with regard to the tensor itself.
* ```(torch.)nn.Module``` contains the neural network module, convenient way of encapsulating the parameters with various
helpers.
* ```(torch.)nn.Parameter``` is a kind of tensor automatically registered as a parameter when assigned as an attribute
to a module.
* ```autograd.Function``` implements ```forward``` and ```backward``` definitions of an autograd operation. Every tensor
operation creates at least a single function node that connects to functions that created a tensor and encodes its history.

# Loss Function
A loss function takes a (output, target) pair of inputs and computes a value that estimates how far away the output is
from the target. The package ```nn``` contains several different loss functions: a simple loss is ```nn.MSELoss``` which
computes the mean-squared error between the input and the target.
Example:

In [14]:
# Forward pass, then score it against a made-up target with mean-squared error.
output = net(my_input)
target = torch.randn(10).view(1, -1)  # dummy target, reshaped to one row to match the output
criterion = nn.MSELoss()
loss = criterion(output, target)

print(loss)

tensor(1.0317, grad_fn=<MseLossBackward>)


If we follow ```loss``` in the backward direction using its ```.grad_fn``` attribute, we walk through a graph of 
all the computations from ```input``` to ```loss``` (the last being performed). Something similar to:

```
input -> conv2d -> relu -> maxpool2d -> conv2d -> relu -> maxpool2d
      -> view -> linear -> relu -> linear -> relu -> linear
      -> MSELoss
      -> loss
```

When calling ```loss.backward()```, the whole graph is differentiated with regard to the ```loss``` and all the tensors 
in the graph that have ```requires_grad=True``` will have their ```.grad``` tensor accumulated with the gradient. Let's
try this:

In [15]:
# Walk backwards through the autograd graph, one grad_fn at a time.
print(loss.grad_fn)  # the MSELoss node
print(loss.grad_fn.next_functions[0][0])  # one step back: the last linear layer (AddmmBackward in the recorded output)
# Another step back along the first input of that node. NOTE: the recorded output shows
# AccumulateGrad here, not ReLU -- which node comes first may depend on the PyTorch version.
print(loss.grad_fn.next_functions[0][0].next_functions[0][0])

<MseLossBackward object at 0x127d16510>
<AddmmBackward object at 0x127d16710>
<AccumulateGrad object at 0x127d16510>


# Backprop
In order to backpropagate the error, we just have to call ```loss.backward()```.

In [16]:
# An earlier cell already ran a backward pass on this network, so clear its gradients first.
net.zero_grad()  # Let's zero all the gradients again

# Show the same gradient buffer on either side of the backward pass.
print("conv1.bias.grad before backward")
print(net.conv1.bias.grad)

loss.backward()  # backpropagate the loss; conv1.bias.grad is filled in (see the second print)

print("conv1.bias.grad after backward")
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0033, -0.0021,  0.0218,  0.0212,  0.0193, -0.0135])


# Updating the weights
Now that we have computed the loss function, we can deal with how to update the weights. The simplest update rule
in practice is the **Stochastic Gradient Descent (SGD)**:
```weight = weight - learning_rate * gradient```. Let's try this.

In [17]:
learning_rate = 0.01
# Plain SGD step: weight = weight - learning_rate * gradient.
# The in-place update runs under torch.no_grad() so that autograd does not track it;
# this replaces going through `.data`, which hides the modification from autograd.
with torch.no_grad():
    for f in net.parameters():
        f.sub_(f.grad * learning_rate)

This works, but several ready-made update rules (such as SGD, Nesterov-SGD, Adam, RMSProp) are included in the ```torch.optim```
package. Let's try.

In [19]:
# Let's create an optimizer!
optimizer = optim.SGD(net.parameters(), lr=0.01)

# And then we would put the following in our training loop:
optimizer.zero_grad()  # clear the gradients left over from the previous backward pass
output = net(my_input)  # forward pass
loss = criterion(output, target)  # compare the output with the target
loss.backward()  # compute fresh gradients
optimizer.step()  # Actual update

# Training a classifier
boiadeh