In [2]:
import torch

##### Pytorch usa tensors que son como Numpy Arrays pero que permiten usar GPU

In [8]:
# Build a tensor directly from a Python list of values
x = torch.tensor([5.5, 3])
x

tensor([5.5000, 3.0000])

In [10]:
# Build a new 5x3 tensor filled with ones, of dtype double (float64).
# new_ones is called on the existing x; the dtype is passed explicitly here.
x = x.new_ones(5, 3, dtype=torch.double)
x

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]], dtype=torch.float64)

In [11]:
# Build a new tensor of random values with the same dimensions as x.
# NOTE: dtype=torch.float is passed explicitly, so the result is float rather
# than double -- the recorded output accordingly shows no float64 marker.
x = torch.randn_like(x, dtype=torch.float)
x

tensor([[ 1.1481,  0.8755,  0.1508],
        [-2.4927, -0.5325,  0.5597],
        [-2.0421,  1.4451,  0.5499],
        [ 1.5889,  1.3446,  0.8928],
        [-1.5053,  1.1538, -1.4614]])

In [14]:
# x.add_(y) adds y to x and stores the result back into x.
# In general in PyTorch, a trailing _ on a method name means it replaces the original value.
# Operations also allow some flexibility with dimensions: y is 5x1 while x is 5x3,
# so the shapes are not equal but the sum still works. This is called "broadcasting".
# NOTE: because x is modified in place, running this cell again adds y one more time.
y = torch.ones(5,1)
x.add_(y)
x

tensor([[ 3.1481,  2.8755,  2.1508],
        [-0.4927,  1.4675,  2.5597],
        [-0.0421,  3.4451,  2.5499],
        [ 3.5889,  3.3446,  2.8928],
        [ 0.4947,  3.1538,  0.5386]])

"Any operation that mutates a tensor in-place is post-fixed with an _. For example: x.copy_(y), x.t_(), will change x"


In [18]:
# view() changes the dimensions of a tensor.
# In this example x is 4x4, y is a 1-D tensor with 16 elements and z is 2x8
# (the size -1 is inferred from the other dimensions).

x = torch.randn(4, 4)
y = x.view(16)
z = x.view(-1, 8)  # -1 is inferred as 2: 16 elements / 8 columns
# One print per statement: `print(y), print(z)` builds a tuple of the prints'
# return values, which makes the cell also display a stray `(None, None)`.
print(y)
print(z)

tensor([ 0.8942,  0.2657, -0.2630,  1.7027,  0.2278, -0.8865, -0.0218, -0.2573,
         0.1125,  0.4335, -0.7348, -0.0514, -0.5023, -0.8460, -0.3186, -1.0333])
tensor([[ 0.8942,  0.2657, -0.2630,  1.7027,  0.2278, -0.8865, -0.0218, -0.2573],
        [ 0.1125,  0.4335, -0.7348, -0.0514, -0.5023, -0.8460, -0.3186, -1.0333]])


(None, None)

In [22]:
# item() extracts the value of a tensor holding a single element as a Python number.
# In this example x is 4x4, y is a tensor containing
# the a_11 entry of x, and z is the plain value of a_11.

x = torch.rand(4,4)
y = x[0,0]
z = y.item()

# One print per statement: `print(x), print(z)` builds a tuple of the prints'
# return values, which makes the cell also display a stray `(None, None)`.
print(x)
print(z)

tensor([[0.9758, 0.9461, 0.7154, 0.2642],
        [0.5464, 0.9209, 0.2802, 0.2258],
        [0.4051, 0.2930, 0.6805, 0.7186],
        [0.5035, 0.8637, 0.4811, 0.7705]])
0.9757923483848572


(None, None)

In [26]:
# numpy() converts a Tensor into a NumPy array.
# y is not a copy of x: it IS x, seen as a NumPy array.
# IMPORTANT: any in-place change to x also affects y.

x = torch.rand(4,4)
y = x.numpy()
# One print per statement: chaining the prints with commas builds a tuple of
# their return values, which makes the cell also display `(None, None, None, None)`.
print(x)
print(type(x))
print(y)
print(type(y))

tensor([[0.6285, 0.3848, 0.2676, 0.0810],
        [0.6853, 0.5309, 0.6695, 0.5338],
        [0.3306, 0.0384, 0.1694, 0.4270],
        [0.2405, 0.1109, 0.7073, 0.7763]])
<class 'torch.Tensor'>
[[0.628523   0.38484317 0.2676205  0.08104604]
 [0.68534005 0.5308684  0.66948014 0.533823  ]
 [0.3305546  0.03840983 0.16943514 0.4270267 ]
 [0.24050122 0.11094588 0.70730984 0.77629733]]
<class 'numpy.ndarray'>


(None, None, None, None)

## AutoGrad

"torch.Tensor is the central class of the package. If you set its attribute .requires_grad as True, it starts to track all operations on it. When you finish your computation you can call .backward() and have all the gradients computed automatically. The gradient for this tensor will be accumulated into .grad attribute."

In [41]:
# Because requires_grad=True, every operation on x and on its "children" is tracked.
# grad_fn tells which kind of function produced the tensor; for y, the function
# that produced it was an addition.
x = torch.tensor([1.], requires_grad=True)
y = x + 2
print(y)

tensor([3.], grad_fn=<AddBackward0>)


In [42]:
# A few more computations on top of y...
z = y * y * 3
print(z)

tensor([27.], grad_fn=<MulBackward0>)


In [43]:
# backward() computes the gradient all the way back to the origin (x in this case)
# and stores it in x.grad
z.backward()

In [45]:
# The function that maps x to z is 3(x+2)^2, whose gradient with respect to x is
# 6(x+2). Since x is 1, this evaluates to 18.
x.grad

tensor([18.])

In [52]:
# Now the case of a scalar field (R³ -> R)
x = torch.tensor([1.,2.,3.], requires_grad = True)
y = x*10
z = y.mean()
z

tensor(20., grad_fn=<MeanBackward0>)

In [53]:
# For a scalar field over a 3-dimensional input, the gradient is a vector
# of dimension 3.
# Assuming z is still the mean of 10*x from the previous cell, each component
# of the gradient should be 10/3.
z.backward()
x.grad

In [11]:
# Now the case of a vector field (R³ -> R³)
x = torch.rand(3, requires_grad = True)
y = x*x
y

tensor([0.3067, 0.0051, 0.0071], grad_fn=<MulBackward0>)

In [12]:
# Backpropagating directly does not work here, because it can only be done implicitly
# for functions whose output has dimension 1. Running y.backward() would report:
# RuntimeError: grad can be implicitly created only for scalar outputs
# Since y.backward() works with the Jacobian of the function that maps x to y,
# it must be given a vector to multiply with the Jacobian so that it returns another vector.

l = torch.tensor([1.,1.,2.])  # arbitrary vector made up for this example
y.backward(l)

In [13]:
# x.grad is now available: it is the derivative of the function that maps x to y,
# combined with the vector l that was passed to backward() (the original note
# describes it as the derivative in the direction of l).
x.grad

tensor([1.1076, 0.1427, 0.3382])

## Neural Nets

An nn.Module contains layers, and a method forward(input) that returns the output.

In [14]:
import torch
import torch.nn as nn
import torch.nn.functional as F


class Net(nn.Module):
    """Small convolutional network: two conv + max-pool stages, then three linear layers.

    ``fc1`` expects 16 * 6 * 6 = 576 flattened features. With 3x3 kernels and
    2x2 pooling, that is what a 1-channel 32x32 input produces
    (32 -> 30 -> 15 -> 13 -> 6 per spatial side).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 6, 3)  # 1 input channel -> 6 output channels, 3x3 kernel
        self.conv2 = nn.Conv2d(6, 16, 3)  # 6 input channels -> 16 output channels, 3x3 kernel
        # an affine operation: y = Wx + b
        self.fc1 = nn.Linear(16 * 6 * 6, 120)  # 6*6 from image dimension
        self.fc2 = nn.Linear(120, 84)
        self.fc3 = nn.Linear(84, 10)

    def forward(self, x):
        """Run the network on a batch ``x`` and return the 10 output values per sample.

        ``x`` must be a 4D tensor of nSamples x nChannels x Height x Width.
        """
        # Max pooling over a (2, 2) window
        x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
        # If the window is square, a single number is enough to specify it
        x = F.max_pool2d(F.relu(self.conv2(x)), 2)
        # Flatten each sample's feature maps into one vector
        x = x.view(-1, self.num_flat_features(x))
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

    def num_flat_features(self, x):
        """Return the number of elements per sample, i.e. the product of all
        dimensions of ``x`` except the first (batch) dimension."""
        # torch.Size.numel() is the product of its entries (1 for an empty Size)
        return x.size()[1:].numel()


# Instantiate the network and print its layer summary
net = Net()
print(net)

Net(
  (conv1): Conv2d(1, 6, kernel_size=(3, 3), stride=(1, 1))
  (conv2): Conv2d(6, 16, kernel_size=(3, 3), stride=(1, 1))
  (fc1): Linear(in_features=576, out_features=120, bias=True)
  (fc2): Linear(in_features=120, out_features=84, bias=True)
  (fc3): Linear(in_features=84, out_features=10, bias=True)
)


torch.nn only supports mini-batches. The entire torch.nn package only supports inputs that are a mini-batch of samples, and not a single sample.

For example, nn.Conv2d will take in a 4D Tensor of nSamples x nChannels x Height x Width.

If you have a single sample, just use input.unsqueeze(0) to add a fake batch dimension.

In [36]:
# net.parameters() returns the parameters (weights) of the network.
params = list(net.parameters())
print(params[0].size())    # weights of the first convolutional layer
print(params[2].size())    # weights of the second convolutional layer
print(params[4].size())    # weights of the first linear layer

torch.Size([6, 1, 3, 3])
torch.Size([16, 6, 3, 3])
torch.Size([120, 576])


In [52]:
# Try feeding the network a random input.
# Remember that the input it accepts must have dimensions
# nSamples x nChannels x Height x Width.
# NOTE(review): the name `input` shadows the Python builtin; it is kept because
# later cells read this variable.
input = torch.randn(1, 1, 32, 32)
out = net(input)   # same as calling net.forward(input)
print(out)

tensor([[ 0.0144,  0.0273, -0.0275, -0.1011,  0.0435,  0.0849, -0.0759,  0.0107,
          0.1155, -0.0927]], grad_fn=<AddmmBackward>)


In [53]:
# Zero the gradient buffers of all parameters, then backpropagate using a random
# 1x10 tensor as the gradient passed to out.backward():
net.zero_grad()
out.backward(torch.randn(1, 10))

## Loss

In [105]:
# Forward pass, then build a dummy target and reshape it to match the output.
# Each tensor is printed together with its size.
output = net(input)
print(output)
print(output.size())
print("\n")
target = torch.randn(10)     # a dummy target, for example
print(target)
print(target.size())
print("\n")
target = target.view(1, -1)  # make it the same shape as output
print(target)
print(target.size())

tensor([[ 0.0144,  0.0273, -0.0275, -0.1011,  0.0435,  0.0849, -0.0759,  0.0107,
          0.1155, -0.0927]], grad_fn=<AddmmBackward>)
torch.Size([1, 10])


tensor([-1.1819, -1.1137, -0.1241,  1.8570,  2.1434,  0.9439,  0.5108,  0.8496,
        -1.7820, -0.1530])
torch.Size([10])


tensor([[-1.1819, -1.1137, -0.1241,  1.8570,  2.1434,  0.9439,  0.5108,  0.8496,
         -1.7820, -0.1530]])
torch.Size([1, 10])


In [106]:
# Mean-squared-error loss between the network output and the target
criterion = nn.MSELoss()
loss = criterion(output, target)
print(loss)

tensor(1.6376, grad_fn=<MseLossBackward>)


In [107]:
# Walk backwards through the graph recorded by autograd, starting at the loss
print(loss.grad_fn)  # MSELoss
print(loss.grad_fn.next_functions[0][0])  # Linear (appears as AddmmBackward in the recorded output)
print(loss.grad_fn.next_functions[0][0].next_functions[0][0]) # not ReLU: the recorded output shows AccumulateGrad

<MseLossBackward object at 0x7f4a9574a5c0>
<AddmmBackward object at 0x7f4a9574a588>
<AccumulateGrad object at 0x7f4a9574a5c0>


## BackPropagation

To backpropagate the error all we have to do is call loss.backward(). You need to clear the existing gradients first, though; otherwise the new gradients will be accumulated into the existing ones.

Now we shall call loss.backward(), and have a look at conv1’s bias gradients before and after the backward.

In [108]:
# Compare conv1's bias gradient before and after backpropagating the loss
net.zero_grad()     # zeroes the gradient buffers of all parameters

print('conv1.bias.grad before backward')
print(net.conv1.bias.grad)

loss.backward()

print('conv1.bias.grad after backward')
print(net.conv1.bias.grad)

conv1.bias.grad before backward
tensor([0., 0., 0., 0., 0., 0.])
conv1.bias.grad after backward
tensor([ 0.0142, -0.0011, -0.0067, -0.0109, -0.0019, -0.0094])


In [140]:
# Note that net.parameters() holds the parameters (weights) in .data, and also
# holds the gradients in .grad
# NOTE(review): index 3 selects the fourth parameter tensor, which is not necessarily
# the fourth layer -- presumably conv2's bias, given that indices 0, 2 and 4 were
# printed earlier as layer weights; confirm.
print("Pesos de la capa 4")
print(list(net.parameters())[3].data)
print("\n")
print("Gradiente de la capa 4")
print(list(net.parameters())[3].grad)

Pesos de la capa 4
tensor([-0.0699, -0.2959, -0.4610,  0.0054, -0.3026, -0.0740, -0.1181, -0.1369,
        -0.1198,  0.0096, -0.3139,  0.0485, -0.1253, -0.0129, -0.0891, -0.2784])


Gradiente de la capa 4
tensor([ 0.1128,  0.4082,  0.4433,  0.0034,  0.3045,  0.2302,  0.0121,  0.1037,
         0.0080,  0.0000,  0.2605,  0.0000,  0.1816,  0.0101, -0.0068,  0.4309])


## Optimizers

The simplest update rule used in practice is the Stochastic Gradient Descent (SGD):

weight = weight - learning_rate * gradient

However, as you use neural networks, you want to use various different update rules such as SGD, Nesterov-SGD, Adam, RMSProp, etc. To enable this, we built a small package: torch.optim that implements all these methods. Using it is very simple:

In [139]:
import torch.optim as optim

# create your optimizer
optimizer = optim.SGD(net.parameters(), lr=0.1)

# in your training loop:
optimizer.zero_grad()   # zero the gradient buffers
output = net(input)     # Forward pass
loss = criterion(output, target)    # compute the loss for this mini-batch
loss.backward()         # compute the backward pass
optimizer.step()        # update the weights