# pytorch

PyTorch is one of the most popular deep learning frameworks (see also Chainer). It supports imperative programming in Python. It is also possible to export models and import them in C++ code for efficiency (production code); the C++ API is very easy to learn. PyTorch is extremely user-friendly and has been among the most cited frameworks in recent years for computer vision and NLP research.

In [1]:
import torch

### Resources 
#### Tutorials
    1. https://pytorch.org/tutorials/
    2. fast.ai # tutorials built on pytorch

#### Awesome pytorch
    1. Just search for "awesome pytorch"; there is extensive community support across the internet
    2. https://github.com/eriklindernoren/PyTorch-GAN #Nice GAN implementations
#### Community (forum)
    1. https://discuss.pytorch.org/
    2. https://forums.fast.ai/ # fastai forum, excellent resource
#### API Reference
    1. https://pytorch.org/docs/stable/index.html

## Array creation routines

In [2]:
torch.tensor(((1,2,3),(5,6,7)))

tensor([[1, 2, 3],
        [5, 6, 7]])

In [3]:
# 2x3 tensor filled with ones (the sizes can also be passed as separate ints).
x = torch.ones(2, 3)
print(x)

tensor([[1., 1., 1.],
        [1., 1., 1.]])


In [4]:
# Other random creation routines:
#   torch.rand((2,3))   returns uniform samples in [0, 1)
#   torch.randn((2,3))  returns standard normal samples of shape (2,3) (mu=0, sigma=1)
# Here we allocate an uninitialised 2x3 tensor and fill it in place with
# uniform samples in [-1, 1). The trailing underscore in "uniform_" marks an
# in-place operation, which avoids creating an extra copy. torch.empty is used
# rather than the legacy torch.Tensor(2, 3) size constructor.
y = torch.empty(2, 3).uniform_(-1., 1.)
print(y)

tensor([[-0.2710, -0.5871,  0.7785],
        [ 0.3618, -0.5209,  0.1836]])


In [5]:
# 2x3 tensor in which every entry holds the same constant value.
x = torch.full(size=(2, 3), fill_value=2.0)
print(x)

tensor([[2., 2., 2.],
        [2., 2., 2.]])


In [6]:
x.shape, x.size(),x.dtype,x.device

(torch.Size([2, 3]), torch.Size([2, 3]), torch.float32, device(type='cpu'))

## Operations

In [7]:
x*y

tensor([[-0.5420, -1.1742,  1.5570],
        [ 0.7236, -1.0419,  0.3672]])

In [8]:
x+y

tensor([[1.3944, 2.1511, 1.6545],
        [1.7295, 2.3431, 1.0858]])

In [9]:
torch.exp(y)

tensor([[0.5458, 1.1631, 0.7078],
        [0.7630, 1.4093, 0.4008]])

In [10]:
# torch.dot applies only on 1D vectors
torch.mm(x,y.transpose(0,1)), torch.dot(x.view(-1),x.view(-1))

(tensor([[-1.6000, -1.6832],
         [-1.6000, -1.6832]]), tensor(24.))

## Indexing/slicing

In [11]:
y[1,2]

tensor(-0.9142)

In [12]:
y[:,1:3]

tensor([[ 0.1511, -0.3455],
        [ 0.3431, -0.9142]])

In [13]:
y[:,1:3]=2
print(y)

tensor([[-0.6056,  2.0000,  2.0000],
        [-0.2705,  2.0000,  2.0000]])


In [14]:
y[:1,:3]

tensor([[-0.6056,  2.0000,  2.0000]])

## Broadcasting

Operating between two arrays (tensors) of different dimensionality:
1. Follows numpy semantics
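2. Shapes are compared starting from the last (trailing) dimension; the tensors need not have the same rank, because the one with fewer dimensions is treated as if its shape were padded with leading 1s
3. Two dimensions are compatible if they are equal or if one of them is 1; the dimension that has value 1 is repeated along that axis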

In [15]:
# Broadcasting a (1, 3) row against a (3, 3) matrix: the row is repeated
# along the first axis.
x = torch.ones(3, 3)
y = torch.tensor([[0., 1., 2.]])

separator = "-----------------"
for label, value in (
    ('x = ', x),
    ('y = ', y),
    ('y.shape= ', y.shape),
    ('x + y = ', x + y),
    ('x * y = ', x * y),
):
    print(separator)
    print(label, value)

-----------------
x =  tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
-----------------
y =  tensor([[0., 1., 2.]])
-----------------
y.shape=  torch.Size([1, 3])
-----------------
x + y =  tensor([[1., 2., 3.],
        [1., 2., 3.],
        [1., 2., 3.]])
-----------------
x * y =  tensor([[0., 1., 2.],
        [0., 1., 2.],
        [0., 1., 2.]])


Make explicit the dimension along which you want to broadcast: below, y has shape (3, 1), so it is repeated along the second axis (the columns).

In [16]:
# Broadcasting a (3, 1) column against a (3, 3) matrix: the column is
# repeated along the second axis.
x = torch.ones(3, 3)
y = torch.tensor([[0.], [1.], [2.]])

separator = "-----------------"
for label, value in (
    ('x = ', x),
    ('y = ', y),
    ('y.shape= ', y.shape),
    ('x + y = ', x + y),
    ('x * y = ', x * y),
):
    print(separator)
    print(label, value)

-----------------
x =  tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])
-----------------
y =  tensor([[0.],
        [1.],
        [2.]])
-----------------
y.shape=  torch.Size([3, 1])
-----------------
x + y =  tensor([[1., 1., 1.],
        [2., 2., 2.],
        [3., 3., 3.]])
-----------------
x * y =  tensor([[0., 0., 0.],
        [1., 1., 1.],
        [2., 2., 2.]])


## Back and forth to numpy

In [17]:
a = x.numpy() # tensor -> NumPy array; the method is .numpy(), with no "as" prefix (it is not .asnumpy())
print(a)

[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]


In [18]:
# torch.tensor(a) copies the data of the NumPy array into a new tensor
x = torch.tensor(a)
# x = torch.from_numpy(a) # more efficient: the resulting tensor shares memory with `a` instead of copying it
print (x)

tensor([[1., 1., 1.],
        [1., 1., 1.],
        [1., 1., 1.]])


## Managing device (context) - CPU/GPU

### Find out how many GPUs exist

In [19]:
# This shows whether there are GPUs available in your system
print (torch.cuda.is_available()) # prints True if a GPU exists on your system AND is recognized (did you install correctly?)

# This is the count of available GPU devices
print(torch.cuda.device_count())

True
1


In [20]:
# Tensors are created on the CPU (RAM) by default.
a = torch.ones(2, 2)

# Move the tensor to the GPU only if one is available.
if torch.cuda.is_available():
    a = a.to(device='cuda')
    # a = a.to(device='cuda:0')  # same, naming the GPU index explicitly

# Alternative creation routines
# a = torch.ones((2,2)).cuda() # equivalent definition
# a = torch.ones(2,2).cuda(0) # here 0 represents the index of the gpu device
print(a)
print(a.device)

tensor([[1., 1.],
        [1., 1.]], device='cuda:0')
cuda:0


#### Alternative ways to copy a cpu tensor to gpu

In [21]:
# The plain forms a.cuda() and a.to(torch.device('cuda')) fail on a machine
# without a usable GPU, so every copy below is guarded.

# recommended 1: check availability right before the copy
if torch.cuda.is_available():
    a = a.cuda()
    a = a.to(torch.device('cuda'))  # equivalent spelling; `a` is already on the GPU here

# recommended 2, define a global device variable once (avoids multiple if checks):
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Any tensor can now be sent to whichever device was selected above.
c = torch.rand(3,3)
c = c.to(device)
print (c)

tensor([[0.9078, 0.5974, 0.0684],
        [0.1241, 0.4069, 0.1941],
        [0.3595, 0.5783, 0.7154]], device='cuda:0')


### In order to perform operations between tensors (vectors), both of them must live on the same device (context)

In [22]:
a = torch.rand((3,3)) # lives on CPU (default)
# a = torch.rand(3,3,device=torch.device('cpu')) # alternative definition, lives on cpu
print (a)
print(a.device)
b = torch.rand(3,3,device=torch.device('cuda')) # lives on GPU
# alternative definitions
#b = torch.rand(3,3).cuda()
#b = torch.rand(3,3).to(torch.device('cuda'))
print (b)
print (b.device)

tensor([[0.4699, 0.4209, 0.9168],
        [0.8787, 0.8552, 0.9404],
        [0.5054, 0.7507, 0.9644]])
cpu
tensor([[0.4838, 0.5177, 0.4177],
        [0.9930, 0.1040, 0.5883],
        [0.5516, 0.3947, 0.0138]], device='cuda:0')
cuda:0


In [23]:
# Correct operation 
print (a.cuda()+b) # copy a to G-pu, result lives on GPU
print (a+b.cpu()) # copy b to C-pu, result lives on CPU

tensor([[0.9538, 0.9385, 1.3345],
        [1.8717, 0.9592, 1.5287],
        [1.0571, 1.1454, 0.9782]], device='cuda:0')
tensor([[0.9538, 0.9385, 1.3345],
        [1.8717, 0.9592, 1.5287],
        [1.0571, 1.1454, 0.9782]])


In [24]:
# to err is to learn
a = a.to(torch.device('cpu'))
b = b.to(torch.device('cuda'))
print (a+b)

RuntimeError: expected type torch.FloatTensor but got torch.cuda.FloatTensor

## Some basic linear algebra

## Scalars

In [25]:
# A one-element tensor can be converted to a plain Python number with .item()
a = torch.tensor([3])
print ("This is a as an array:= ", a)
a = a.item() # extracts the single element as a Python scalar
# NOTE: the label below says "float", but the tensor holds the integer 3, so the printed value is the int 3
print ("This is a as a float:= ", a)

This is a as an array:=  tensor([3])
This is a as a float:=  3


## matrix - vector product (not broadcasting!)

In [26]:
# Operands for the product: a is a 3x5 matrix, b an explicit 5x1 column.
a = torch.rand((3, 5))
print("a= ", a)
b = torch.rand((5, 1))  # Compare with declaring explicitly
print("b= ", b)

a=  tensor([[0.0187, 0.6996, 0.3053, 0.9990, 0.9009],
        [0.7379, 0.9380, 0.5124, 0.4761, 0.7619],
        [0.2273, 0.0469, 0.7664, 0.3275, 0.0321]])
b=  tensor([[0.3260],
        [0.8682],
        [0.4630],
        [0.1604],
        [0.1782]])


In [27]:
# mm: matrix multiplication
result = torch.mm(a,b)
print ("result:= ",result)
print ("shape of output matrix: ", result.shape)

result:=  tensor([[1.0756],
        [1.5043],
        [0.5279]])
shape of output matrix:  torch.Size([3, 1])


**Remark**: in PyTorch the dot operation (`torch.dot`) applies only to 1-D tensors (vectors)

In [28]:
# Inner product of two random 1-D tensors of length 3.
a = torch.rand((3,))
b = torch.rand((3,))
result = a.dot(b)
print("a.b:= ", result)

a.b:=  tensor(0.5438)


### Compare with declaring matrix b only with its first dimension 

In [29]:
a = torch.rand((3, 5))
b = torch.rand((5, 1))  # Compare with declaring explicitly

print("initial b.shape= ", b.shape)
# Just like np.squeeze: drop the redundant size-1 dimension.
b = b.squeeze()
print("new shape of b: ", b.shape)

initial b.shape=  torch.Size([5, 1])
new shape of b:  torch.Size([5])


In [30]:
# In pytorch the operator mm does not allow matrix-vector product: b is 1-D
# at this point, so the call below is expected to raise a RuntimeError.
# (torch.mv(a, b) or torch.matmul(a, b) handle the matrix-vector case.)
result = torch.mm(a,b)
print ("result= ", result)
print ("shape of output matrix: ", result.shape)

RuntimeError: Dimension out of range (expected to be in range of [-1, 0], but got 1)

**Tip**: A handy operation in PyTorch is `torch.Tensor.view`, which changes the shape of a tensor (array)

In [31]:
print ("a.shape before:= ", a.shape)
a = a.view(-1) # flattens to 1-D without copying elements (comparable to np.ravel / reshape(-1); note that np.flatten always copies)
print ("a.shape after:= ",a.shape)

a.shape before:=  torch.Size([3, 5])
a.shape after:=  torch.Size([15])


## Automatic differentiation

We train neural networks with (modified versions of) gradient descent. Therefore we need a mechanism that automatically evaluates derivatives for us. 

In [32]:
x = torch.tensor([0.,1.,2.,3.])

In [33]:
print (x.shape)

torch.Size([4])


x.grad stores the value of the derivatives of functions that take x as input, with respect to x

In [34]:
type(x.grad)

NoneType

We need to explicitly declare that we require gradient evaluation

In [35]:
print (x.requires_grad )
x.requires_grad = True
print (x.requires_grad)

False
True


In PyTorch we do not explicitly tell the framework to record the computation graph; it is done automatically.
By default (i.e. without passing an explicit `gradient` argument), the backward function applies only to scalar (i.e. not tensor/array) objects. So we need to evaluate a scalar loss before calling it

In [36]:
y = x**2
y.backward() # This doesn't work, because y is not a scalar

RuntimeError: grad can be implicitly created only for scalar outputs

In [37]:
y = torch.dot(x,x) # This is a scalar, (x1^2+x2^2+x3^2+x4^2)
y.backward() # now this works

In [38]:
# The derivative is 2*x_i, for each index i=1,...,4
print (x.grad, 2*x)

tensor([0., 2., 4., 6.]) tensor([0., 2., 4., 6.], grad_fn=<MulBackward0>)


In [39]:
# A bit more complex - hey, did you zero the gradients?
z = x**2
y = torch.sum(z*z)
y.backward()

In [40]:
print (x.grad,4*x**3)

tensor([  0.,   6.,  36., 114.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)


# WHAT JUST HAPPENED?

In pytorch, by default, when a new gradient computation is called (f(x).backward()) the new gradients are added in place to the old gradients. We need to explicitly zero them before a new backward computation

In [41]:
# No zeroing between iterations: every backward() call adds onto x.grad,
# so after the first pass dy/dx != 4*x^3
x = torch.tensor([0., 1., 2., 3.], requires_grad=True)
for _ in range(3):
    z = x.pow(2)
    y = (z * z).sum()
    y.backward()
    print(x.grad, 4*x**3)

tensor([  0.,   4.,  32., 108.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)
tensor([  0.,   8.,  64., 216.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)
tensor([  0.,  12.,  96., 324.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)


In [42]:
# Gradients are reset after each iteration, so every pass yields dy/dx = 4*x^3
x = torch.tensor([0., 1., 2., 3.], requires_grad=True)
for _ in range(3):
    z = x.pow(2)
    y = (z * z).sum()
    y.backward()
    print(x.grad, 4*x**3)
    # In-place reset; otherwise the next backward() would accumulate.
    x.grad.zero_()

tensor([  0.,   4.,  32., 108.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)
tensor([  0.,   4.,  32., 108.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)
tensor([  0.,   4.,  32., 108.]) tensor([  0.,   4.,  32., 108.], grad_fn=<MulBackward0>)


## Suggested reading: autograd: 
https://pytorch.org/tutorials/beginner/blitz/autograd_tutorial.html#sphx-glr-beginner-blitz-autograd-tutorial-py