# PyTorch Cheatsheet

[PyTorch Documentation](https://pytorch.org/docs/stable/index.html)

## Tensors

**Creating tensors**

In [4]:
import torch

# Create a float32 tensor from nested Python lists.
# torch.tensor() with an explicit dtype is used instead of the legacy
# torch.Tensor() constructor; the resulting values and dtype are the same.
t = torch.tensor([[1, 2, 3], [4, 5, 6]], dtype=torch.float32)
t

tensor([[1., 2., 3.],
        [4., 5., 6.]])

In [5]:
# Build a 5x3 matrix of zeros whose elements are 64-bit integers (torch.long)
x = torch.zeros((5, 3), dtype=torch.int64)
print(x)

tensor([[0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0],
        [0, 0, 0]])


In [6]:
# Sample a 3x3 tensor whose entries are drawn from the standard normal distribution
t = torch.randn((3, 3))
t

tensor([[-2.2548, -0.3908,  0.8081],
        [-0.4188,  0.7338,  0.3413],
        [-0.5131,  0.6303,  1.9887]])

**Inspecting tensors**

In [7]:
import torch
# A 1x1 matrix holding a single integer (torch.long) zero
x = torch.zeros(1, 1, dtype=torch.long)
x

tensor([[0]])

In [8]:
# Indexing both dimensions yields a zero-dimensional (scalar) tensor
x[0, 0]

tensor(0)

In [9]:
# Indexing only the first dimension yields the first row as a 1-D tensor
x[0]

tensor([0])

In [10]:
# Basic facts about a tensor: its shape, its rank and its (legacy) type name
t = torch.randn(3, 4, 2)
tensor_info = [
    ('Tensor shape:', t.size()),          # t.shape gives the same
    ('Number of dimensions:', t.ndim),    # same value as t.dim()
    ('Tensor type:', t.type()),           # there are other types
]
for label, value in tensor_info:
    print(label, value)


Tensor shape: torch.Size([3, 4, 2])
Number of dimensions: 3
Tensor type: torch.FloatTensor


In [11]:
print(t.dtype) # Element type of the data contained within the tensor (torch.float32 here).
print(t.device) # Device the tensor lives on, i.e. where its computations are performed (CPU or GPU).
print(t.layout) # How the tensor is stored in memory (torch.strided here).

torch.float32
cpu
torch.strided


In [12]:
# Total number of elements contained within the tensor:
# 3 * 4 * 2 = 24 for the (3, 4, 2) tensor created above
t.numel()

24

**Reshaping Tensors**

Reshaping changes the tensor's shape but not the underlying data

In [31]:
# A 3x4 tensor (12 elements) used by the reshape, view and squeeze examples below
t = torch.randn(3, 4)
t

tensor([[-1.2331,  1.4226, -0.3856,  0.5372],
        [-1.0866,  0.3230,  0.5670, -0.3185],
        [ 2.0182, -0.3583,  1.5225, -0.2749]])

In [32]:
# Same 12 elements, read in row-major order, laid out as 2 rows of 6
t.reshape([2,6])

tensor([[-1.2331,  1.4226, -0.3856,  0.5372, -1.0866,  0.3230],
        [ 0.5670, -0.3185,  2.0182, -0.3583,  1.5225, -0.2749]])

In [33]:
# Same 12 elements laid out as 6 rows of 2
t.reshape([6,2])

tensor([[-1.2331,  1.4226],
        [-0.3856,  0.5372],
        [-1.0866,  0.3230],
        [ 0.5670, -0.3185],
        [ 2.0182, -0.3583],
        [ 1.5225, -0.2749]])

In [27]:
# Same 12 elements laid out as 4 rows of 3
t.reshape([4,3])

tensor([[-1.8118, -1.0233, -0.3762],
        [ 0.9262,  1.9108, -0.5269],
        [ 0.9078, -0.5441,  0.2761],
        [-0.2772,  2.1320, -0.2285]])

In [28]:
# Reshaping to the tensor's current shape (3, 4) leaves the layout unchanged
t.reshape([3,4])

tensor([[-1.8118, -1.0233, -0.3762,  0.9262],
        [ 1.9108, -0.5269,  0.9078, -0.5441],
        [ 0.2761, -0.2772,  2.1320, -0.2285]])

In [29]:
 # The sizes can also be passed as separate arguments: a rank-3 tensor of shape (2, 2, 3)
 t.reshape(2,2,3)

tensor([[[-1.8118, -1.0233, -0.3762],
         [ 0.9262,  1.9108, -0.5269]],

        [[ 0.9078, -0.5441,  0.2761],
         [-0.2772,  2.1320, -0.2285]]])

**Resizing using torch.view**

In [56]:
# view() reinterprets the same 16 elements under a new shape
x = torch.randn((4, 4))
y = x.view(x.numel())   # flatten to a 1-D tensor of 16 elements
z = x.view(-1, 8)       # -1 is inferred from the other dimensions: 16 / 8 = 2
print(x.shape, y.shape, z.shape)

torch.Size([4, 4]) torch.Size([16]) torch.Size([2, 8])


In [57]:
# View again...
# `t` is the 3x4 tensor from the reshaping section; view its 12 elements as 6 rows of 2
x = t.view(6,2)
x.size()

torch.Size([6, 2])

In [59]:
# View the 6x2 tensor as 3 blocks of shape 2x2 (still 12 elements)
y = x.view(3, 2, 2)
print(y)

tensor([[[-1.2331,  1.4226],
         [-0.3856,  0.5372]],

        [[-1.0866,  0.3230],
         [ 0.5670, -0.3185]],

        [[ 2.0182, -0.3583],
         [ 1.5225, -0.2749]]])


In [60]:
# -1 is inferred as 2 (12 elements / 6 rows), which gives back the 6x2 layout
print(y.view(6, -1))

tensor([[-1.2331,  1.4226],
        [-0.3856,  0.5372],
        [-1.0866,  0.3230],
        [ 0.5670, -0.3185],
        [ 2.0182, -0.3583],
        [ 1.5225, -0.2749]])


**Squeezing and unsqueezing**

* Squeezing a tensor removes the dimensions or axes that have a length of one.

* Unsqueezing a tensor adds a dimension with a length of one.

* These functions allow us to expand or shrink the rank (number of dimensions) of our tensor

In [37]:
# Lay the 12 elements out as a single row, then squeeze away the length-1 axis
row_vector = t.reshape([1,12])
print(row_vector.shape)
squeezed = row_vector.squeeze()
print(squeezed)
print(squeezed.shape)

torch.Size([1, 12])
tensor([-1.2331,  1.4226, -0.3856,  0.5372, -1.0866,  0.3230,  0.5670, -0.3185,
         2.0182, -0.3583,  1.5225, -0.2749])
torch.Size([12])


In [63]:
# A rank-3 tensor of shape (3, 2, 3) for the unsqueeze / squeeze example
y = torch.randn(3,2,3)

In [64]:
print(y.size())
# Insert a new axis of length 1 at position 2: (3, 2, 3) -> (3, 2, 1, 3)
y3 = y.unsqueeze(2)
print(y3.size())
# squeeze() without arguments removes the length-1 axis again: back to (3, 2, 3)
print(y3.squeeze().size())

torch.Size([3, 2, 3])
torch.Size([3, 2, 1, 3])
torch.Size([3, 2, 3])


In [50]:
# NOTE(review): the saved output of this cell (a 3x6 integer tensor) does not match
# any `x` defined in the visible notebook — presumably left over from a deleted cell.
# Re-run the notebook from a fresh kernel or remove this cell.
x

tensor([[ 7, 17, 13,  2,  1, 17],
        [ 6, 13, 15, 17,  2,  5],
        [ 6, 12, 11,  8, 15,  0]])

**Transpose**

In [72]:
# A 3x2 matrix to transpose
x = torch.randn(3,2)
x

tensor([[ 2.1132,  0.3919],
        [ 2.1337, -0.3215],
        [ 0.1802, -0.0220]])

In [73]:
# Swap dimensions 0 and 1 (rows become columns)
y = x.transpose(0, 1)

In [74]:
# The 3x2 input has become a 2x3 tensor
y.shape

torch.Size([2, 3])

**Broadcasting**

See the PyTorch notes on [broadcasting semantics](https://pytorch.org/docs/stable/notes/broadcasting.html).

In [75]:
# Broadcasting: x has shape (3, 1) and y has shape (3, 2), so the single column
# of x is reused for every column of y and the sum has shape (3, 2)
x = torch.rand(3, 1)
y = torch.rand(3, 2)
for shown in (x, y, torch.add(x, y)):
    print(shown)

tensor([[0.7466],
        [0.2970],
        [0.2107]])
tensor([[0.1799, 0.3676],
        [0.2930, 0.1088],
        [0.1289, 0.3924]])
tensor([[0.9264, 1.1142],
        [0.5900, 0.4059],
        [0.3396, 0.6031]])


**Basic Tensor Operations**

**Matrix product**

In [57]:
# Matrix product of two 2-D tensors: (3, 2) @ (2, 3) -> (3, 3)
x = torch.rand(3, 2)
y = torch.rand(2, 3)
print(x @ y)

# Batched matrix product: each of the two (3, 2) matrices in x is multiplied
# by the matching (2, 3) matrix in y, giving a (2, 3, 3) result
x = torch.rand(2, 3, 2)
y = torch.rand(2, 2, 3)
print("m", torch.bmm(x, y))

# mm() is the plain 2-D matrix product: (2, 2) x (2, 1) -> (2, 1)
lhs = torch.Tensor([[2, 4], [5, 10]])
rhs = torch.Tensor([[10], [20]])
t = torch.mm(lhs, rhs)
t


tensor([[0.2104, 0.2674, 0.1711],
        [0.7613, 0.5942, 0.4864],
        [0.8272, 0.6107, 0.5162]])
m tensor([[[0.5248, 0.1521, 0.0502],
         [0.8608, 0.2549, 0.0874],
         [0.4297, 0.1748, 0.0879]],

        [[0.5194, 0.5558, 0.9434],
         [0.4951, 0.4243, 0.8227],
         [0.3243, 0.3828, 0.6149]]])


tensor([[100.],
        [250.]])

**Concatenating tensors**

We combine tensors using the cat() function, and the resulting tensor will have a shape that depends on the shape of the two input tensors.

In [77]:
# Two 2x2 integer matrices used by the concatenation examples
t1 = torch.tensor([[1, 2], [3, 4]])
t2 = torch.tensor([[5, 6], [7, 8]])
for matrix in (t1, t2):
    print(matrix)

tensor([[1, 2],
        [3, 4]])
tensor([[5, 6],
        [7, 8]])


In [78]:
# Combine row-wise: concatenating along dim 0 places the rows of t2 below
# those of t1, so two 2x2 tensors give a 4x2 result
torch.cat((t1, t2), dim=0)

tensor([[1, 2],
        [3, 4],
        [5, 6],
        [7, 8]])

In [79]:
# Combine column-wise: concatenating along dim 1 places the columns of t2 to
# the right of those of t1, so two 2x2 tensors give a 2x4 result
torch.cat((t1, t2), dim=1)

tensor([[1, 2, 5, 6],
        [3, 4, 7, 8]])

**Stacking tensors**

In [81]:
# Three 4x4 integer matrices filled with 1s, 2s and 3s respectively
t1, t2, t3 = (torch.tensor([[fill] * 4] * 4) for fill in (1, 2, 3))

In [82]:
# stack() joins the tensors along a new leading dimension:
# three 4x4 tensors become one tensor of shape (3, 4, 4)
t = torch.stack((t1, t2, t3))
t.shape

torch.Size([3, 4, 4])

**Extracting Tensor Values** 

In [39]:
# Slicing a 3x3 float matrix holding the values 1..9
t = torch.arange(1, 10, dtype=torch.float32).reshape(3, 3)

# Last column of every row
print(t[..., -1])

# Rows 0 and 1, every column
print(t[:2])

# Bottom-right corner; both indices are slices, so the result stays a 1x1 matrix
print(t[-1:, -1:])


tensor([3., 6., 9.])
tensor([[1., 2., 3.],
        [4., 5., 6.]])
tensor([[9.]])


In [None]:
# If you have a one-element tensor, use .item() to get its value as a plain Python number

x = torch.randn(1)
print(x)         # prints the tensor wrapper, e.g. tensor([...])
print(x.item())  # prints the bare Python float

**Max value**

In [37]:
# Build a 2x5 matrix holding 0..9.
# reshape() is used instead of resize_(): resize_() is a low-level in-place
# method, and the PyTorch docs recommend view()/reshape() for changing shape.
x = torch.arange(0, 10).reshape(2, 5)

# Return the largest value (k=1) along the last dimension and its position
topk, indices = torch.topk(x, 1)

print(x)
print(topk)
print(indices)

tensor([[0, 1, 2, 3, 4],
        [5, 6, 7, 8, 9]])
tensor([[4],
        [9]])
tensor([[4],
        [4]])


**PyTorch Tensor To and From Numpy ndarray**

You can easily create a tensor from an ndarray and vice versa. For tensors on the CPU these conversions are fast: the tensor and the ndarray share the same underlying memory, so no data is copied. As a consequence, modifying one of them also changes the other.


In [None]:

# Numpy ndarray <--> PyTorch tensor
import numpy as np

# ndarray -> tensor: from_numpy() wraps the array's memory instead of copying it
a = np.random.randn(3, 5)
t = torch.from_numpy(a)

# Show the values first, then the Python type of each object
for obj in (a, t):
    print(obj)
for obj in (a, t):
    print(type(obj))


### GPUs

PyTorch tensors have built-in GPU support. Using GPU memory and CUDA cores to store tensors and perform tensor calculations is easy: the `torch.cuda` package can tell you whether a GPU is available, and a tensor's `cuda()` method returns a copy of that tensor in GPU memory.


In [None]:
# Use the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Move the tensor to the selected device. Unlike an unconditional t.cuda(),
# which raises when CUDA is unavailable, this also runs on CPU-only machines.
t.to(device)

## Shuffling and batching data

In [None]:
from torch.utils.data import TensorDataset, DataLoader
# TensorDataset is a ready-to-use class that represents your data as a list of tensors.
# Note that the input features and the labels must have the same length along the first dimension.
# NOTE(review): X_train, Y_train, X_valid, Y_valid and batch_size are not defined in the
# visible part of this notebook — define them before running this cell.
train_set = TensorDataset(X_train, Y_train)
valid_set = TensorDataset(X_valid, Y_valid)

# DataLoader groups the samples into batches of batch_size. Only the training loader
# passes shuffle=True; the validation loader keeps the dataset order.
# No num_workers argument is passed here, so the DataLoader default applies; pass
# num_workers > 0 to load batches in parallel with worker processes.
train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(valid_set, batch_size=batch_size)

## Specifying a neural network


Within `__init__` we define the layers of the module.

Our three layers are an embedding layer, our RNN, and a linear layer. All layers have their parameters initialized to random values, unless explicitly specified.

The embedding layer is used to transform our sparse one-hot vector (sparse as most of the elements are 0) into a dense embedding vector (dense as the dimensionality is a lot smaller and all the elements are real numbers). This embedding layer is simply a single fully connected layer. As well as reducing the dimensionality of the input to the RNN, there is the theory that words which have a similar impact on the sentiment of the review are mapped close together in this dense vector space.

In [None]:
from torch import tensor
from torch import nn
from torch import sigmoid
import torch.nn.functional as F
import torch.optim as optim

# Training data and ground truth
# x_data: four samples with a single feature each, shape (4, 1)
# (presumably hours of study, judging by the messages printed after training — TODO confirm)
x_data = tensor([[1.0], [2.0], [3.0], [4.0]])
# y_data: one binary label (0. or 1.) per sample, same shape as x_data
y_data = tensor([[0.], [0.], [1.], [1.]])


class Model(nn.Module):
    """Logistic regression: a single linear unit followed by a sigmoid."""

    def __init__(self):
        """Initialise the base nn.Module and create the model's only layer."""
        super().__init__()
        # One input feature in, one output value out
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """
        Apply the linear layer and squash the result with a sigmoid.

        x is a tensor whose last dimension has size 1; the returned tensor has
        the same shape and holds values between 0 and 1.
        """
        logits = self.linear(x)
        return sigmoid(logits)


# our model
model = Model()

# Construct the loss function and the optimizer: binary cross-entropy on the
# model's sigmoid output, minimised with plain SGD. model.parameters() hands
# the optimizer the learnable parameters of the model's single nn.Linear layer.
criterion = nn.BCELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# Training loop
num_epochs = 1000
log_every = 100  # report the loss periodically instead of printing 1000 lines
for epoch in range(num_epochs):
    # Forward pass: compute the predicted y by passing x to the model
    y_pred = model(x_data)

    # Compute the loss; print it for the first epoch and then every log_every epochs
    loss = criterion(y_pred, y_data)
    if epoch == 0 or (epoch + 1) % log_every == 0:
        print(f'Epoch {epoch + 1}/{num_epochs} | Loss: {loss.item():.4f}')

    # Zero the gradients left from the previous step, back-propagate, update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

# After training: query the model for two inputs and compare with the 0.5 threshold
print(f'\nLet\'s predict the hours need to score above 50%\n{"=" * 50}')
hour_var = model(tensor([[1.0]]))
print(f'Prediction after 1 hour of training: {hour_var.item():.4f} | Above 50%: {hour_var.item() > 0.5}')
hour_var = model(tensor([[7.0]]))
print(f'Prediction after 7 hours of training: {hour_var.item():.4f} | Above 50%: { hour_var.item() > 0.5}')


### MLP

In [None]:
import torch
from torch import nn
from torch import sigmoid
import torch.nn.functional as F
import torch.optim as optim
# Training data and ground truth, one sample per row (shape (4, 1))
x_data = torch.tensor([1.0, 2.0, 3.0, 4.0]).reshape(-1, 1)
y_data = torch.tensor([0., 0., 1., 1.]).reshape(-1, 1)


class Model(nn.Module):
    """A single linear unit followed by a sigmoid.

    NOTE(review): this section is titled "MLP", but the model has no hidden
    layer and repeats the Model class defined earlier in the notebook.
    """

    def __init__(self):
        """Set up the base nn.Module and the single nn.Linear layer."""
        super().__init__()
        self.linear = nn.Linear(in_features=1, out_features=1)  # one in and one out

    def forward(self, x):
        """Apply the linear layer, then squash the result with a sigmoid."""
        return sigmoid(self.linear(x))


# our model
model = Model()

### RNN

In [None]:
class RNN(nn.Module):
    """Sequence classifier: embedding -> single-layer GRU -> dropout -> linear.

    The layer sizes are read from the notebook-level names vocab, embed_size,
    hidden_size and labels at construction time.
    """

    def __init__(self):
        super().__init__()

        # Lookup table holding one vector of size embed_size per vocabulary entry
        self.embed = nn.Embedding(num_embeddings=len(vocab), embedding_dim=embed_size)

        # Unidirectional, single-layer GRU. At every step it takes the embedding of
        # the current token (size embed_size) and the previous hidden state (size
        # hidden_size). batch_first=True: inputs are [batch size, sent len, emb dim].
        self.rnn = nn.GRU(
            input_size=embed_size,
            hidden_size=hidden_size,
            num_layers=1,
            bidirectional=False,
            batch_first=True,
        )

        # Dropout layer for regularisation
        self.dropout = nn.Dropout(p=0.3)

        # Fully connected layer mapping a hidden state to one score per distinct label
        self.decision = nn.Linear(in_features=hidden_size, out_features=len(set(labels)))

    def forward(self, x):
        """Return one score per class for every sequence in the batch.

        x holds token indices and has shape [batch size, max sent length];
        the result has shape [batch size, number of classes].
        """
        batch_size = x.size(0)

        # Retrieve the embedding of every token: [batch size, sent len, emb dim]
        embedded = self.embed(x)

        # The GRU returns the hidden state at every step (unused here; with
        # batch_first=True its shape is [batch size, sent len, hidden size]) and
        # the last hidden state: [num_layers * num_directions, batch size, hidden size],
        # i.e. [1, batch size, hidden size] for this configuration
        _, hidden = self.rnn(embedded)

        # Apply dropout (for regularisation) to the last hidden state
        dropped = self.dropout(hidden)

        # The linear layer expects [batch size, features]: bring the batch axis
        # to the front and flatten the remaining axes
        features = dropped.transpose(0, 1).contiguous().view(batch_size, -1)
        return self.decision(features)
    
# Instantiate the model and move its parameters to the target device.
# NOTE(review): `device`, and the vocab / embed_size / hidden_size / labels names
# that RNN() reads, are not defined in the visible part of this notebook —
# define them before running this cell.
rnn_model = RNN()
rnn_model.to(device)

### Embedding layer

* The nn.Embedding module holds a Tensor of dimension (vocab_size, embedding_size), i.e. of the size of the vocabulary x the dimension of each vector embedding, and a method for retrieving the embedding of a word. 


In [None]:
import torch
from torch import nn
# Create an embedding layer: a lookup table with one learnable vector per token index
# Parameters: (vocab_size, embedding_size)
embedding = nn.Embedding(1000, 128)
# Look up the embedding of the token represented by index 3.
# The index is built with torch.tensor() (an integer list gives an int64 tensor)
# rather than the legacy torch.LongTensor constructor.
token_index = torch.tensor([3])
embedding(token_index)

### Linear Layer

Parameters:

* in_features – size of each input sample
* out_features – size of each output sample

In [None]:
import torch
import torch.nn as nn

# x contains three inputs (i.e. the batch size is 3), each with two features
x = torch.tensor([[1.0, -1.0], [0.0, 1.0], [0.0, 0.0]])

# The layer's input size is the number of columns of x (= 2); it emits one value per sample
in_features = x.size(1)
out_features = 1

# nn.Linear maps an input of shape (batch_size, in_features)
# to an output of shape (batch_size, out_features)
m = nn.Linear(in_features=in_features, out_features=out_features)

# Apply the layer to the whole batch and display the result
y = m(x)
y