# PyTorch Basics + Reshape meaning
Based on https://pytorch.org/tutorials/beginner/pytorch_with_examples.html

In [2]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

import torch

In [3]:
# Element type used for the hand-built tensors in the autograd example.
dtype = torch.float
# Prefer the first GPU when one is available, but fall back to the CPU so the
# notebook still runs end-to-end on machines without CUDA.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
# Problem dimensions shared by the examples in this notebook:
#   BS    - batch size
#   D_in  - input dimension
#   D_h   - hidden dimension
#   D_out - output dimension
BS = 64
D_in = 1000
D_h = 100
D_out = 10

## Autograd
Computing gradients by hand quickly becomes hairy and error-prone for a large network. Autograd in PyTorch alleviates this problem by automatically calculating and keeping track of the gradients for your tensors, provided they were created with `requires_grad=True`. Essentially, the gradient for a tensor `x` is stored in another tensor, `x.grad`, which holds the gradient of some scalar value (typically the loss) with respect to `x`.

In [5]:
# Random input and target tensors.
# They keep the default requires_grad=False: they are data, not parameters,
# so nothing needs to update them using gradients.
# Tensors are very much like NumPy arrays, except that they can live on the GPU.
x = torch.randn((BS, D_in), dtype=dtype, device=device)
y = torch.randn((BS, D_out), dtype=dtype, device=device)

In [6]:
# Random weight tensors for the two layers.
# These are the values being learned, so requires_grad=True tells autograd to
# compute their gradients during the backward pass.
w1 = torch.randn((D_in, D_h), dtype=dtype, device=device, requires_grad=True)
w2 = torch.randn((D_h, D_out), dtype=dtype, device=device, requires_grad=True)

In [7]:
# Step size for the manual gradient-descent updates of w1 and w2 in the loop below.
learning_rate = 1e-6

In [8]:
for t in range(500):
    # Forward pass: linear map, ReLU (expressed as a clamp at zero), linear map.
    h_relu = x.mm(w1).clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum-of-squared-errors loss, built purely from tensor operations.
    # The result is a zero-dimensional (scalar) tensor; loss.item() extracts
    # the plain Python number it holds.
    loss = (y_pred - y).pow(2).sum()
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: autograd computes the gradient of the loss with respect to
    # every involved tensor that has requires_grad=True and stores it in that
    # tensor's .grad field.
    loss.backward()

    # Gradient-descent step. The update itself must not be tracked by autograd,
    # hence torch.no_grad(). (The tensor.data field is an alternative: it shares
    # the tensor's storage but records no operation history.)
    with torch.no_grad():
        for weight in (w1, w2):
            weight.sub_(learning_rate * weight.grad)
            # Gradients accumulate across backward() calls, so reset them.
            # Torch functions ending in _ operate in place.
            weight.grad.zero_()

99 339.53570556640625
199 1.9644684791564941
299 0.016090137884020805
399 0.0003336144145578146
499 4.796755820279941e-05


## nn.Module
While autograd is powerful, it is a bit too low-level for most purposes. In PyTorch we use Modules, which are roughly equivalent to layers in a NN. Here is the same example as above, but using an `nn.Module`.

In [34]:
# Fresh random inputs and targets, created with PyTorch's default dtype and device.
x = torch.randn((BS, D_in))
y = torch.randn((BS, D_out))

In [35]:
x.dtype # no dtype was passed to torch.randn above, so this shows the default (float32)

torch.float32

In [36]:
# nn.Sequential is a Module that wraps other Modules and applies them in order.
# nn.Linear applies a linear function and owns the tensors for its weight and bias.
# Together: linear (D_in -> D_h), ReLU, linear (D_h -> D_out).
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=D_h),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=D_h, out_features=D_out),
)

In [37]:
# The nn package also contains definitions for popular loss functions.
# reduction='sum' requests the summed (rather than averaged) squared error.
loss_fn = torch.nn.MSELoss(reduction='sum')

In [38]:
# Step size for the manual parameter updates in the training loop below.
learning_rate = 1e-4

In [39]:
for t in range(500):
    # Forward pass. nn.Module implements __call__, so the model is invoked like
    # a function mapping an input tensor to an output tensor.
    y_pred = model(x)

    # Evaluate the loss and report it on every 100th iteration.
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Backward pass: fill in .grad for every learnable parameter of the model,
    # i.e. every tensor with requires_grad=True.
    loss.backward()

    # Gradient-descent step. model.parameters() returns an iterator over the
    # model's parameter tensors; the in-place update runs under no_grad so that
    # autograd does not track it.
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

    # Gradients accumulate across backward() calls, so clear them every iteration.
    model.zero_grad()

99 2.059399127960205
199 0.02680256962776184
299 0.0006534525891765952
399 2.0763705833815038e-05
499 7.760471589790541e-07


## Optim
Updating the weights manually is quite easy when we use a simple method such as stochastic gradient descent, but it becomes quite complex when we want to use more sophisticated optimizers such as AdaGrad, RMSProp, Adam, Adam_ann, etc.
PyTorch therefore provides an `optim` package, which abstracts the idea of an optimization algorithm and provides implementations of the commonly used ones.

In [88]:
# New random inputs and targets for the optimizer example.
x = torch.randn((BS, D_in))
y = torch.randn((BS, D_out))

In [89]:
# Fresh two-layer network: linear (D_in -> D_h), ReLU, linear (D_h -> D_out).
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=D_h),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=D_h, out_features=D_out),
)

In [90]:
# Summed (not averaged) squared-error loss between predictions and targets.
loss_fn = torch.nn.MSELoss(reduction="sum")

In [91]:
# Learning rate handed to the optimizer created in the next cell.
learning_rate = 1e-4

In [92]:
# The first argument of most optimizers is the collection of tensors they should
# update; here, all parameters of the model.
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [93]:
for t in range(500):
    # Forward pass and loss for the current parameters.
    y_pred = model(x)
    loss = loss_fn(y_pred, y)
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Clear the gradients left over from the previous iteration. Gradients
    # accumulate across backward() calls, so this must happen before the next
    # backward(); doing it right after optimizer.step() would work equally well.
    optimizer.zero_grad()

    # Backward pass: compute fresh gradients for all model parameters.
    loss.backward()

    # One optimizer step: update the parameters from their gradients.
    optimizer.step()
    
    

99 77.20761108398438
199 1.6605950593948364
299 0.014822077006101608
399 0.00010365447087679058
499 5.915860015193175e-07


## Custom nn Modules
We can also create custom Modules and use them with the PyTorch library.

In [97]:
class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU between them."""

    def __init__(self, D_in, D_h, D_out):
        """Create the layers: D_in -> D_h features, then D_h -> D_out features."""
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, D_h)
        self.linear2 = torch.nn.Linear(D_h, D_out)

    def forward(self, xb):
        """Map a batch of inputs `xb` to predictions.

        This is what runs when the model is used as a function, i.e. `model(x)`.
        """
        # clamp(min=0) zeroes the negative activations, acting as the ReLU.
        hidden = self.linear1(xb).clamp(min=0)
        return self.linear2(hidden)

In [98]:
# New random inputs and targets for the custom-module example.
x = torch.randn((BS, D_in))
y = torch.randn((BS, D_out))

In [99]:
# Instantiate the custom module with the notebook's input, hidden and output sizes.
model = TwoLayerNet(D_in, D_h, D_out)

In [100]:
# "Criterion" is, AFAIK, just another name for the loss function in an ML context.
# As before, reduction="sum" requests the summed squared error.
criterion = torch.nn.MSELoss(reduction="sum")

In [101]:
# SGD optimizer over the custom model's parameters, with learning rate 1e-4.
optimizer = torch.optim.SGD(model.parameters(), lr=1e-4)

In [102]:
for t in range(500):
    # Forward pass through the custom module, then the loss.
    y_pred = model(x)
    loss = criterion(y_pred, y)

    # Report the loss on every 100th iteration.
    if (t + 1) % 100 == 0:
        print(t, loss.item())

    # Reset old gradients, compute new ones, and update the parameters.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

99 2.6298742294311523
199 0.07662727683782578
299 0.0065430752001702785
399 0.0008591677178628743
499 0.00013513835438061506


## A Note on Shapes (NumPy / PyTorch)
NumPy arrays and, AFAIK, also PyTorch tensors consist of two primary parts.
- The **data buffer** which is just a block of raw elements,
- and a **view** which describes how to interpret the data buffer.

In [103]:
import numpy as np

In [114]:
# A flat array holding the integers 0 through 11.
a = np.arange(12)
a # the 'data buffer'

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [115]:
a.shape # the shape belongs to the 'view': it says how the buffer is indexed

(12,)

In [116]:
# But we can also create a different view:
# the same 12 elements, now indexed as 3 rows of 4 columns.
b = a.reshape((3, 4))

This *doesn't* alter the data buffer underneath, but it allows us to index it in a different way.
In this case, using two indices.

In [118]:
# Dimensions of length 1 do not change the element count (1*2*1*6*1 == 12).
c = a.reshape((1, 2, 1, 6, 1)) # Basically, any dimensions with length 1 are "free"