In [10]:
# These commands will install the necessary packages and their dependencies in the current env.

# uncomment the following line to install the packages in the current env

!pip install numpy pandas urllib3 fastprogress
!pip install torch==2.2.0 #+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchvision==0.17.0 #+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchaudio==2.2.0 #+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchtext==0.16.2 #+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchdata==0.7.1 #+cu121 -f https://download.pytorch.org/whl/torch_stable.html
!pip install torchmetrics==0.11.4




# Introduction to PyTorch and Autograd

This Jupyter Notebook provides an introduction to PyTorch's autograd functionality and gradient computation. It covers various topics including:

- Basic tensor operations and conversions between NumPy arrays and PyTorch tensors.
- Mathematical operations using PyTorch tensors.
- The concept of autograd and how to use it for automatic differentiation.
- Computing gradients and higher-order derivatives.
- Intermediate gradient computation using `retain_grad` and hooks.
- Solving nonlinear equations using Newton-Raphson's method with PyTorch.
- Gradient computation through nonlinear solvers.

The notebook includes practical examples and code snippets to demonstrate these concepts.

## Torch and Numpy

Details about math operation in torch can be found in: http://pytorch.org/docs/torch.html#math-operations


In [11]:
import torch
import numpy as np

In [12]:
# convert numpy to tensor or vise versa
np_data = np.arange(6).reshape((2, 3))
torch_data = torch.from_numpy(np_data)
tensor2array = torch_data.numpy()
print(
    '\n Numpy array:\n', np_data,          # [[0 1 2], [3 4 5]]
    '\n\n Torch tensor:\n', torch_data,      #  0  1  2 \n 3  4  5    [torch.LongTensor of size 2x3]
    '\n\n tensor to array:\n', tensor2array, # [[0 1 2], [3 4 5]]
)


 Numpy array:
 [[0 1 2]
 [3 4 5]] 

 Torch tensor:
 tensor([[0, 1, 2],
        [3, 4, 5]]) 

 tensor to array:
 [[0 1 2]
 [3 4 5]]


In [13]:
# abs
data = [-1, -2, 1, 2]
tensor = torch.FloatTensor(data)  # 32-bit floating point
print(
    '\nabs',
    '\nnumpy: ', np.abs(data),          # [1 2 1 2]
    '\ntorch: ', torch.abs(tensor)      # [1 2 1 2]
)


abs 
numpy:  [1 2 1 2] 
torch:  tensor([1., 2., 1., 2.])


In [14]:
tensor.abs()

tensor([1., 2., 1., 2.])

In [15]:
# sin
print(
    '\nSin function-',
    '\nnumpy:\n', np.sin(data),      # [-0.84147098 -0.90929743  0.84147098  0.90929743]
    '\ntorch:\n', torch.sin(tensor)  # [-0.8415 -0.9093  0.8415  0.9093]
)


Sin function- 
numpy:
 [-0.84147098 -0.90929743  0.84147098  0.90929743] 
torch:
 tensor([-0.8415, -0.9093,  0.8415,  0.9093])


In [16]:
tensor.sigmoid()

tensor([0.2689, 0.1192, 0.7311, 0.8808])

In [17]:
tensor.exp()

tensor([0.3679, 0.1353, 2.7183, 7.3891])

In [18]:
# mean
print(
    '\nmean',
    '\nnumpy: ', np.mean(data),         # 0.0
    '\ntorch: ', torch.mean(tensor)     # 0.0
)


mean 
numpy:  0.0 
torch:  tensor(0.)


In [19]:
# matrix multiplication
data = [[1,2], [3,4]]
tensor = torch.FloatTensor(data)  # 32-bit floating point
# correct method
print(
    '\nmatrix multiplication (matmul)',
    '\nnumpy: ', np.matmul(data, data),     # [[7, 10], [15, 22]]
    '\ntorch: ', torch.mm(tensor, tensor)   # [[7, 10], [15, 22]]
)


matrix multiplication (matmul) 
numpy:  [[ 7 10]
 [15 22]] 
torch:  tensor([[ 7., 10.],
        [15., 22.]])


In [20]:
# incorrect method
data = np.array(data)
tensor = torch.Tensor(data)
print(
    '\nmatrix multiplication (dot)',
    '\nnumpy: ', data.dot(data),        # [[7, 10], [15, 22]]
    '\ntorch: ', #tensor.dot(tensor)     # NOT WORKING! Beware that torch.dot does not broadcast, only works for 1-dimensional tensor
)


matrix multiplication (dot) 
numpy:  [[ 7 10]
 [15 22]] 
torch: 


Note that:

torch.dot(tensor1, tensor2) → float

Computes the dot product (inner product) of two tensors. Both tensors are treated as 1-D vectors.

In [21]:
tensor.mm(tensor)

tensor([[ 7., 10.],
        [15., 22.]])

In [22]:
tensor * tensor

tensor([[ 1.,  4.],
        [ 9., 16.]])

In [23]:
torch.dot(torch.Tensor([2, 3]), torch.Tensor([2, 1]))

tensor(7.)

## Torch variables and autograd

In [24]:
import torch
from torch.autograd import Variable

In [25]:
tensor = torch.FloatTensor([[1,2],[3,4]])            # build a tensor
variable = Variable(tensor, requires_grad=True)      # build a variable, usually for compute gradients

print(tensor)       # [torch.FloatTensor of size 2x2]
print(variable)     # [torch.FloatTensor of size 2x2]

tensor([[1., 2.],
        [3., 4.]])
tensor([[1., 2.],
        [3., 4.]], requires_grad=True)


Till now the tensor and variable seem the same.

However, the variable is a part of the graph, it's a part of the auto-gradient.


In [26]:
t_out = torch.mean(tensor*tensor)       # x^2
v_out = torch.cos(torch.mean(variable*variable))   # x^2
print(t_out)
print(v_out)

tensor(7.5000)
tensor(0.3466, grad_fn=<CosBackward0>)


In [27]:
v_out.backward()    # backpropagation from v_out

$$ v_{out} = {{1} \over {4}} sum(variable^2) $$

the gradients w.r.t the variable, 

$$ {d(v_{out}) \over d(variable)} = {{1} \over {4}} 2 variable = {variable \over 2}$$

let's check the result pytorch calculated for us below:

In [28]:
variable.grad

tensor([[-0.4690, -0.9380],
        [-1.4070, -1.8760]])

In [29]:
variable # this is data in variable format

tensor([[1., 2.],
        [3., 4.]], requires_grad=True)

In [30]:
variable.data # this is data in tensor format

tensor([[1., 2.],
        [3., 4.]])

In [31]:
variable.data.numpy() # numpy format

array([[1., 2.],
       [3., 4.]], dtype=float32)

Note that we did `.backward()` on `v_out` but `variable` has been assigned new values on it's `grad`.

As this line 
```
v_out = torch.mean(variable*variable)
``` 
will make a new variable `v_out` and connect it with `variable` in computation graph.

In [32]:
type(v_out)

torch.Tensor

In [33]:
type(v_out.data)

torch.Tensor

## Autograd Example

In [34]:
import torch
from torch.autograd import grad
import torch.nn.functional as F

In PyTorch, the function is defined and computed as follows:

In [35]:
x = torch.tensor([3.])
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)
a = F.relu(x*w + b)

In [36]:
a

tensor([7.], grad_fn=<ReluBackward0>)

- By default, PyTorch will automatically build a computation graph in the background if variables have the parameter `requires_grad=True` set.
- If new variables without that parameter set to True are used in a computation with a variable that has `requires_grad=True`, these new variables will also automatically have gradients set to true.
- This simply means that gradients for these variables will be computed.
- It is wasteful to set `requires_grad=True` if we don't need that variable's gradient.
- For example, we usually don't need the gradients of the training inputs `x`.

Let's compute the derivative of a with respect to w:

In [37]:
grad(a, w, retain_graph=True)

(tensor([3.]),)

- `retain_graph=True` keeps the computation graph in memory.
- This is useful for example purposes to reuse the `grad` function.
- In practice, we usually free the computation graph in every round.

In [38]:
grad(a, b)

(tensor([1.]),)

Note that PyTorch functions are usually more efficient, but we could also implement our own ReLU function as shown below:

In [39]:
x = torch.tensor([3.])
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

def my_relu(z):
    if z > 0.:
        return z
    else:
        z[:] = 0.
        return z

a = my_relu(x*w + b)
grad(a, w)

(tensor([3.]),)

Note that even though the derivative of ReLU is not defined at 0, PyTorch autograd will do something that is reasonable for practical purposes:

In [40]:
x = torch.tensor([-1.])
w = torch.tensor([1.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

def my_relu(z):
    if z > 0.:
        return z
    else:
        z[:] = 0.
        return z

a = F.relu(x*w + b)
grad(a, w, retain_graph=False)

(tensor([-0.]),)

## Intermediate Gradients in PyTorch via autograd's `grad`

- In PyTorch, there are multiple ways to compute partial derivatives or gradients.
- The most straightforward way to compute partial derivatives is using autograd's `grad` function.
- By default, the `retain_graph` parameter of the `grad` function is set to `False`, which will free the graph after computing the partial derivative.
- To obtain multiple partial derivatives, set `retain_graph=True`.
- Note that setting `retain_graph=True` is inefficient as it requires multiple passes over the graph, recalculating intermediate results.

In [41]:
import torch
import torch.nn.functional as F
from torch.autograd import grad


x = torch.tensor([3.], requires_grad=True)
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

u = x * w
v = u + b
a = F.relu(v)

d_a_b = grad(a, b, retain_graph=True)
d_a_u = grad(a, u, retain_graph=True)
d_a_v = grad(a, v, retain_graph=True)
d_a_w = grad(a, w, retain_graph=True)
d_a_x = grad(a, x)
    

for name, dvar in zip("xwbuv", (d_a_x, d_a_w, d_a_b, d_a_u, d_a_v)):
    print('d_a_%s:' % name, dvar)

d_a_x: (tensor([2.]),)
d_a_w: (tensor([3.]),)
d_a_b: (tensor([1.]),)
d_a_u: (tensor([1.]),)
d_a_v: (tensor([1.]),)


- A more efficient way is by passing a tuple to the `grad` function so that it can reuse intermediate results and only require one pass over the graph:

In [42]:
import torch
import torch.nn.functional as F
from torch.autograd import grad


x = torch.tensor([3.], requires_grad=True)
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

u = x * w
v = u + b
a = F.relu(v)

partial_derivatives = grad(a, (x, w, b, u, v))

for name, grad in zip("xwbuv", (partial_derivatives)):
    print('d_a_%s:' % name, grad)

d_a_x: tensor([2.])
d_a_w: tensor([3.])
d_a_b: tensor([1.])
d_a_u: tensor([1.])
d_a_v: tensor([1.])


## Intermediate Gradients in PyTorch via `retain_grad`

- In PyTorch, we most often use the `backward()` method on an output variable to compute its partial derivative (or gradient) with respect to its inputs (typically, the weights and bias units of a neural network).
- By default, PyTorch only stores the gradients of the leaf variables (e.g., the weights and biases) via their `grad` attribute to save memory.
- If we are interested in the intermediate results in a computational graph, we can use the `retain_grad` method to store gradients of non-leaf variables as follows:

In [45]:
import torch
import torch.nn.functional as F
from torch.autograd import Variable


x = torch.tensor([3.], requires_grad=True)
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

u = x * w
v = u + b
a = F.relu(v)

u.retain_grad()
v.retain_grad()

a.backward()

for name, var in zip("xwbuv", (x, w, b, u, v)):
    print('d_a_%s:' % name, var.grad)

d_a_x: tensor([2.])
d_a_w: tensor([3.])
d_a_b: tensor([1.])
d_a_u: tensor([1.])
d_a_v: tensor([1.])


## Intermediate Gradients in PyTorch Using Hooks

- We can also use something called hooks to obtain intermediate gradients, although not a recommended approach as a first choice.
- A hook is function that will be called every time a gradient with respect to the variable is computed. 
    (http://pytorch.org/docs/master/autograd.html#torch.autograd.Variable.register_hook)
- We can use these hooks in a combination with a little helper function, `save_grad`, and a `hook` closure writing the partial derivatives or gradients to a global variable `grads`.
- So, if we invoke the `backward` method on the output node `a`, all the intermediate results will be collected in `grads`, as illustrated below:

In [46]:
import torch
import torch.nn.functional as F


grads = {}
def save_grad(name):
    def hook(g):
        grads[name] = g
    return hook


x = torch.tensor([3.], requires_grad=True)
w = torch.tensor([2.], requires_grad=True)
b = torch.tensor([1.], requires_grad=True)

u = x * w
v = u + b

x.register_hook(save_grad('d_a_x'))
w.register_hook(save_grad('d_a_w'))
b.register_hook(save_grad('d_a_b'))
u.register_hook(save_grad('d_a_u'))
v.register_hook(save_grad('d_a_v'))

a = F.relu(v)

a.backward()

#grads

### What about sencond order derivative?

In [47]:
from torch.autograd import grad

x = torch.tensor([3.], requires_grad=True)
y  = torch.tensor([4.])

f = x**2 * y + y

df_dx = grad(f, x) # 2xy = 24

In [48]:
x = torch.tensor([3.], requires_grad=True)
y  = torch.tensor([4.], requires_grad=True)

f = x**2 * y + y

df_dx = grad(f, x, create_graph=True) # first order deriv
grad(df_dx, x) # 2y = 8

(tensor([8.]),)

- `retain_graph`: is meant for preserving the original graph for multiple backward passes with the same variables.
- `create_graph`: is meant for constructing a new graph for the gradients, enabling the computation of higher-order derivatives.

### Elementwise gradient computation

In [49]:
import torch
from torch.autograd import grad
import numpy as np

# Define a vector function
def vector_function(x):
    return x**2 + 3*x + 2

# Define the input tensor
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

# Compute the elementwise gradient
y = vector_function(x)
elementwise_grad = grad(y, x, grad_outputs=torch.ones_like(y))

print("Input tensor:", x)
print("Output tensor:", y)
print("Elementwise gradient:", elementwise_grad)

Input tensor: tensor([1., 2., 3.], requires_grad=True)
Output tensor: tensor([ 6., 12., 20.], grad_fn=<AddBackward0>)
Elementwise gradient: (tensor([5., 7., 9.]),)
