In [None]:
def dy_dx(x):
  """Return the analytic derivative of y = x**2 evaluated at x.

  By the power rule, d/dx (x**2) = 2x, so the slope at x is simply twice x.
  """
  slope = 2 * x
  return slope

In [None]:
# Call the dy_dx function with x=3.
# Expected output: 2 * 3 = 6.
dy_dx(3)

6

In [None]:
# Import the torch library, which is essential for working with tensors and PyTorch's autograd system.
import torch

In [None]:
# Initialize a PyTorch tensor 'x' with the value 3.0.
# requires_grad=True: This attribute tells PyTorch to track all operations involving this tensor.
# This is crucial for automatic differentiation, allowing gradients to be computed later.
x = torch.tensor(3.0, requires_grad=True)

In [None]:
# Define 'y' as 'x' squared.
# Since 'x' has requires_grad=True, operations on 'x' (like squaring) are tracked,
# and 'y' will be part of the computation graph for gradient calculation.
y = x**2

In [None]:
# Display the tensor 'x'.
# The output will show 'requires_grad=True', confirming gradient tracking is active.
x

tensor(3., requires_grad=True)

In [None]:
# Display the tensor 'y'.
# The output will show 'grad_fn=<PowBackward0>'.
# 'grad_fn' indicates the operation that created this tensor (here, power operation).
# This function is responsible for computing its gradient during backpropagation.
y

tensor(9., grad_fn=<PowBackward0>)

In [None]:
# Perform backpropagation: call .backward() on 'y'.
# This computes the gradients of 'y' with respect to all tensors that required gradients and contributed to 'y'.
# In this case, it computes dy/dx.
y.backward()

In [None]:
# Access the computed gradient of 'x'.
# After y.backward(), the gradient (dy/dx) is stored in the .grad attribute of 'x'.
# For y = x^2, dy/dx = 2x. With x=3, x.grad should be 6.
x.grad

tensor(6.)

In [None]:
import math

def dz_dx(x):
    """Return the analytic derivative of z = sin(x**2) with respect to x.

    Chain rule with u = x**2:
        dz/dx = d(sin u)/du * du/dx = cos(x**2) * 2x
    """
    inner = x ** 2                  # u = x^2
    outer_slope = math.cos(inner)   # d(sin u)/du evaluated at u
    inner_slope = 2 * x             # du/dx
    return inner_slope * outer_slope

In [None]:
# NOTE(review): this cell appears to be out of order. No cell above it in the visible
# notebook defines 'y1' or a detached 'z' (both are first created in the .detach()
# section further down), so on a fresh top-to-bottom run this line would presumably
# fail with a NameError instead of the RuntimeError described here. The same call is
# repeated right after 'y1' is defined; consider keeping it only there.
# Attempting to call y1.backward() results in a RuntimeError.
# This is because 'y1' was created from the detached tensor 'z', which does not participate in gradient tracking.
# Therefore, 'y1' does not have a 'grad_fn' to backpropagate through.
y1.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Call the manual derivative function dz_dx with x=4.
# This calculates the exact derivative at that point.
dz_dx(4)

-7.661275842587077

In [None]:
# Re-initialize 'x' as a new PyTorch tensor with requires_grad=True for a new computation.
# This effectively starts a new computation graph.
x = torch.tensor(4.0, requires_grad=True)

In [None]:
# Define 'y' as 'x' squared, an intermediate step in the calculation of 'z'.
y = x ** 2

In [None]:
# Define 'z' as the sine of 'y'. This is the final scalar tensor for which we want gradients.
z = torch.sin(y)

In [None]:
# Display 'x'. It still has requires_grad=True.
x

tensor(4., requires_grad=True)

In [None]:
# Display 'y'. It shows 'grad_fn=<PowBackward0>', as it was created by the power operation.
y

tensor(16., grad_fn=<PowBackward0>)

In [None]:
# Display 'z'. It shows 'grad_fn=<SinBackward0>', as it was created by the sine operation.
z

tensor(-0.2879, grad_fn=<SinBackward0>)

In [None]:
# Call .backward() on 'z' to compute its gradient with respect to 'x'.
# PyTorch uses the chain rule to calculate dz/dx = dz/dy * dy/dx.
z.backward()

In [None]:
# Display the gradient of 'x'.
# This should match the manual calculation dz_dx(4) = -7.661275842587077 from the earlier cell
# (the tensor repr shows it rounded to 4 decimals).
x.grad

tensor(-7.6613)

In [None]:
# Attempting to access 'y.grad'.
# By default, PyTorch only stores gradients for leaf tensors (tensors created by the user directly).
# 'y' is a non-leaf tensor (result of an operation), and retain_grad() was not called on it.
# Therefore, its .grad attribute will be None, and a warning is typically issued.
y.grad

  y.grad


In [None]:
import torch

# Inputs for a simple logistic regression example
# x: Input feature. No requires_grad=True as it's an input, not a parameter to optimize.
x = torch.tensor(6.7)
# y: True label (binary). No requires_grad=True as it's a target.
y = torch.tensor(0.0)

# w: Weight parameter. Initially set to 1.0. No requires_grad=True yet for manual gradient demonstration.
w = torch.tensor(1.0)
# b: Bias parameter. Initially set to 0.0. No requires_grad=True yet for manual gradient demonstration.
b = torch.tensor(0.0)

In [None]:
# Binary Cross-Entropy Loss function for scalar inputs.
def binary_cross_entropy_loss(prediction, target, epsilon=1e-8):
    """Compute the element-wise binary cross-entropy between predictions and targets.

    Args:
        prediction: Tensor of predicted probabilities, expected to lie in [0, 1].
        target: Tensor of true labels (0 or 1), broadcastable against `prediction`.
        epsilon: Minimum distance kept between the clamped prediction and the
            endpoints 0 and 1, so that neither log() receives 0. Defaults to 1e-8.

    Returns:
        Tensor holding -(target * log(p) + (1 - target) * log(1 - p)), where p is
        `prediction` clamped away from 0 and 1.
    """
    # The clamp bounds must be representable in the prediction's dtype, otherwise the
    # clamp silently does nothing. In float32, 1 - 1e-8 rounds to exactly 1.0, which
    # leaves log(1 - p) = log(0) = -inf for a saturated prediction (and 0 * -inf = nan
    # when target is 1). Keep at least one machine epsilon of headroom at the top and
    # at least the smallest normal number at the bottom.
    if prediction.is_floating_point():
        limits = torch.finfo(prediction.dtype)
        lower_margin = max(epsilon, limits.tiny)
        upper_margin = max(epsilon, limits.eps)
    else:
        lower_margin = upper_margin = epsilon
    prediction = torch.clamp(prediction, lower_margin, 1 - upper_margin)
    # BCE loss formula: -(target * log(prediction) + (1 - target) * log(1 - prediction))
    return -(target * torch.log(prediction) + (1 - target) * torch.log(1 - prediction))

In [None]:
# Forward pass of the logistic regression model
# z: Linear part - weighted sum of input 'x' and bias 'b'.
z = w * x + b
# y_pred: Predicted probability using the sigmoid activation function.
# Sigmoid squashes the output of 'z' to a value between 0 and 1.
y_pred = torch.sigmoid(z)

# Compute binary cross-entropy loss between the predicted probability and the true label.
loss = binary_cross_entropy_loss(y_pred, y)

In [None]:
# Display the calculated loss value.
loss

tensor(6.7012)

In [None]:
# Manual calculation of gradients using the chain rule:
# 1. dL/d(y_pred): Derivative of the BCE loss with respect to the predicted probability (y_pred).
#    Formula: (y_pred - y) / (y_pred * (1 - y_pred))
#    Note: the denominator is zero when y_pred is exactly 0 or 1, so this factor is only
#    finite for probabilities strictly inside (0, 1).
dloss_dy_pred = (y_pred - y)/(y_pred*(1-y_pred))

# 2. dy_pred/dz: Derivative of the sigmoid activation function with respect to its input 'z'.
#    Formula: y_pred * (1 - y_pred)
dy_pred_dz = y_pred * (1 - y_pred)

# 3. dz/dw and dz/db: Derivatives of 'z' with respect to weight 'w' and bias 'b'.
#    Given z = w*x + b:
#    dz/dw = x
dz_dw = x
#    dz/db = 1
dz_db = 1

# dL/dw: Total gradient of loss with respect to weight 'w' using the chain rule.
#    dL/dw = (dL/d(y_pred)) * (dy_pred/dz) * (dz/dw)
#    The first two factors cancel algebraically, so this equals (y_pred - y) * x.
dL_dw = dloss_dy_pred * dy_pred_dz * dz_dw
# dL/db: Total gradient of loss with respect to bias 'b' using the chain rule.
#    dL/db = (dL/d(y_pred)) * (dy_pred/dz) * (dz/db)
#    Algebraically this equals (y_pred - y).
dL_db = dloss_dy_pred * dy_pred_dz * dz_db

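The `torch.gradient` function in PyTorch numerically estimates the gradient of a function from values sampled in a tensor. It computes partial derivatives along the specified dimensions using second-order accurate central differences (with one-sided estimates at the boundaries), an approximation that assumes the underlying function has at least three continuous derivatives. This makes it useful for numerical differentiation in machine learning and scientific computing. Note that it is distinct from autograd: the gradients in this notebook are obtained either by hand via the chain rule or analytically (up to floating-point precision) via `.backward()`, not with `torch.gradient`.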

In [None]:
# Print the manually calculated gradients of the loss with respect to the weight (dw) and bias (db).
print(f"Manual Gradient of loss w.r.t weight (dw): {dL_dw}")
print(f"Manual Gradient of loss w.r.t bias (db): {dL_db}")

Manual Gradient of loss w.r.t weight (dw): 6.691762447357178
Manual Gradient of loss w.r.t bias (db): 0.998770534992218


In [None]:
# Re-initialize 'x' and 'y'. They don't require gradients as they are inputs/targets, not trainable parameters.
x = torch.tensor(6.7)
y = torch.tensor(0.0)

In [None]:
# Re-initialize 'w' and 'b' (weight and bias).
# requires_grad=True: This is crucial for PyTorch's autograd to track operations involving these tensors
# and automatically compute their gradients during backpropagation.
w = torch.tensor(1.0, requires_grad=True)
b = torch.tensor(0.0, requires_grad=True)

In [None]:
# Display the weight tensor 'w'.
# Output confirms 'requires_grad=True' is set, indicating it's a trainable parameter.
w

tensor(1., requires_grad=True)

In [None]:
# Display the bias tensor 'b'.
# Output confirms 'requires_grad=True' is set.
b

tensor(0., requires_grad=True)

In [None]:
# Calculate the linear combination 'z = w*x + b'.
# Since 'w' and 'b' have requires_grad=True, 'z' will also be part of the computation graph
# and will have a 'grad_fn' tracking its creation.
z = w*x + b
z

tensor(6.7000, grad_fn=<AddBackward0>)

In [None]:
# Apply the sigmoid activation to 'z' to get the predicted probability 'y_pred'.
# 'y_pred' also becomes part of the computation graph with a 'grad_fn'.
y_pred = torch.sigmoid(z)
y_pred

tensor(0.9988, grad_fn=<SigmoidBackward0>)

In [None]:
# Calculate the binary cross-entropy loss.
# Since 'y_pred' is part of the graph, 'loss' will also be part of the graph and have a 'grad_fn'.
loss = binary_cross_entropy_loss(y_pred, y)
loss

tensor(6.7012, grad_fn=<NegBackward0>)

In [None]:
# Call .backward() on the 'loss' tensor.
# This initiates backpropagation, automatically computing the gradients of 'loss' with respect to 'w' and 'b'.
# These gradients are then accumulated in the .grad attribute of 'w' and 'b'.
loss.backward()

In [None]:
# Print the gradients for 'w' and 'b' computed automatically by PyTorch's autograd.
# These should match the manually calculated gradients from the previous cells,
# demonstrating the power and convenience of autograd.
print(w.grad)
print(b.grad)

tensor(6.6918)
tensor(0.9988)


In [None]:
# Initialize 'x' as a 1D tensor (vector) with multiple elements.
# requires_grad=True: Enables gradient tracking for all elements in this vector.
# This demonstrates how autograd works with multi-dimensional tensors.
x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)

In [None]:
# Display the vector 'x', confirming that requires_grad=True is set.
x

tensor([1., 2., 3.], requires_grad=True)

In [None]:
# Calculate 'y' by squaring each element of 'x' and then taking the mean of the resulting squared values.
# The .mean() operation reduces the tensor to a scalar, which is necessary for calling .backward()
# without an explicit gradient argument.
y = (x**2).mean()
y

tensor(4.6667, grad_fn=<MeanBackward0>)

In [None]:
# Call .backward() on the scalar 'y'.
# This computes the gradients of 'y' with respect to each element of the vector 'x'.
y.backward()

In [None]:
# Display the gradients for 'x'.
# For y = (x_1^2 + x_2^2 + x_3^2) / 3, the derivative with respect to x_i is (2 * x_i) / 3.
# So, for x=[1, 2, 3], x.grad should be [2/3, 4/3, 6/3] = [0.6667, 1.3333, 2.0000].
x.grad

tensor([0.6667, 1.3333, 2.0000])

In [None]:
# Re-initialize 'x' as a scalar tensor with requires_grad=True.
# This is done to demonstrate the `zero_()` method for clearing gradients.
x = torch.tensor(2.0, requires_grad=True)
x

tensor(2., requires_grad=True)

In [None]:
# Calculate 'y = x ** 2'. 'y' becomes part of the computation graph.
y = x ** 2
y

tensor(4., grad_fn=<PowBackward0>)

In [None]:
# Call .backward() to compute dy/dx and store it in x.grad.
y.backward()

In [None]:
# Display x.grad, which will be 2 * 2 = 4.
x.grad

tensor(4.)

In [None]:
# Call x.grad.zero_() to clear the gradients.
# The underscore '_' denotes an in-place operation.
# This is crucial in training loops to prevent gradients from accumulating across different optimization steps.
# If gradients are not cleared, subsequent backward passes will add to the existing gradients.
x.grad.zero_()

tensor(0.)

In [None]:
# Re-initialize 'x' with requires_grad=True to demonstrate methods for disabling gradient tracking.
x = torch.tensor(2.0, requires_grad=True)
x

tensor(2., requires_grad=True)

In [None]:
# Calculate 'y = x ** 2'. Since 'x' has requires_grad=True, 'y' is part of the computation graph.
y = x ** 2
y

tensor(4., grad_fn=<PowBackward0>)

In [None]:
# Call .backward() to compute dy/dx and store it in x.grad.
y.backward()

In [None]:
# Display x.grad, which is 4.
x.grad

tensor(4.)

In [None]:
# There are three main options in PyTorch to disable gradient tracking:
# option 1 - requires_grad_(False) (in-place modification of a tensor's attribute)
# option 2 - detach() (creates a new tensor that is detached from the graph)
# option 3 - torch.no_grad() (a context manager to temporarily disable gradient calculations)
# These are useful for inference, freezing layers, or performing operations without affecting the graph.

In [None]:
# Option 1: Using requires_grad_(False)
# This method modifies the requires_grad attribute of the tensor 'x' in-place to False.
# After this, 'x' will no longer track gradients for any future operations it's involved in.
x.requires_grad_(False)

tensor(2.)

In [None]:
# Display 'x' again. Notice that 'requires_grad=True' is no longer displayed,
# confirming that gradient tracking has been disabled for 'x'.
x

tensor(2.)

In [None]:
# Calculate 'y = x ** 2' again.
# Since 'x' no longer requires gradients, this operation is not tracked by the autograd engine,
# and no computation graph will be built for 'y'.
y = x ** 2

In [None]:
# Display 'y'. It does not show 'grad_fn', indicating that no computation graph was created for its generation.
y

tensor(4.)

In [None]:
# Attempting to call y.backward() will result in a RuntimeError.
# This is because 'y' does not have a 'grad_fn' (as its creation was not tracked)
# and does not require gradients, so there is no computation graph to backpropagate through.
y.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Re-initialize 'x' with requires_grad=True to demonstrate the .detach() method.
x = torch.tensor(2.0, requires_grad=True)
x

tensor(2., requires_grad=True)

In [None]:
# Option 2: Using .detach()
# This creates a new tensor 'z' that holds the same value as 'x' but is detached from the computation graph.
# Per the PyTorch docs, 'z' is not an independent copy: it shares storage with 'x', so in-place
# changes to one are visible in the other (use x.detach().clone() for a separate copy).
# 'z' does not have requires_grad=True, even though 'x' does.
# Operations on 'z' are not tracked by autograd, while 'x' itself keeps tracking gradients.
z = x.detach()
z

tensor(2.)

In [None]:
# Calculate 'y = x ** 2'. This operation *is* tracked because 'x' still has requires_grad=True.
y = x ** 2

In [None]:
# Display 'y'. It shows 'grad_fn=<PowBackward0>', confirming it's part of the graph from 'x'.
y

tensor(4., grad_fn=<PowBackward0>)

In [None]:
# Calculate 'y1 = z ** 2'.
# Since 'z' was detached and does not require gradients, this operation is *not* tracked by autograd.
y1 = z ** 2
y1

tensor(4.)

In [None]:
# Calling y.backward() successfully computes gradients because 'y' is connected to 'x' in the computation graph.
y.backward()

In [None]:
# Attempting to call y1.backward() results in a RuntimeError.
# 'y1' was computed from the detached tensor 'z', so the operation was not tracked by autograd:
# 'y1' has no 'grad_fn' and does not require grad, leaving nothing to backpropagate through.
y1.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
# Re-initialize 'x' with requires_grad=True to demonstrate the torch.no_grad() context manager.
x = torch.tensor(2.0, requires_grad=True)
x

tensor(2., requires_grad=True)

In [None]:
# Option 3: Using torch.no_grad()
# The 'with torch.no_grad():' block is a context manager that temporarily disables gradient tracking.
# Any operations performed inside this block will not build a computation graph,
# even if the input tensors (like 'x') have requires_grad=True.
# This is commonly used during model inference or evaluation to save memory and computation.
with torch.no_grad():
    y = x ** 2

In [None]:
# Display 'y'. Notice that 'grad_fn' is missing, even though 'x' still has requires_grad=True.
# This is because torch.no_grad() prevented the creation of the computation graph for this operation.
y

tensor(4.)

In [None]:
# Attempting to call y.backward() here would also result in a RuntimeError.
# Similar to the previous examples, 'y' does not have a 'grad_fn' because its computation
# occurred within the torch.no_grad() context, effectively disabling gradient tracking for that operation.
y.backward()

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn