In [None]:
import torch

Tutorial Used: https://medium.com/@ninads79shukla/pytorch-basics-3deffbebb2bd

# Tensors

In [None]:
# Create a tensor from a list
tensor_from_list = torch.tensor([2, 3, 4, 5])
print("Tensor from list:", tensor_from_list)

# Create a tensor with random values.
# The values come from a dedicated generator with a fixed seed, so this cell
# prints the same numbers on every run and leaves PyTorch's global random
# state untouched.
seeded_generator = torch.Generator().manual_seed(0)
random_tensor = torch.rand((2, 3), generator=seeded_generator)
print("Random tensor:\n", random_tensor)

# Create a tensor filled with zeros
zero_tensor = torch.zeros((3, 2))
print("Zero tensor:\n", zero_tensor)

Tensor from list: tensor([2, 3, 4, 5])
Random tensor:
 tensor([[0.6010, 0.4378, 0.3448],
        [0.9483, 0.8716, 0.2833]])
Zero tensor:
 tensor([[0., 0.],
        [0., 0.],
        [0., 0.]])


In [None]:
# Two 1-D tensors of equal length, each built from a Python list
a = torch.tensor([1, 2, 3])
b = torch.tensor([4, 5, 6])

# add_zip: entries at the same position are summed
c = torch.add(a, b)
print("Element-wise addition:", c)

# mul_zip: entries at the same position are multiplied
d = torch.mul(a, b)
print("Element-wise multiplication:", d)

Element-wise addition: tensor([5, 7, 9])
Element-wise multiplication: tensor([ 4, 10, 18])


In [None]:
# Two 2x2 integer matrices
matrix1 = torch.tensor([[1, 2], [3, 4]])
matrix2 = torch.tensor([[5, 6], [7, 8]])

# Matrix product (rows of matrix1 times columns of matrix2) via the @ operator
matrix_product = matrix1 @ matrix2
print("Matrix multiplication:\n", matrix_product)

Matrix multiplication:
 tensor([[19, 22],
        [43, 50]])


In [None]:
# Start from a small integer tensor
tensor = torch.tensor([1, 2, 3])

# add_map, in place: the augmented assignment modifies the existing tensor
# instead of allocating a new one
tensor += 5
print("In-place addition:", tensor)

In-place addition: tensor([6, 7, 8])


# Autograd (Auto-differentiation)

Autograd allows you to automatically compute gradients for tensor operations. This is crucial for optimizing neural networks using gradient descent.

We use gradient descent to optimize neural networks because it is an efficient and scalable algorithm for finding the set of weights and biases that minimize the network’s loss function.

In [None]:
import torch

# Create a tensor with requires_grad=True so autograd records operations on it
x = torch.tensor([2.0, 3.0], requires_grad=True)
print("Tensor x:", x)

# Perform operations: out = mean(3 * (x + 2)^2)
y = x + 2
z = y * y * 3
out = z.mean()

print("Tensor y:", y)
print("Tensor z:", z)
print("Tensor out:", out)

# Compute gradients of the scalar `out` with respect to the leaf tensor x
out.backward()

# Print gradients
print("Gradient of x:", x.grad)

# y is an intermediate (non-leaf) result. backward() only stores .grad on leaf
# tensors unless retain_grad() was called on the intermediate tensor first, and
# reading .grad on such a tensor makes PyTorch emit a UserWarning. Check before
# reading so the cell shows the same "None" without the warning.
y_grad = y.grad if (y.is_leaf or y.retains_grad) else None
print("Gradient of y:", y_grad)

Tensor x: tensor([2., 3.], requires_grad=True)
Tensor y: tensor([4., 5.], grad_fn=<AddBackward0>)
Tensor z: tensor([48., 75.], grad_fn=<MulBackward0>)
Tensor out: tensor(61.5000, grad_fn=<MeanBackward0>)
Gradient of x: tensor([12., 15.])
Gradient of y: None


  print("Gradient of y:", y.grad)


# Backpropagation in PyTorch

**Definition of Backpropagation Algorithm:**

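Backpropagation is the algorithm that works out how much each weight of a neural network contributed to the error (loss) of the current iteration, by propagating that error backward through the network. The resulting gradients are then used to adjust the weights.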


**Backpropagation involves three main steps:**

1. Forward Pass: Compute the output of the network.
2. Compute Loss: Calculate the difference between the predicted output and the actual target.
3. Backward Pass: Compute the gradient of the loss with respect to each parameter using the chain rule.






## Example of Backpropagation - step by step

In [None]:
# 1. Define the model
import torch
import torch.nn as nn
import torch.optim as optim

class SimpleNN(nn.Module):
    """Feed-forward network with a single hidden layer.

    Maps one input feature to ``hidden_size`` hidden neurons (ReLU-activated)
    and then to one output value.

    Args:
        hidden_size: Number of neurons in the hidden layer. Defaults to 10,
            the width used throughout this notebook.
    """

    def __init__(self, hidden_size: int = 10):
        super().__init__()
        self.fc1 = nn.Linear(1, hidden_size)  # Input layer to hidden layer
        self.fc2 = nn.Linear(hidden_size, 1)  # Hidden layer to output layer

    def forward(self, x):
        """Run the forward pass.

        Args:
            x: Input tensor whose last dimension has size 1.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1. No activation is applied to the output.
        """
        x = torch.relu(self.fc1(x))  # Apply ReLU activation
        x = self.fc2(x)
        return x

model = SimpleNN()

**Output of the Last Layer**


1. Classification Tasks: For classification tasks, the type of activation function in the last layer depends on the number of classes:

- Binary Classification (Two Classes): Use a sigmoid activation function in the last layer. It squashes the output between 0 and 1, which can be interpreted as a probability for class membership.
- Multiclass Classification (More Than Two Classes): Use a softmax activation in the last layer to produce probabilities across multiple classes. Softmax converts the logits into a probability distribution where the sum of all output values equals 1.

2. Regression Tasks: For regression tasks, where you are predicting continuous values (like predicting house prices), no activation function is typically used in the last layer. The output of the network is just the raw score or value from the final neuron(s).


In [None]:
# 2. Define the Loss Function and Optimizer
learning_rate = 0.01

# Mean squared error between the model's predictions and the targets
criterion = nn.MSELoss()

# Plain stochastic gradient descent over all of the model's parameters
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

In [None]:
# 3. Sample data: simple regression task (each target is twice its input).
# Each flat list is turned into a column of shape (4, 1): one sample per row.
x_train = torch.tensor([1.0, 2.0, 3.0, 4.0]).unsqueeze(1)
y_train = torch.tensor([2.0, 4.0, 6.0, 8.0]).unsqueeze(1)

In [None]:
# 4. Forward pass: feed the training inputs through the model to obtain its
# predictions, then display them.
outputs = model(x_train)
print("Model Outputs:\n", outputs)

Model Outputs:
 tensor([[-0.0695],
        [-0.0823],
        [-0.0950],
        [-0.1078]], grad_fn=<AddmmBackward0>)


In [None]:
# 5. Compute loss: compare the predictions with the targets using the chosen
# criterion. loss.item() extracts the value as a plain Python number for printing.
loss = criterion(outputs, y_train)
print("Loss:", loss.item())

Loss: 30.958513259887695


In [None]:
# 6. Backward pass: back-propagate from the loss to compute the gradient of
# the loss with respect to each parameter.

loss.backward()

At this point, the gradients of the loss with respect to each parameter are computed and stored in the .grad attribute of each parameter.

In [None]:
# Show the gradient currently stored on every parameter that autograd tracks
for name, param in model.named_parameters():
    if not param.requires_grad:
        continue  # skip parameters that are excluded from gradient tracking
    print(f"Gradient of {name}:\n", param.grad)

Gradient of fc1.weight:
 tensor([[-0.9748],
        [ 2.6472],
        [-5.7999],
        [ 0.0000],
        [ 0.2926],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000],
        [ 0.0000]])
Gradient of fc1.bias:
 tensor([-0.3255,  0.8840, -1.9369,  0.0000,  0.0977,  0.0000,  0.0000,  0.0000,
         0.0000,  0.0000])
Gradient of fc2.weight:
 tensor([[-22.7762, -38.0141, -15.7886,   0.0000, -24.9799,   0.0000,   0.0000,
           0.0000,   0.0000,   0.0000]])
Gradient of fc2.bias:
 tensor([-10.1773])


In [None]:
# 7. Update Parameters: Use the optimizer to update the model parameters.
# This performs a single update step using the optimizer's update rule.
optimizer.step()

optimizer.step() is then called to update the parameters of the model. It adjusts the parameters based on their gradients and the optimization algorithm used (e.g., Stochastic Gradient Descent, Adam, etc.). The optimizer uses the gradients stored in each parameter’s .grad attribute to update them according to its specific update rule.

## Example of Backpropagation - Full Training Loop

The Process:

1. Forward Pass: You compute the output of the model based on the current parameters.

2. Loss Calculation: You calculate the loss (error) based on the output and the target.

3. Backward Pass (Backpropagation): You call loss.backward() to compute the gradients of the loss with respect to each model parameter. These gradients are accumulated in the .grad attribute of each parameter.

4. Parameter Update: You call optimizer.step() to update the model’s parameters based on the gradients stored in .grad.

5. Zero the Gradients: Before the next iteration, you call optimizer.zero_grad() to clear the accumulated gradients so that new gradients can be calculated for the current batch of data.

In [None]:
# Training loop
num_epochs = 1000  # used for both the loop bound and the progress message
log_every = 100    # report the loss once every this many epochs

# Training mode does not change between iterations, so set it once up front
model.train()

for epoch in range(num_epochs):
    # Zero the gradients: backward() adds to .grad, so clear the previous step's values
    optimizer.zero_grad()

    # Forward pass
    outputs = model(x_train)

    # Compute the loss
    loss = criterion(outputs, y_train)

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    if (epoch + 1) % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 0.0903
Epoch [200/1000], Loss: 0.0201
Epoch [300/1000], Loss: 0.0042
Epoch [400/1000], Loss: 0.0008
Epoch [500/1000], Loss: 0.0002
Epoch [600/1000], Loss: 0.0000
Epoch [700/1000], Loss: 0.0000
Epoch [800/1000], Loss: 0.0000
Epoch [900/1000], Loss: 0.0000
Epoch [1000/1000], Loss: 0.0000


**Epoch**: One complete pass through the entire training dataset. For example, if your dataset has 10,000 samples, one epoch means the model has seen all 10,000 samples once.

**Batch Size**: The number of training samples processed before the model's parameters are updated. Instead of using the entire dataset at once (which could be computationally expensive), the data is split into smaller groups called batches. Each batch is processed separately.

**Iteration**: One iteration refers to one update of the model’s parameters, which happens after processing one batch of data.

***number of iterations per epoch = number of samples / batch size*** (rounded up when the samples do not divide evenly and the final, smaller batch is kept)

# Training a Simple Model: Regression

## 1. Define a dataset

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Sample data for the regression example: four inputs and their targets
# (each target is twice its input), stored as columns of shape (4, 1)
x_train = torch.tensor([1.0, 2.0, 3.0, 4.0]).reshape(4, 1)
y_train = torch.tensor([2.0, 4.0, 6.0, 8.0]).reshape(4, 1)

## 2. Define the model

In [None]:
class LinearRegressionModel(nn.Module):
    """Linear regression expressed as a single ``nn.Linear`` layer (1 input, 1 output)."""

    def __init__(self):
        super().__init__()
        # One weight and one bias: prediction = weight * x + bias
        self.linear = nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Return the linear layer's prediction for ``x``."""
        prediction = self.linear(x)
        return prediction

model = LinearRegressionModel()


## 3. Define the loss function and the optimizer

In [None]:
# Loss: mean squared error between predictions and targets
criterion = nn.MSELoss()

# Optimizer: stochastic gradient descent over the model's parameters
learning_rate = 0.01
optimizer = optim.SGD(params=model.parameters(), lr=learning_rate)

## 4. Training loop

Write a training loop that iteratively performs the forward pass, computes the loss, performs the backward pass, updates the model parameters, and resets the gradients to zero.


In [None]:
# Training loop
num_epochs = 1000
log_every = 100  # report the loss once every this many epochs

# Set the model to training mode once; the mode does not change inside the loop
model.train()

for epoch in range(num_epochs):
    # Zero the gradients: backward() adds to .grad, so clear the previous step's values
    optimizer.zero_grad()

    # Forward pass
    outputs = model(x_train)

    # Compute the loss
    loss = criterion(outputs, y_train)

    # Backward pass and optimize
    loss.backward()
    optimizer.step()

    if (epoch + 1) % log_every == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [100/1000], Loss: 0.0594
Epoch [200/1000], Loss: 0.0326
Epoch [300/1000], Loss: 0.0179
Epoch [400/1000], Loss: 0.0098
Epoch [500/1000], Loss: 0.0054
Epoch [600/1000], Loss: 0.0030
Epoch [700/1000], Loss: 0.0016
Epoch [800/1000], Loss: 0.0009
Epoch [900/1000], Loss: 0.0005
Epoch [1000/1000], Loss: 0.0003


## 5. Testing the model

In [None]:
# Switch the model to evaluation mode before running inference
model.eval()

# Predictions need no gradients, so compute them with autograd tracking disabled
with torch.no_grad():
    predictions = model(x_train)

# Show the predictions next to the true targets for comparison
print("Predictions:\n", predictions)
print("Actual:\n", y_train)

Predictions:
 tensor([[2.0264],
        [4.0128],
        [5.9992],
        [7.9856]])
Actual:
 tensor([[2.],
        [4.],
        [6.],
        [8.]])


**The Training and Evaluation Mode of the model**

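- model.train(): Sets the model to training mode, so layers such as dropout are active and batch normalization uses the current batch statistics. It does not by itself turn gradient computation on or off (gradients are tracked by default unless disabled, e.g. with torch.no_grad()), and it does not update any parameters; updates happen in optimizer.step().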

- model.eval(): Sets the model to evaluation mode, disabling dropout and using the running statistics for batch normalization. It prepares the model for inference/testing.

- torch.no_grad(): Disables gradient computation to make inference more efficient and reduce memory usage, ensuring that no unnecessary gradients are tracked.

**Dropout** is a regularization technique used in neural networks to prevent overfitting. During training, it randomly "drops" (sets to zero) a fraction of neurons in the network at each forward pass, which forces the network to learn more robust and generalizable features.

# Training an Image Classification Model: MNIST Digit Classification