# PyTorch

PyTorch is a deep learning framework used for research and development in machine learning and artificial intelligence.

A tensor is a fundamental data structure that is similar to arrays or matrices. Tensors are the building blocks of neural networks and are used to represent data in the form of multi-dimensional arrays

### Types of Tensors
![](./Tensor.PNG)

In [1]:
import torch
import torch.nn as nn
import numpy as np

In [2]:
# Tensors of increasing rank (number of dimensions)
scalar = torch.tensor(42.0)  # 0-D tensor (a scalar) holding the value 42.0
vector = torch.tensor([1, 2, 3, 4, 5])  # 1-D tensor with 5 elements
matrix = torch.tensor([[1, 2, 3], [4, 5, 6]])  # 2-D tensor with 2 rows and 3 columns
four_dim_tensor = torch.randn(32, 3, 64, 64)  # 4-D tensor of random values with shape (batch_size, channels, height, width)
four_dim_tensor[0]  # first sample of the batch: a 3-D tensor of shape (3, 64, 64)

tensor([[[ 6.6731e-01,  4.1031e-01, -1.5540e-01,  ...,  2.0594e-01,
          -7.7255e-01, -1.1984e-01],
         [-5.6662e-01,  7.0067e-01, -3.4180e-01,  ..., -1.1127e+00,
           1.8683e+00, -1.1175e+00],
         [ 8.8551e-01, -6.9390e-01,  6.1661e-01,  ...,  9.9827e-01,
           5.2346e-01,  1.1848e-01],
         ...,
         [-1.3863e+00, -1.5750e+00, -9.7003e-01,  ..., -2.8985e-01,
           1.1005e+00, -3.1892e-01],
         [-4.7203e-01, -2.4616e-01, -1.9265e+00,  ..., -7.0816e-01,
          -5.6475e-01, -8.3122e-01],
         [-1.1669e+00, -1.0620e-03, -1.9500e-02,  ...,  7.5206e-01,
           1.8913e+00,  6.1879e-01]],

        [[ 4.6298e-01,  5.4132e-01, -6.9952e-01,  ..., -4.9100e-01,
          -7.0300e-01, -6.1826e-01],
         [-8.0461e-01, -6.8585e-01, -5.3335e-01,  ..., -5.4940e-02,
          -2.2417e-01, -9.4653e-01],
         [ 1.5007e+00, -9.9051e-01,  1.6570e-02,  ..., -3.8530e-01,
          -2.6853e-01, -4.9141e-01],
         ...,
         [ 8.4170e-01,  5

Different arguments can be provided for tensor creation:
* Data
* Dtype
* Device (specify the device (CPU or GPU) on which the tensor should be located using this argument. If not provided, the tensor will be created on the CPU by default.)
* Requires_grad (If set to True, the tensor will be set up to track operations on it for automatic differentiation (autograd) during backpropagation. This is useful for gradient-based optimization and training deep learning models.)

In [3]:
# The same creation call with every argument spelled out explicitly
tensor = torch.tensor(data=[[1, 2, 3], [4, 5, 6]], 
dtype=torch.float32,  # element type of the new tensor
device='cpu',  # device on which the tensor is stored
requires_grad=False)  # do not track operations on this tensor for autograd

In [4]:
tensor = torch.tensor(data=[[1, 2, 3], [3, 2, 3]])
# NOTE: .type() returns a converted tensor; the result is not assigned here,
# so `tensor` itself keeps its integer dtype in the cells below
tensor.type(torch.float32)
tensor.numel()  # total number of elements in the tensor (2 * 3 = 6)

6

In [5]:
# Reshape the 2x3 tensor into 3x2; the elements keep their row-by-row order
reshaped_tensor = torch.reshape(tensor, (3, 2))
reshaped_tensor

tensor([[1, 2],
        [3, 3],
        [2, 3]])

In [6]:
reshaped_tensor = torch.reshape(tensor, (-1, 2))  # -1 lets PyTorch infer that dimension from the element count (6 / 2 = 3 rows)
reshaped_tensor

tensor([[1, 2],
        [3, 3],
        [2, 3]])

In [7]:
x = torch.randn(32, 3, 64, 64)
# Keep the batch dimension and flatten the rest: (32, 3, 64, 64) -> (32, 3 * 64 * 64)
x_flattened = x.view(x.size(0), -1)
x_flattened

tensor([[-2.1679, -0.4802,  0.6324,  ...,  0.7223, -0.5063, -0.7060],
        [ 1.2336,  0.3826,  0.2796,  ..., -0.2616,  0.6765, -0.0804],
        [-1.3299,  0.9156, -0.3471,  ...,  0.0298, -0.8025,  0.7965],
        ...,
        [ 0.9928, -0.0565, -0.2597,  ..., -0.2338,  0.4734,  0.0774],
        [-0.2038, -0.8475, -1.2998,  ...,  2.0775,  0.3445,  0.5881],
        [-0.6292, -2.4918,  1.1873,  ...,  0.4216,  2.0810,  1.0084]])

In [8]:
expanded_tensor = torch.unsqueeze(tensor, dim=0)  # returns a new tensor with a dimension of size one inserted at position 0: (2, 3) -> (1, 2, 3)
expanded_tensor.size(), tensor.size()  # the original tensor is left unchanged

(torch.Size([1, 2, 3]), torch.Size([2, 3]))

### Permute function
The permute() method rearranges the dimensions of a tensor, giving you the flexibility to change the shape and orientation of the data

In [9]:
permuted_tensor = tensor.permute(1, 0)  # new dimension order: old dim 1 first, old dim 0 second (swaps them)
permuted_tensor.shape, tensor.shape

(torch.Size([3, 2]), torch.Size([2, 3]))

In [10]:
# Show both tensors side by side to compare their layouts
tensor, permuted_tensor

(tensor([[1, 2, 3],
         [3, 2, 3]]),
 tensor([[1, 3],
         [2, 2],
         [3, 3]]))

In [11]:
# Transposing a Tensor (Swapping Rows and Columns)
transposed_tensor_1 = tensor.t()  # shorthand intended for tensors with at most 2 dimensions
transposed_tensor_2 = torch.transpose(tensor, 0, 1)  # Swap axes 0 and 1 (the axes are chosen explicitly)

# Both approaches turn the (2, 3) tensor into a (3, 2) tensor
print(f'The original tensor shape is: {tensor.shape},\n' 
      f'The transposed tensor using .t shape is: {transposed_tensor_1.shape},\n' 
      f'The transposed tensor using .tranpose shape is: {transposed_tensor_2.shape}')

The original tensor shape is: torch.Size([2, 3]),
The transposed tensor using .t shape is: torch.Size([3, 2]),
The transposed tensor using .tranpose shape is: torch.Size([3, 2])


Addition and subtraction between tensors of the same shape

In [12]:
# Two integer tensors with the same shape (2, 3)
tensor_a = torch.tensor([[4, 5, 7], [8, 9, 0]])
tensor_b = torch.tensor([[5, 4, 3], [9, 8, 7]])

# Element-wise addition: each element is added to the element at the same position
tensor_a + tensor_b

tensor([[ 9,  9, 10],
        [17, 17,  7]])

In [13]:
# Element-wise subtraction
tensor_a - tensor_b

tensor([[-1,  1,  4],
        [-1,  1, -7]])

Element-wise multiplication between 2 tensors of same shape

In [14]:
# `*` multiplies element by element; it is not matrix multiplication
tensor_a * tensor_b

tensor([[20, 20, 21],
        [72, 72,  0]])

Matrix multiplication between 2 tensors where the inner dimensions match (the number of columns in the first tensor equals the number of rows in the second tensor). Each element of the result is the dot product of a row of the first tensor with a column of the second

In [15]:
# (2, 3) @ (3, 3) -> (2, 3): the inner dimensions (3 and 3) match
tensor_c = torch.tensor([[5, 4, 3], [9, 8, 7], [1, 1, 1]])
matmu = torch.matmul(tensor_a, tensor_c)
matmu

tensor([[ 72,  63,  54],
        [121, 104,  87]])

In [16]:
# Element-wise division: the result is floating point even though both inputs hold integers
div = tensor_a / tensor_b
div

tensor([[0.8000, 1.2500, 2.3333],
        [0.8889, 1.1250, 0.0000]])

In [17]:
# Element-wise power: each element of tensor_a is raised to the matching element of tensor_b
result_exp = tensor_a ** tensor_b
result_exp

tensor([[     1024,       625,       343],
        [134217728,  43046721,         0]])

In [18]:
# Element-wise square root; the integer input produces a floating-point result
result_sqrt = torch.sqrt(tensor_a)
result_sqrt

tensor([[2.0000, 2.2361, 2.6458],
        [2.8284, 3.0000, 0.0000]])

In [19]:
result_log = torch.log(tensor_a)  # element-wise natural logarithm (base e); log(0) evaluates to -inf
result_log

tensor([[1.3863, 1.6094, 1.9459],
        [2.0794, 2.1972,   -inf]])

In [20]:
# Convert to float32 first: torch.mean needs a floating-point dtype
tensor_a = tensor_a.type(torch.float32)
total_sum = torch.sum(tensor_a)  # sum over every element -> 0-D tensor

# Mean over dim=1: collapses the columns, giving one mean per row
mean_along_rows = torch.mean(tensor_a, dim=1)

# Maximum over dim=0: collapses the rows, giving one maximum per column.
# With `dim` given, the result holds both the values and their indices
max_along_columns = torch.max(tensor_a, dim=0)

# Minimum over dim=1: one minimum per row, again as (values, indices)
min_along_rows = torch.min(tensor_a, dim=1)

total_sum, mean_along_rows, max_along_columns, min_along_rows

(tensor(33.),
 tensor([5.3333, 5.6667]),
 torch.return_types.max(
 values=tensor([8., 9., 7.]),
 indices=tensor([1, 1, 0])),
 torch.return_types.min(
 values=tensor([4., 0.]),
 indices=tensor([0, 2])))

Broadcasting in PyTorch

The key idea behind broadcasting is that the smaller tensor is "broadcasted" or expanded to match the shape of the larger one

In [21]:
scalar = 2  # plain Python number; it is broadcast to every element of tensor_a
result_broadcast = tensor_a + scalar  # the result keeps tensor_a's shape (2, 3)
print(f'broadcast results is: {result_broadcast} and of shape {result_broadcast.shape}')

broadcast results is: tensor([[ 6.,  7.,  9.],
        [10., 11.,  2.]]) and of shape torch.Size([2, 3])


Suppose we have two 2x2 tensors, tensor_a and tensor_b, and we want to concatenate them along dimension 0 to create a new tensor with a shape of 4x2.

In [22]:
# Rebind tensor_a and tensor_b to two new 2x2 tensors
tensor_a, tensor_b = torch.tensor([[2, 2], [2, 2]]), torch.tensor([[4, 4], [4, 4]])
# dim=0 stacks the rows of tensor_b below the rows of tensor_a: (2, 2) + (2, 2) -> (4, 2)
concatenated_tensor = torch.cat((tensor_a, tensor_b), dim=0)
print(f'concatenated tensor is: {concatenated_tensor} and of shape {concatenated_tensor.shape}')

concatenated tensor is: tensor([[2, 2],
        [2, 2],
        [4, 4],
        [4, 4]]) and of shape torch.Size([4, 2])


### AutoGrad and Gradients

Autograd, short for Automatic Differentiation, is a key feature of PyTorch that allows for automatic computation of gradients (derivatives) of tensors. It is an essential component for training deep learning models through backpropagation.
1. **Gradient Calculation** - In deep learning, we often need to compute gradients of a loss function with respect to model parameters. Autograd simplifies this process. When you perform operations on tensors that require gradients, PyTorch automatically tracks these operations and constructs a computation graph.

2. **Computation Graph** - A computation graph is a directed acyclic graph (DAG) that represents the sequence of operations applied to tensors. Each operation in the graph is a node, and tensors flowing through these nodes are edges. The graph allows PyTorch to trace how input tensors influence the output tensors, which is crucial for gradient calculation.

3. **Dynamic Computational Graph** - PyTorch uses a dynamic computation graph, which means the graph is built on-the-fly as operations are executed. This dynamic nature allows flexibility and is well-suited for models with varying architectures or inputs of different shapes.

4. **Gradients** - Once you have a computation graph, you can compute gradients by backpropagating through the graph. Gradients represent how a small change in each input tensor would affect the final output. The gradients are computed using the chain rule of calculus, and they indicate the direction and magnitude of parameter updates during optimization.

In [30]:
x = torch.tensor([3.0, 2.0, 3.0], requires_grad=True)  # requires_grad=True makes autograd record the operations applied to x

In [31]:
# forward pass. PyTorch records these operations in the computation graph
y = x * 2  # element-wise: y_i = 2 * x_i
z = y.mean()  # scalar: the average of the three elements of y

### Backward pass

To compute gradients, initiate the backward pass using the backward() method on a scalar tensor (usually a loss)

Chain Rule: The backward pass uses the chain rule of calculus to calculate the gradients. It starts from the final scalar value z and works backward through the computation graph to compute the gradients of intermediate tensors with respect to the target tensor (x in this case).

It computes ∂z/∂y, which is the gradient of z with respect to y. Then, it computes ∂y/∂x, which is the gradient of y with respect to x. Multiplying the two gives ∂z/∂x = ∂z/∂y · ∂y/∂x, the gradient of z with respect to x

In [32]:
# Run the backward pass from the scalar z; the gradient of z with respect to x is stored in x.grad
z.backward()

The result of the backward pass is stored in the .grad attribute of the tensors with requires_grad=True. In this case, x.grad will contain the gradient of z with respect to x.

In [39]:
# Gradient of z with respect to x (one entry per element of x)
x.grad

tensor([0.6667, 0.6667, 0.6667])

In [None]:
# Create a tensor with Autograd enabled (requires_grad=True)
x = torch.tensor([2.0], requires_grad=True)

# Forward pass with Autograd enabled:
# w = mean((3x)^2) = 9x^2, so dw/dx = 18x, which is 36 at x = 2
y = x * 3
z = y ** 2
w = z.mean()

# Compute gradients while Autograd is enabled
w.backward()

# Access the gradient of x
gradient_with_autograd = x.grad

# Print the gradient
print("Gradient with Autograd:", gradient_with_autograd.item())


# Now, let's turn Autograd off for a specific tensor
x.requires_grad_(False)

# Perform operations without Autograd (Autograd is off for x),
# so no computation graph is recorded for y, z and w
y = x * 3
z = y ** 2
w = z.mean()
try:
    # Attempt to compute gradients
    w.backward()
except RuntimeError:
    # backward() raises RuntimeError when the tensor does not require grad.
    # Catching only that error keeps unrelated failures (and Ctrl+C) visible.
    print("This tensor does't have require gradients set to True")

Gradient with Autograd: 36.0
This tensor does't have require gradients set to True


### nn.Parameter

In PyTorch, nn.Parameter is a class that is a subclass of the torch.Tensor class. It is specifically designed to be used as a container for tensors that should be considered parameters of a PyTorch nn.Module. Parameters are tensors that are meant to be learned during the training process, such as weights and biases in a neural network.

Why nn.Parameter is useful?

* Requires Grad Calculation: When you assign an nn.Parameter to an attribute of an nn.Module, it is automatically registered as a parameter of that module. It has requires_grad=True by default, so PyTorch keeps track of it for gradient computation during backpropagation. This means that any operations involving these tensors will have gradients computed, allowing them to be updated during training using optimization techniques like stochastic gradient descent (SGD).
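* Initialization: nn.Parameter simply wraps the tensor you pass to it, so you choose the initial values yourself (e.g., torch.randn for Gaussian values or torch.zeros for zeros, as in the example below). Built-in layers such as nn.Linear create their parameters with a default random initialization, which you can customize if needed (see torch.nn.init).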
* Access: You can easily access the parameters of a PyTorch module using the parameters() method, which returns an iterable containing all the nn.Parameter objects within the module.

In [None]:
class MyModel(nn.Module):
    """Single affine layer built from hand-declared parameters.

    Maps inputs with 5 features to 10 outputs via ``x @ weight.T + bias``.
    """

    def __init__(self):
        super().__init__()
        # Assigning nn.Parameter objects as attributes registers them with
        # the module, which is why parameters() yields them below.
        self.weight = nn.Parameter(torch.randn(10, 5))  # shape (out, in)
        self.bias = nn.Parameter(torch.zeros(10))       # one bias per output

    def forward(self, x):
        """Return the affine transform of ``x`` (last dimension must be 5)."""
        projected = x @ self.weight.t()
        return projected + self.bias


# Build the model, then list every parameter it registered
model = MyModel()

for param in model.parameters():
    print(param)

Parameter containing:
tensor([[ 0.3426,  0.2997,  0.0477,  0.9563, -0.8480],
        [-2.2974, -1.8996,  0.8777,  0.4756, -1.2931],
        [-0.3829,  0.3655, -0.4606, -2.3426,  0.7322],
        [-0.0354,  0.4037, -0.9113,  1.4786,  0.5362],
        [-0.0795,  0.3935,  1.7341, -0.0545,  0.2704],
        [-1.1718,  0.6017, -0.2846, -0.2442,  1.0230],
        [-0.8148,  0.9376,  0.7216, -1.5338,  0.6573],
        [ 0.3531, -0.1765,  0.6951,  2.3895,  0.3509],
        [-0.1698, -0.3017, -0.5952,  0.6879,  0.5409],
        [-0.1112, -1.4798, -1.2071, -0.5397,  1.2772]], requires_grad=True)
Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0.], requires_grad=True)


## Linear Layer

The Linear layer (nn.Linear) in a PyTorch model receives input from every neuron of its preceding layer: it multiplies the input by a learned weight matrix and adds a learned bias (y = xWᵀ + b)

![](./Linear.PNG)

In [6]:
import torch
import torch.nn as nn


class MLP(nn.Module):
    """Two-layer perceptron: 128 inputs -> 64 hidden units (ReLU) -> 10 class scores."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=128, out_features=64)  # hidden layer
        self.relu = nn.ReLU()                                    # non-linearity
        self.fc2 = nn.Linear(in_features=64, out_features=10)   # one score per class

    def forward(self, x):
        """Map a ``(batch, 128)`` input to ``(batch, 10)`` raw class scores."""
        hidden = self.relu(self.fc1(x))
        return self.fc2(hidden)


model = MLP()
input_tensor = torch.randn(32, 128)  # 32 samples with 128 features each
output = model(input_tensor)
print(output.shape)

torch.Size([32, 10])


## Batch normalization (BatchNorm2d)

Batch normalization applies a transformation that maintains the mean output close to 0 and the output standard deviation close to 1

This is a normalization layer used in convolutional neural networks: it normalizes each channel of its input over the current mini-batch. It helps to stabilize and accelerate training and can improve the model's ability to generalize

![](./BatchNorm2d.PNG)

In [None]:
import torch
import torch.nn as nn


class CNN(nn.Module):
    """Conv -> BatchNorm -> ReLU -> MaxPool block for 3-channel images."""

    def __init__(self):
        super().__init__()
        # A 3x3 kernel with padding 1 keeps the height and width unchanged
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, padding=1)
        self.bn1 = nn.BatchNorm2d(num_features=16)  # one set of statistics per conv output channel
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # halves height and width

    def forward(self, x):
        """Map ``(batch, 3, H, W)`` to ``(batch, 16, H / 2, W / 2)``."""
        normalised = self.bn1(self.conv1(x))  # batch normalization right after the convolution
        return self.pool(self.relu(normalised))


model = CNN()
input_tensor = torch.randn(8, 3, 32, 32)  # batch of 8 RGB images of size 32x32
output = model(input_tensor)
print(output.shape)

torch.Size([8, 16, 16, 16])


## Dropout

The Dropout layer randomly sets input units to 0 with a specified probability (rate) at each step during training time. This regularization technique helps prevent overfitting by reducing the reliance on specific neurons

![](./Dropout.gif)

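No Dropout is applied during testing (after calling model.eval()). PyTorch uses "inverted" dropout: during training the activations that are kept are scaled by 1 / (1 - p), so no rescaling is needed at test time and the layer simply passes its input through unchanged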

In [1]:
import torch
import torch.nn as nn


class SimpleNN(nn.Module):
    """128 -> 64 -> 10 perceptron with dropout on the hidden activations."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=128, out_features=64)
        self.dropout = nn.Dropout(p=0.5)  # in training mode each hidden unit is zeroed with probability 0.5
        self.fc2 = nn.Linear(in_features=64, out_features=10)

    def forward(self, x):
        """Map a ``(batch, 128)`` input to ``(batch, 10)`` raw class scores."""
        activated = torch.relu(self.fc1(x))
        # Dropout sits between the hidden layer and the output layer
        return self.fc2(self.dropout(activated))


model = SimpleNN()
input_tensor = torch.randn(32, 128)  # batch of 32 samples with 128 features each
output = model(input_tensor)
print(output.shape)  # (32, 10): one score per class for every sample

torch.Size([32, 10])


## Conv2d

This is a convolutional layer used in neural networks for image processing. It applies learnable filters (kernels) to the input image to highlight features such as edges, textures, and objects

The nn.Conv2d layer in PyTorch creates a convolution kernel that is applied to the layer input to produce a tensor of outputs. It is a fundamental building block of convolutional neural networks (CNNs) used for tasks such as image classification and feature extraction

![](./Conv2d.gif)

In [3]:
import torch
import torch.nn as nn


class SimpleCNN(nn.Module):
    """Conv -> ReLU -> MaxPool block for 3-channel images."""

    def __init__(self):
        super().__init__()
        # 16 learnable 3x3 filters; stride 1 with padding 1 keeps height and width unchanged
        self.conv1 = nn.Conv2d(3, 16, kernel_size=3, stride=1, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)  # halves the height and width of the feature map

    def forward(self, x):
        """Map ``(batch, 3, H, W)`` to ``(batch, 16, H / 2, W / 2)``."""
        features = self.relu(self.conv1(x))
        return self.pool(features)


model = SimpleCNN()
input_tensor = torch.randn(8, 3, 32, 32)  # batch of 8 RGB images of size 32x32
output = model(input_tensor)
print(output.shape)

torch.Size([8, 16, 16, 16])


## MaxPool2d

The nn.MaxPool2d layer in PyTorch downsamples the input along its height and width by taking the maximum value over an input window. It is a common operation used in convolutional neural networks (CNNs) for reducing spatial dimensions while preserving important features

![](./maxpool2d.png)

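Why is it useful?
* Reduces the spatial size of the feature maps, which lowers the amount of computation and the number of parameters in the layers that follow (pooling itself has no learnable parameters)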
* Filters noise, highlighting the most significant features
* Makes the model resistant to small image shifts

## AvgPool2d

The nn.AvgPool2d layer in PyTorch downsamples the input along its height and width by taking the average value over an input window. It is a pooling operation commonly used in convolutional neural networks (CNNs) for reducing spatial dimensions while smoothing the features

![](./avgpool2d.png)

In [None]:
import torch.nn as nn

# Constructor signature of nn.AvgPool2d (illustrative only: kernel_size is a
# placeholder here, not a defined variable, so this cell is not meant to be run)
nn.AvgPool2d(
    kernel_size,  # size of the pooling window: an int (square window) or a tuple (height, width)
    stride=None,  # stride of the pooling operation along the height and width; defaults to kernel_size
    padding=0  # implicit zero padding added on both sides of the height and width
)

## RNN

This is a recurrent layer used in neural networks to process sequential data such as text, audio signals, and time series. It processes the input data step by step, preserving the hidden state that is transmitted between time steps

![](./rnn.png)

In [None]:
import torch
import torch.nn as nn


class RNN(nn.Module):
    """Stacked vanilla RNN followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the recurrent hidden state.
        output_size: Number of values predicted per sequence.
        num_layers: Number of stacked recurrent layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=2):
        super().__init__()
        # batch_first=True -> inputs and outputs are (batch, seq_len, features)
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)  # prediction head

    def forward(self, x):
        """Return one prediction per sequence, shape ``(batch, output_size)``."""
        # The first return value holds the top layer's output at every time
        # step; the final hidden state (second value) is not needed here.
        step_outputs, _ = self.rnn(x)
        last_step = step_outputs[:, -1]  # output at the last time step
        return self.fc(last_step)


input_size = 10
hidden_size = 20
output_size = 1    # a single value per sequence (e.g. regression)
seq_length = 5     # time steps per sequence
batch_size = 3     # sequences per batch

model = RNN(input_size, hidden_size, output_size)
input_tensor = torch.randn(batch_size, seq_length, input_size)
output = model(input_tensor)
output

tensor([[0.5087],
        [0.1668],
        [0.2136]], grad_fn=<AddmmBackward0>)

## LSTM (long short-term memory)

The Long Short-Term Memory (LSTM) layer in PyTorch is a type of RNN layer designed to capture long-range dependencies in sequential data. It uses a memory cell and gates to control the flow of information through the network.

![](./lstm.png)

In [None]:
import torch
import torch.nn as nn


class SimpleLSTM(nn.Module):
    """LSTM followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden and cell states.
        output_size: Number of values predicted per sequence.
        num_layers: Number of stacked LSTM layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True -> inputs and outputs are (batch, seq_len, features)
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one prediction per sequence, shape ``(batch, output_size)``."""
        # nn.LSTM returns (outputs for every step, (last hidden state, last
        # cell state)); only the per-step outputs are used here.
        step_outputs, _ = self.lstm(x)
        last_step = step_outputs[:, -1]  # output at the last time step
        return self.fc(last_step)


input_size = 10
hidden_size = 20
output_size = 1
seq_length = 5
batch_size = 3

model = SimpleLSTM(input_size, hidden_size, output_size)
input_tensor = torch.randn(batch_size, seq_length, input_size)
output = model(input_tensor)
print(output)

tensor([[-0.1357],
        [-0.1517],
        [-0.3170]], grad_fn=<AddmmBackward0>)


## GRU

The Gated Recurrent Unit (GRU) layer in PyTorch is another type of RNN layer that is computationally efficient and can capture long-range dependencies. It uses update and reset gates to control information flow

![](./gru.png)

In [22]:
import torch
import torch.nn as nn


class SimpleGRU(nn.Module):
    """GRU followed by a linear head on the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: Size of the hidden state.
        output_size: Number of values predicted per sequence.
        num_layers: Number of stacked GRU layers.
    """

    def __init__(self, input_size, hidden_size, output_size, num_layers=1):
        super().__init__()
        # batch_first=True -> inputs and outputs are (batch, seq_len, features)
        self.gru = nn.GRU(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return one prediction per sequence, shape ``(batch, output_size)``."""
        # The final hidden state (second return value) is not needed here
        step_outputs, _ = self.gru(x)
        last_step = step_outputs[:, -1]  # output at the last time step
        return self.fc(last_step)


input_size = 10
hidden_size = 20
output_size = 1
seq_length = 5
batch_size = 3

model = SimpleGRU(input_size, hidden_size, output_size)
input_tensor = torch.randn(batch_size, seq_length, input_size)
output = model(input_tensor)
print(output)

tensor([[0.2540],
        [0.1755],
        [0.0056]], grad_fn=<AddmmBackward0>)


## Multi-head Attention

Multi-Head Attention is a crucial component of transformer-based neural networks, such as the Transformer model and its variants (e.g., BERT, GPT). It enables the model to focus on different parts of the input sequence simultaneously, allowing it to capture complex relationships and dependencies within the data

![](./attention%20block.png)

In [None]:
# Constructor signature of nn.MultiheadAttention (illustrative only: embed_dim
# and num_heads are placeholders here, so this cell is not meant to be run)
nn.MultiheadAttention(
    embed_dim,  # total dimension of the model (of the input embeddings); it is split across the heads
    num_heads,  # number of parallel attention heads; each head works on embed_dim // num_heads features
    dropout=0.0,  # dropout probability applied to the attention weights (default is 0.0)
    bias=True,  # If True, adds bias to the input / output projection layers (default is True)
    add_bias_kv=False,  #  If True, adds bias to the key and value sequences (default is False)
    add_zero_attn=False,  # If True, adds a new batch of zeros to the key and value sequences (default is False)
    kdim=None,  # Total number of features of the keys. By default, it's set to embed_dim
    vdim=None  # Total number of features of the values. By default, it's set to embed_dim
)

In [25]:
import torch
import torch.nn as nn


# attention parameters
embed_dim = 32   # dimension of the input features
num_heads = 4    # Number of heads of attention
seq_length = 5
batch_size = 3   # samples in batch

# multi-head attention layer; batch_first=True -> tensors are (batch, seq, feature)
mha = nn.MultiheadAttention(embed_dim, num_heads, batch_first=True)

# input data (query, key, value)
query = torch.randn(batch_size, seq_length, embed_dim)
key = torch.randn(batch_size, seq_length, embed_dim)
value = torch.randn(batch_size, seq_length, embed_dim)

# MultiheadAttention returns the attended output and the attention weights
output, attention_weights = mha(query, key, value)

print(output.shape)  # [batch_size, seq_len, embed_dim] → [3, 5, 32]
# The weights are averaged over the heads by default (average_attn_weights=True),
# so there is no num_heads dimension in the result
print(attention_weights.shape)  # [batch_size, seq_len, seq_len] → [3, 5, 5]

torch.Size([3, 5, 32])
torch.Size([3, 5, 5])


## Embedding layer

The Embedding Layer in PyTorch is used to create dense representations of categorical variables, commonly used in natural language processing tasks where words are converted into numerical vectors.

In [None]:
import torch.nn as nn

# Constructor signature of nn.Embedding (illustrative only: num_embeddings and
# embedding_dim are placeholders that must be replaced with real integers).
# Every argument is described below.
embedding_layer = nn.Embedding(
    num_embeddings, embedding_dim, padding_idx=None,
    max_norm=None, norm_type=2.0, scale_grad_by_freq=False,
    sparse=False, _weight=None
)

**num_embeddings**: Integer, the size of the vocabulary, i.e., the total number of unique categories

**embedding_dim**: Integer, the dimension of the dense embedding

**padding_idx**: Optional integer, indicating the padding index. If specified, the embedding vector at this index is initialized to all zeros and is not updated during training (it does not contribute to the gradient)

**max_norm**: Optional float, if specified, will normalize embeddings during forward pass to have a maximum norm of this value

**norm_type**: Float, the type of norm to be applied when max_norm is specified (e.g., 2.0 for L2 norm)

**scale_grad_by_freq**: Boolean, whether to scale gradients by the frequency of the words during training

**sparse**: Boolean, indicating whether to use sparse gradients for embeddings

**_weight**: Optional pre-trained embedding weights (a tensor)

## Custom Layer in PyTorch

Custom layers in PyTorch allow you to define your own neural network components with custom behavior. We can create custom layers by subclassing nn.Module. Implement the __init__ method to set up any learnable parameters and other configuration options. Then, implement the forward method to define the forward pass logic.



In [None]:
import torch
import torch.nn as nn
import numpy


class CustomLinear(nn.Module):
    """Hand-written fully connected layer computing ``x @ weight.T + bias``.

    Args:
        in_features: Size of each input sample.
        out_features: Size of each output sample.
    """

    def __init__(self, in_features, out_features):
        super().__init__()
        # Both parameters start from standard-normal random values
        self.weight = nn.Parameter(torch.randn(out_features, in_features))
        self.bias = nn.Parameter(torch.randn(out_features))

    def forward(self, x):
        """Map ``(..., in_features)`` to ``(..., out_features)``."""
        projected = x @ self.weight.t()
        return projected + self.bias


# Layer mapping 64 input features to 32 output features
custom_layer = CustomLinear(64, 32)

# Batch of 16 random samples with 64 features each
input_data = torch.randn(16, 64)

# Calling the layer runs its forward pass
output = custom_layer(input_data)
print(output.shape)

torch.Size([16, 32])


## Activation function

Activation functions are essential components in neural networks that introduce non-linearity, allowing neural networks to model complex relationships in data

## Sigmoid function

Sigmoid is a common activation function that maps input values to the range (0, 1). It is often used in binary classification problems where the output represents probabilities

In [6]:
# Create a sample input tensor
input_tensor = torch.tensor([2.0, 1.0, -2.0])

# Define the Sigmoid activation function: sigmoid(x) = 1 / (1 + exp(-x))
sigmoid = nn.Sigmoid()

# Apply Sigmoid to the input (element by element)
output = sigmoid(input_tensor)

output

tensor([0.8808, 0.7311, 0.1192])

## Softmax

Softmax is used in multi-class classification problems to convert a vector of raw scores into a probability distribution over multiple classes. It exponentiates each score and normalizes them to sum to 1

In [7]:
# Create a sample input tensor (raw scores)
input_tensor = torch.tensor([2.0, 1.0, 0.1])

# Define the Softmax activation function.
# dim=0 normalizes along the only dimension of this 1-D tensor, so the outputs sum to 1
softmax = nn.Softmax(dim=0)

# Apply Softmax to the input
output = softmax(input_tensor)

print(output)

tensor([0.6590, 0.2424, 0.0986])


## ReLU (Rectified Linear Activation)

ReLU (Rectified Linear Unit) is one of the most commonly used activation functions in neural networks. It replaces all negative values in the input with zero and keeps positive values unchanged

In [8]:
# Create a sample input tensor with both negative and positive values
input_tensor = torch.tensor([-1.0, 2.0, -0.5, 3.0])

# Define the ReLU activation function: relu(x) = max(0, x)
relu = nn.ReLU()

# Apply ReLU to the input: negative values become 0, positive values pass through
output = relu(input_tensor)

print(output)

tensor([0., 2., 0., 3.])


## Custom Activation Function in PyTorch

In [None]:
import torch
import torch.nn as nn


class Activation(nn.Module):
    """Custom activation computing ``x * sigmoid(x)`` (known as Swish / SiLU).

    Args:
        x: Optional tensor to bind at construction time. Kept so that the
            older ``Activation(t).forward()`` usage still works; new code
            should build the module without arguments and pass the input
            when calling it, like any other ``nn.Module``.
    """

    def __init__(self, x=None):
        super().__init__()
        self.x = x

    def forward(self, x=None):
        """Apply the activation element-wise.

        Args:
            x: Input tensor. When omitted, the tensor given to the
                constructor is used instead.

        Returns:
            A tensor with the same shape as the input.

        Raises:
            ValueError: If no input was given here or at construction.
        """
        if x is None:
            x = self.x
        if x is None:
            raise ValueError("Activation needs an input tensor")
        return x * torch.sigmoid(x)


input_tensor = torch.tensor([-1.0, 2.0, -0.5, 3.0])
# Taking the input at call time lets the module be reused on any tensor
# and placed inside containers such as nn.Sequential
function = Activation()
function(input_tensor)

tensor([-0.2689,  1.7616, -0.1888,  2.8577])

## Building models in PyTorch

Steps to Create a Custom Model

1) **Import Dependencies**: Import the necessary PyTorch modules and packages, such as torch.nn for defining neural network components.

2) **Define the Model Class**: Create a custom Python class that inherits from nn.Module. This class will represent your neural network model. Define the network's architecture by adding layers and specifying their forward pass in the forward method.

3) **Initialize Layers**: In the __init__ method of your custom model class, initialize the layers (e.g., convolutional layers, fully connected layers) that you'll use in your neural network.

4) **Forward Pass**: Implement the forward pass in the forward method of your model class. This method defines how the input data passes through the layers of your model to produce an output.

5) **Training and Optimization**: After defining your custom model, you can use it for training and optimization tasks. You'll need to define a loss function and choose an optimization algorithm (e.g., stochastic gradient descent) to train your model on your dataset.

In [None]:
import torch
import torch.nn as nn


class MyModel(nn.Module):
    """Fully connected network: 64 inputs -> 128 hidden units (ReLU) -> 10 outputs."""

    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(in_features=64, out_features=128)  # hidden layer
        self.fc2 = nn.Linear(in_features=128, out_features=10)  # output layer

    def forward(self, x):
        """Map a ``(batch, 64)`` input to ``(batch, 10)`` outputs."""
        hidden = torch.relu(self.fc1(x))
        return self.fc2(hidden)


model = MyModel()

A more complex Convolutional Neural Network (CNN) model using PyTorch. This model is designed for image classification tasks and consists of both convolutional and fully connected layers

The use of nn.Sequential allows us to group layers together in a sequential manner, making the code more concise and readable.

In [1]:
import torch
import torch.nn as nn


class ComplexCNN(nn.Module):
    """Three conv blocks followed by a two-layer classifier.

    Each conv block is Conv2d -> ReLU -> BatchNorm2d -> MaxPool2d. The three
    2x2 max-pools shrink the height and width by a factor of 8 in total.

    Args:
        num_classes: Number of output scores per image.
        input_size: Height and width of the input images, either a single
            int (square images) or a ``(height, width)`` tuple. Defaults to
            32x32, which gives the ``256 * 4 * 4`` flattened features the
            classifier was originally hard-coded for.

    Raises:
        ValueError: If the images are too small to survive three poolings.
    """

    def __init__(self, num_classes, input_size=(32, 32)):
        super().__init__()

        if isinstance(input_size, int):
            input_size = (input_size, input_size)
        height, width = input_size
        # Every MaxPool2d(2, 2) floors the spatial size in half; three in a
        # row are equivalent to an integer division by 8.
        pooled_height, pooled_width = height // 8, width // 8
        if pooled_height < 1 or pooled_width < 1:
            raise ValueError(
                f"input_size {input_size} is too small: height and width must be at least 8"
            )

        # Sequential block for convolutional layers
        self.conv_layers = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(64),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(128),
            nn.MaxPool2d(kernel_size=2, stride=2),

            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.BatchNorm2d(256),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )

        # Sequential block for fully connected layers; the first layer's
        # input size is derived from the image size instead of hard-coded
        self.fc_layers = nn.Sequential(
            nn.Linear(256 * pooled_height * pooled_width, 512),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        """Map ``(batch, 3, height, width)`` images to ``(batch, num_classes)`` scores."""
        # Forward pass through convolutional layers
        x = self.conv_layers(x)

        # Flatten the feature maps, keeping the batch dimension
        x = torch.flatten(x, start_dim=1)

        # Forward pass through fully connected layers
        x = self.fc_layers(x)

        return x

## Visualizing Model Architecture with torchsummary

In [12]:
import torch
import torch.nn as nn
from torchsummary import summary


# torchsummary creates its dummy input on CUDA by default whenever a GPU is
# available, so the model must live on the same device that is passed to
# summary(); otherwise the forward pass fails with a device mismatch.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Two-class model, summarised for a 3-channel 32x32 input
model = ComplexCNN(2).to(device)
summary(model, (3, 32, 32), device=device)

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 32, 32]           1,792
              ReLU-2           [-1, 64, 32, 32]               0
       BatchNorm2d-3           [-1, 64, 32, 32]             128
         MaxPool2d-4           [-1, 64, 16, 16]               0
            Conv2d-5          [-1, 128, 16, 16]          73,856
              ReLU-6          [-1, 128, 16, 16]               0
       BatchNorm2d-7          [-1, 128, 16, 16]             256
         MaxPool2d-8            [-1, 128, 8, 8]               0
            Conv2d-9            [-1, 256, 8, 8]         295,168
             ReLU-10            [-1, 256, 8, 8]               0
      BatchNorm2d-11            [-1, 256, 8, 8]             512
        MaxPool2d-12            [-1, 256, 4, 4]               0
           Linear-13                  [-1, 512]       2,097,664
             ReLU-14                  [

## Callbacks

A callback is a mechanism in deep learning frameworks like PyTorch or TensorFlow that allows you to customize and extend the behavior of the training process of a neural network. It provides a way to specify certain actions to be taken at various points during training, such as at the end of an epoch or after each batch. Callbacks are often used for purposes like monitoring training progress, saving model checkpoints, applying learning rate schedules, early stopping, and more.

**Customizable Actions**: Callbacks enable you to define custom actions or functions that will be executed at specific points during training. For example, you can specify that you want to save the model's weights after each epoch or display training metrics periodically.

**Modular and Reusable**: Callbacks are modular and reusable pieces of code. You can create your own custom callbacks to perform specific tasks tailored to your project's requirements.

**Non-Intrusive**: Callbacks don't interfere with the core training loop of the deep learning framework. They complement the training process without altering the fundamental training algorithm.

**Monitoring and Logging**: Callbacks are commonly used for monitoring training metrics such as loss, accuracy, or custom evaluation metrics. You can log these metrics to track how your model is performing over time.

**Early Stopping**: One common use of callbacks is early stopping, where training is halted when a certain condition is met, such as when the validation loss stops improving, to prevent overfitting.

**Model Checkpoints**: You can use callbacks to save model checkpoints at specific intervals, ensuring that you can restore the model to a particular state if needed.

**Learning Rate Scheduling**: Callbacks can be used to adjust the learning rate during training, enabling you to fine-tune the training process for better convergence.

**TensorBoard Integration**: Callbacks can integrate with tools like TensorBoard for visualizing and analyzing training progress.

**Callback Chains**: Multiple callbacks can be chained together to perform a sequence of actions during training.

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


# base callback class
class Callback:
    """Base training callback that accumulates and prints the training loss.

    The hooks exchange state through a ``logs`` dictionary supplied by the
    training loop. Keys read or written by this class:

    * ``'loss'``       - loss of the batch that just finished (read only)
    * ``'epoch_loss'`` - sum of the batch losses seen in the current epoch
    * ``'batches'``    - number of batches seen in the current epoch
    * ``'history'``    - ``{'loss': [...]}`` with one average per epoch

    Every hook accepts ``logs=None`` or a dictionary with missing keys;
    absent counters are treated as zero and an absent history is created.
    """

    def on_train_begin(self, logs=None):
        """Start a fresh loss history in ``logs`` and announce the run."""
        if logs is None:
            logs = {}

        # create learning history
        logs['history'] = {'loss': []}
        print("Training start: initialization of the settings metric")

    def on_epoch_begin(self, epoch, logs=None):
        """Reset the per-epoch counters; ``epoch`` is the zero-based index."""
        if logs is None:
            logs = {}

        # reset the counter for the current epoch
        logs['epoch_loss'] = 0.0
        logs['batches'] = 0
        print(f"Epoch's start {epoch+1}.")

    def on_batch_end(self, batch, logs=None):
        """Add the batch loss to the epoch counters and print it.

        ``batch`` is the zero-based batch index. A missing ``'loss'`` entry
        counts as 0.0, and missing counters start from zero instead of
        raising ``KeyError``.
        """
        if logs is None:
            logs = {}

        batch_loss = logs.get('loss', 0.0)
        logs['epoch_loss'] = logs.get('epoch_loss', 0.0) + batch_loss
        logs['batches'] = logs.get('batches', 0) + 1  # update batches counter
        # output an intermediate value for the current batch
        print(f"  Batch {batch+1} is done, loss = {batch_loss:.4f}")

    def on_epoch_end(self, epoch, logs=None):
        """Append the epoch's average loss to the history and print it.

        The average is NaN when no batch was recorded for the epoch.
        """
        if logs is None:
            logs = {}

        # calculate avg loss for epoch
        batches = logs.get('batches', 0)
        avg_loss = logs.get('epoch_loss', 0.0) / batches if batches > 0 else float('nan')
        # save to history, creating it if on_train_begin was never called
        logs.setdefault('history', {}).setdefault('loss', []).append(avg_loss)
        print(f"Epoch {epoch+1} is done. Avg loss: {avg_loss:.4f}")

    def on_train_end(self, logs=None):
        """Announce the end of training; ``logs`` is not used."""
        print("The end of training")


# concrete callback class
class MyLoggingCallback(Callback):
    # Inherits every hook from Callback unchanged. Override any of the
    # on_* methods here to customize what is tracked or printed.
    pass


# Synthetic regression data: 100 samples, 10 features, 1 target per sample
X = torch.randn(100, 10)
y = torch.randn(100, 1)

# simple model with 1 layer, trained with MSE loss and plain SGD
model = nn.Linear(10, 1)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.01)

# callbacks to notify during training (the list may hold several)
callbacks = [MyLoggingCallback()]

# state dictionary shared with the callbacks for the whole run
logs = {}

# epochs and batch size
num_epochs = 5
batch_size = 20
# Ceiling division: a trailing partial batch is trained on instead of being
# silently dropped when the sample count is not a multiple of batch_size.
num_batches = (X.size(0) + batch_size - 1) // batch_size


# before training, let the callbacks initialize the shared logs
for cb in callbacks:
    cb.on_train_begin(logs)

for epoch in range(num_epochs):
    for cb in callbacks:
        cb.on_epoch_begin(epoch, logs)

    for batch in range(num_batches):
        start = batch * batch_size
        end = start + batch_size  # slicing clamps this to the dataset size
        inputs = X[start:end]
        targets = y[start:end]

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        # read the scalar loss once and reuse it below
        batch_loss = loss.item()

        # Per-batch logs handed to the callbacks. The zeroed counters are
        # there because on_batch_end increments them; the epoch totals are
        # kept in the shared logs dictionary below.
        batch_logs = {'loss': batch_loss, 'epoch_loss': 0, 'batches': 0}

        for cb in callbacks:
            cb.on_batch_end(batch, logs=batch_logs)

        # accumulate the values for the epoch in the general logs dictionary
        logs['epoch_loss'] = logs.get('epoch_loss', 0.0) + batch_loss
        logs['batches'] = logs.get('batches', 0) + 1

    for cb in callbacks:
        cb.on_epoch_end(epoch, logs)

for cb in callbacks:
    cb.on_train_end(logs)


Training start: initialization of the settings metric
Epoch's start 1.
  Batch 1 is done, loss = 1.6179
  Batch 2 is done, loss = 1.2072
  Batch 3 is done, loss = 1.6743
  Batch 4 is done, loss = 0.7493
  Batch 5 is done, loss = 0.7382
Epoch 1 is done. Avg loss: 1.1974
Epoch's start 2.
  Batch 1 is done, loss = 1.4886
  Batch 2 is done, loss = 1.0242
  Batch 3 is done, loss = 1.6143
  Batch 4 is done, loss = 0.6692
  Batch 5 is done, loss = 0.6592
Epoch 2 is done. Avg loss: 1.0911
Epoch's start 3.
  Batch 1 is done, loss = 1.3882
  Batch 2 is done, loss = 0.8857
  Batch 3 is done, loss = 1.5706
  Batch 4 is done, loss = 0.6095
  Batch 5 is done, loss = 0.6045
Epoch 3 is done. Avg loss: 1.0117
Epoch's start 4.
  Batch 1 is done, loss = 1.3096
  Batch 2 is done, loss = 0.7801
  Batch 3 is done, loss = 1.5383
  Batch 4 is done, loss = 0.5650
  Batch 5 is done, loss = 0.5672
Epoch 4 is done. Avg loss: 0.9520
Epoch's start 5.
  Batch 1 is done, loss = 1.2476
  Batch 2 is done, loss = 0.6987