### Common Mistakes:
1. **Didn't overfit a single batch**: First overfit a ***single batch*** to check that the NN trains without any errors, then pass the whole dataset. Grab one batch with `data, labels = next(iter(train_loader))`.

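2. ***Forgot to toggle train/eval:*** First call `model.eval()`, then run the test loop inside `with torch.no_grad():`, and finally switch back with `model.train()` before training again. Evaluating in the correct mode (Dropout disabled, BatchNorm using its running statistics) will improve the measured model performance.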

3. **Forgot `.zero_grad()`:** `optimizer.zero_grad()` is used to clear the gradients of all model parameters before the next iteration; without it, gradients accumulate across iterations.

4. **Softmax when using CrossEntropy:** If we use `CrossEntropyLoss()`, it already applies a `softmax` (as log-softmax) internally, so do not add a softmax layer at the end of the model.

5. ***Bias term with BatchNorm:*** When we add BatchNorm after a **CNN layer or Linear layer**, we should set `bias=False` for those layers, because the BatchNorm layer adds its own bias term internally.

6. **Using View as Permute:**
    * `view:` The view function is used to reshape a tensor while maintaining the total number of elements.
    * `permute:` The permute function is used to rearrange the dimensions of a tensor.

7. **Incorrect Data Augmentation:**

8. **Not shuffling data:**

9. **Not normalizing data:** Add a `Normalize` to the transform.

10. **Not clipping gradients (`torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)`):** Gradient clipping is commonly used during training to clip gradients to a specified maximum norm in order to prevent exploding gradients and improve the stability of the optimization process. It is commonly used in `RNNs, LSTMs, GRUs`.

#### 1. Import Section:

In [1]:
# import necessary modules
import torch
import torchvision
import torch.nn.functional as F  # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets  # Standard datasets
from torch.utils.data import Dataset, DataLoader # Gives easier dataset managment by creating mini batches etc.
import torchvision.transforms as transforms  # Transformations we can perform on our dataset for augmentation
from torch import optim  # For optimizers like SGD, Adam, etc.
from torch import nn  # All neural network modules
from tqdm import tqdm # for nice progress bars
from torchinfo import summary

#### 2. Config Section:

In [2]:
# Device configuration: prefer the GPU when CUDA is available, otherwise use the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [3]:
# Hyperparameters
input_size = 28 * 28  # one flattened 28x28 MNIST image
num_classes = 10  # one class per digit, 0-9
learning_rate = 1e-3
batch_size = 64
num_epochs = 3

#### 3. Data Processing Section:

In [19]:
# Load Data: both MNIST splits share one transform that converts images to tensors.
to_tensor = transforms.ToTensor()
train_dataset = datasets.MNIST(root="./data", train=True, download=True, transform=to_tensor)
test_dataset = datasets.MNIST(root="./data", train=False, download=True, transform=to_tensor)

# Mini-batch loaders; both are created with shuffling enabled.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)

In [20]:
# Printing a DataLoader shows only its default object repr, not the batches it yields.
print(train_loader)

<torch.utils.data.dataloader.DataLoader object at 0x000001D120946490>


#### 4. Model Section:

In [5]:
class NN(nn.Module):
    """Fully connected classifier with one hidden layer followed by dropout.

    Args:
        input_size: Number of features in each (flattened) input sample.
        num_classes: Number of output scores produced per sample.
        hidden_size: Width of the hidden layer. Defaults to 50.
        dropout_p: Probability passed to the dropout layer. Defaults to 0.5.
    """

    def __init__(self, input_size, num_classes, hidden_size=50, dropout_p=0.5):
        super().__init__()
        # Attribute order is kept (fc1, fc2, dropout) so parameter
        # initialization order and printed summaries stay unchanged.
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, num_classes)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, x):
        """Return raw class scores for ``x``; no softmax is applied here.

        The last dimension of ``x`` must equal ``input_size``.
        """
        hidden = F.relu(self.fc1(x))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [6]:
# Initialize network: build the model, move it to the selected device,
# then print a per-layer summary for a single flattened 28x28 input.
model = NN(input_size=input_size, num_classes=num_classes)
model = model.to(device)
summary(model, input_size=(1, 1, 28 * 28))

Layer (type:depth-idx)                   Output Shape              Param #
NN                                       [1, 1, 10]                --
├─Linear: 1-1                            [1, 1, 50]                39,250
├─Dropout: 1-2                           [1, 1, 50]                --
├─Linear: 1-3                            [1, 1, 10]                510
Total params: 39,760
Trainable params: 39,760
Non-trainable params: 0
Total mult-adds (Units.MEGABYTES): 0.04
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.16
Estimated Total Size (MB): 0.16

#### 5. Loss and Optimizer Section:

In [7]:
# Loss and optimizer
# Cross-entropy loss is applied directly to the model's raw output scores.
loss_fn = nn.CrossEntropyLoss()
# Adam updates all of the model's parameters using the configured learning rate.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

#### 6. Train Model Section:

In [8]:
# Train Network
def train_nn(model, num_epochs, loader, loss_fn, optimizer):
    """Train ``model`` for ``num_epochs`` passes over ``loader``.

    Every batch is moved to the module-level ``device``, flattened to
    ``(batch, features)`` and used for one optimizer step. After each epoch
    the mean of the per-batch losses is printed.

    Args:
        model: Network to train; its parameters are updated in place.
        num_epochs: Number of full passes over ``loader``.
        loader: Iterable yielding ``(data, targets)`` batches.
        loss_fn: Callable mapping ``(scores, targets)`` to a scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``model``.
    """
    # Make sure mode-dependent layers (e.g. dropout) are in training mode,
    # whatever mode the caller left the model in.
    model.train()

    for epoch in range(num_epochs):
        losses = []
        for data, targets in tqdm(loader):
            # Get data to cuda if possible
            data = data.to(device=device)
            targets = targets.to(device=device)

            # Flatten each sample to a 1-D feature vector
            data = data.reshape(data.shape[0], -1)

            # Forward
            scores = model(data)
            loss = loss_fn(scores, targets)
            # Store a plain Python float: appending the tensor itself would keep
            # every batch's autograd graph alive until the end of the epoch.
            losses.append(loss.item())

            # Backward: clear stale gradients, then compute fresh ones
            optimizer.zero_grad()
            loss.backward()

            # Gradient descent or adam step -> update the parameters from the gradients
            optimizer.step()

        # An empty loader yields no losses; report NaN instead of dividing by zero.
        mean_loss = sum(losses) / len(losses) if losses else float("nan")
        print(f"loss at each epoch {mean_loss:.5f}")

train_nn(
    model=model,
    num_epochs=num_epochs,
    loader=train_loader,
    loss_fn=loss_fn,
    optimizer=optimizer,
)

  0%|          | 0/938 [00:00<?, ?it/s]

100%|██████████| 938/938 [00:12<00:00, 76.70it/s] 


loss at each epoch 0.66677


100%|██████████| 938/938 [00:09<00:00, 98.67it/s] 


loss at each epoch 0.40861


100%|██████████| 938/938 [00:09<00:00, 99.74it/s] 

loss at each epoch 0.35974





#### 7. Model Evaluation Section:

In [9]:
# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    The model is switched to evaluation mode and gradients are disabled while
    the batches are scored. Afterwards the model is put back into whichever
    mode (train or eval) it was in when this function was called.

    Args:
        loader: Iterable yielding ``(inputs, labels)`` batches.
        model: Network whose predictions are checked.

    Returns:
        The accuracy in ``[0, 1]`` as a 0-dim tensor.

    Raises:
        ZeroDivisionError: If ``loader`` yields no batches.
    """
    num_correct = 0
    num_samples = 0

    # Remember the caller's mode so evaluation does not silently change it.
    was_training = model.training
    model.eval()

    try:
        with torch.no_grad():
            # Loop through the data
            for x, y in loader:
                # Move data to device
                x = x.to(device=device)
                y = y.to(device=device)

                # Flatten each sample to a 1-D feature vector
                x = x.reshape(x.shape[0], -1)

                # Forward pass; the predicted class is the index of the highest score
                scores = model(x)
                _, predictions = scores.max(1)

                # Check how many we got correct
                num_correct += (predictions == y).sum()

                # Keep track of number of samples
                num_samples += predictions.size(0)
    finally:
        # Restore the original mode even if scoring a batch raised.
        model.train(was_training)

    return num_correct / num_samples

In [10]:
# Report the accuracy on the training and test sets to see how good our model is
train_accuracy = check_accuracy(train_loader, model)
print(f"Accuracy on training set: {train_accuracy*100:.2f}")
test_accuracy = check_accuracy(test_loader, model)
print(f"Accuracy on test set: {test_accuracy*100:.2f}")

Accuracy on training set: 94.55
Accuracy on test set: 94.41


In [26]:
import torch

# Build the 1-D tensor 0..11
x = torch.arange(12)
# View the same 12 elements as 3 rows; -1 lets PyTorch infer the 4 columns
y = x.view(3, -1)
y

tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])

In [27]:
# Create a random tensor of shape (2, 3, 4)
x = torch.rand(2, 3, 4)

# Reorder the axes: old dims (1, 2, 0) become the new dims (0, 1, 2)
new_order = (1, 2, 0)
y = x.permute(*new_order)
y, x.shape

(tensor([[[0.7752, 0.6201],
          [0.5137, 0.0647],
          [0.1324, 0.3990],
          [0.7703, 0.9250]],
 
         [[0.5311, 0.7856],
          [0.4865, 0.6590],
          [0.3296, 0.0250],
          [0.2081, 0.0032]],
 
         [[0.8647, 0.2242],
          [0.7728, 0.1922],
          [0.5120, 0.8592],
          [0.5363, 0.8615]]]),
 torch.Size([2, 3, 4]))