## Loss functions

In [None]:
from torch import nn
import torch

"""Classification loss"""
# NOTE: every assignment in this cell rebinds the same name `loss_fn`, so only
# the last one is in effect after the cell runs; keep just the line you need.
loss_fn = nn.BCELoss()  # binary classification
loss_fn = nn.CrossEntropyLoss()  # multi-class classification


"""Regression loss"""
loss_fn = nn.L1Loss()  # mean absolute error (MAE) loss
loss_fn = nn.MSELoss()  # mean squared error (MSE) loss

## Optimizers

In [None]:
from torch import  optim

"""optimize full parameters"""
# NOTE: every assignment in this cell rebinds `optimizer`; the lines are
# alternatives, not a sequence. `model`, `learning_rate`, `var1` and `var2`
# must already exist. The `optim` alias imported by this cell is used
# throughout so the cell does not also depend on `import torch`.
optimizer = optim.SGD(model.parameters(), lr=learning_rate)

"""optimize partial parameters"""
# Pass an explicit list of tensors to optimize only those tensors.
optimizer = optim.Adam([var1, var2], lr=0.0001)
# Per-parameter-group options: a group without its own 'lr' falls back to the
# top-level lr=1e-2, and momentum=0.9 applies to every group.
optimizer = optim.SGD(
    [
        {'params': model.base.parameters()},
        {'params': model.classifier.parameters(), 'lr': 1e-3},
    ],
    lr=1e-2,
    momentum=0.9,
)

"""different optimizers"""
optimizer = optim.SGD(model.parameters(),
                      lr=learning_rate,
                      momentum=0.8)
optimizer = optim.Rprop(model.parameters(),
                        lr=learning_rate,
                        etas=(0.5, 1.2))
optimizer = optim.RMSprop(model.parameters(),
                          lr=learning_rate,
                          alpha=0.99,
                          momentum=0)
optimizer = optim.Adam(model.parameters(),
                       lr=learning_rate,
                       betas=(0.9, 0.999))
optimizer = optim.AdamW(model.parameters(),
                        lr=learning_rate,
                        betas=(0.9, 0.999))

"""Common hyperparameters
- weight_decay (float): weight-decay coefficient. SGD, RMSprop and Adam add it
  to the gradient as an L2 penalty (default 0); AdamW applies it as decoupled
  weight decay (default 1e-2).
- momentum in SGD, RMSprop (float, default 0): momentum factor applied to the
  running update.
- etas in Rprop (tuple, default (0.5, 1.2)): (etaminus, etaplus), the
  multiplicative factors that shrink / grow the per-parameter step size.
- betas for Adam, AdamW (tuple, default (0.9, 0.999)): coefficients of the
  running averages of the gradient and of its square.
"""

## Learning rate decay

In [None]:
from torch.optim import lr_scheduler

"""Define schedular"""
# StepLR multiplies the learning rate by gamma (0.1) every step_size (30) epochs.
# Assuming the optimizer starts with lr = 0.05:
#     lr = 0.05     if epoch < 30
#     lr = 0.005    if 30 <= epoch < 60
#     lr = 0.0005   if 60 <= epoch < 90
scheduler = lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

"""Different schedular"""
# MultiStepLR decays the learning rate of each parameter group by gamma
# once the number of epochs reaches one of the milestones.
# Assuming the optimizer uses lr = 0.05 for all groups:
#     lr = 0.05     if epoch < 10
#     lr = 0.005    if 10 <= epoch < 30
#     lr = 0.0005   if epoch >= 30
# NOTE: this rebinds `scheduler`; the StepLR created above is no longer
# reachable through that name. Treat the two as alternatives.
scheduler = lr_scheduler.MultiStepLR(optimizer, gamma=0.1, milestones=[10, 30])

"""Check last learning rate"""
# [0] reads the first entry of the returned sequence — presumably one entry
# per optimizer parameter group; confirm against the scheduler docs.
scheduler.get_last_lr()[0]

In [None]:
"""Usage"""
# Skeleton of a training run with learning-rate decay. `model`, `criterion`,
# `optimizer`, `scheduler` and `train_loader` must already be defined.
num_epochs = 100
for epoch in range(num_epochs):
    ...

    # Reset the per-epoch accumulator; without this the `+=` below raises
    # NameError on the very first batch.
    running_loss = 0.0

    for inputs, targets in train_loader:
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)
        loss = criterion(outputs, targets)

        # Backward pass and optimize
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    # Learning rate decay: step the scheduler once per epoch, after the
    # optimizer updates of that epoch.
    scheduler.step()

## Early Stopping

In [None]:
import numpy as np


class EarlyStopping:
    """Track validation loss and flag when it has stopped improving.

    Call the instance once per epoch with the current validation loss, then
    read ``early_stop``. A call counts as an improvement when
    ``val_loss <= best_loss - delta``; any other call increments ``counter``.
    """

    def __init__(self, patience=7, verbose=False, delta=0):
        """
        :param patience: Number of consecutive non-improving calls after which ``early_stop`` becomes True.
        :param verbose: If True, prints the patience counter every time the loss fails to improve.
        :param delta: Minimum decrease of the monitored loss to qualify as an improvement.
        """
        self.patience = patience
        self.verbose = verbose
        self.counter = 0
        self.best_loss = None
        self.early_stop = False
        # `np.Inf` was removed in NumPy 2.0 and raises AttributeError there.
        # The builtin float infinity is the same value and needs no NumPy.
        # The attribute is not read by this class; it is kept for callers.
        self.val_loss_min = float("inf")
        self.delta = delta

    def __call__(self, val_loss):
        """Record one validation loss and update ``counter`` / ``early_stop``.

        :param val_loss: Validation loss of the epoch that just finished.
        """
        if self.best_loss is None:
            # First observation: nothing to compare against yet.
            self.best_loss = val_loss
        elif val_loss > self.best_loss - self.delta:
            # Not improved by at least `delta`: spend one unit of patience.
            self.counter += 1
            if self.verbose:
                print(f'EarlyStopping counter: {self.counter} out of {self.patience}')
            if self.counter >= self.patience:
                self.early_stop = True
        else:
            # Improvement: remember the new best and restore full patience.
            self.best_loss = val_loss
            self.counter = 0

In [None]:
"""Usage"""
# Skeleton of a training run with early stopping. `model`, `criterion` and
# `val_loader` must already be defined.
early_stopping = EarlyStopping(patience=10, verbose=True)

num_epochs = 100
for epoch in range(num_epochs):
    model.train()

    # Forward pass, backward pass, and update model parameters
    ...

    # Validation phase
    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for data, target in val_loader:
            output = model(data)
            val_loss += criterion(output, target).item()

    # One loss value was accumulated per batch, so average over the number of
    # batches. Dividing by len(val_loader.dataset) would shrink the value by
    # roughly the batch size (assuming `criterion` returns a per-batch mean,
    # the default reduction of the torch.nn losses).
    val_loss /= len(val_loader)
    print(f'Epoch {epoch+1}, Val Loss: {val_loss}')

    # Call early stopping
    early_stopping(val_loss)
    if early_stopping.early_stop:
        print("Early stopping triggered")
        break

## Model Training

#### Define models and settings

In [None]:
"""Define hyper-parameters"""
learning_rate = 1e-3  # step size handed to the optimizer below
batch_size = 64  # not used in this cell; presumably the DataLoader batch size — confirm
epochs = 5  # not used in this cell; presumably the number of training epochs — confirm

"""Load model"""
# `Model` is not defined anywhere in this notebook; substitute your own model class.
model = Model()

"""Initialize the loss function"""
loss_fn = nn.CrossEntropyLoss()

"""Select optimizer"""
# Plain SGD over all model parameters, using the learning rate set above.
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#### Training

In [None]:
"""Define training loop"""
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training pass over ``dataloader`` and print progress.

    :param dataloader: Iterable of ``(X, y)`` batches exposing a ``dataset``
        attribute whose length is the total sample count.
    :param model: Module to train; called as ``model(X)``.
    :param loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
    :param optimizer: Optimizer that updates the parameters of ``model``.
    """
    size = len(dataloader.dataset)
    # Set the model to training mode - important for batch normalization and dropout layers
    model.train()
    # Count samples as they are consumed instead of reading a module-level
    # `batch_size`, so the progress figure is correct for any loader.
    seen = 0
    for batch, (X, y) in enumerate(dataloader):
        # Compute prediction and loss
        pred = model(X)
        loss = loss_fn(pred, y)

        # Backpropagation; gradients are cleared after the step so the next
        # batch starts from zero.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        seen += len(X)
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f}  [{seen:>5d}/{size:>5d}]")

## Model Evaluation

In [None]:
"""Testing loop"""
def test_loop(dataloader, model, loss_fn):
    """Evaluate ``model`` on ``dataloader`` and report the mean per-batch loss.

    :param dataloader: Sized iterable of ``(X, y)`` batches.
    :param model: Module to evaluate; called as ``model(X)``.
    :param loss_fn: Callable ``loss_fn(pred, y)`` returning a scalar tensor.
    :return: Sum of the per-batch losses divided by the number of batches.
    :raises ZeroDivisionError: If ``dataloader`` yields no batches.
    """
    # Set the model to evaluation mode - important for batch normalization and dropout layers
    model.eval()
    num_batches = len(dataloader)
    test_loss = 0.0

    # No gradients are needed for evaluation; disabling them avoids
    # unnecessary computation and memory use.
    with torch.no_grad():
        for X, y in dataloader:
            test_loss += loss_fn(model(X), y).item()

    test_loss /= num_batches
    print(f"Test Avg loss: {test_loss:>8f} \n")
    return test_loss


# The `test_` prefix would make pytest collect this helper as a test case if
# it is imported into a test module; opt out explicitly.
test_loop.__test__ = False

## Save and Load Model

In [None]:
"""Save model with only weights"""
# Persist only the parameters/buffers, not the class definition.
torch.save(model.state_dict(), 'model_weights.pth')

"""Load model with only weights"""
# The architecture must be rebuilt in code before the weights are restored.
model = Model()
# weights_only=True limits unpickling to tensors and plain containers, so a
# tampered checkpoint cannot execute arbitrary code on load.
model.load_state_dict(torch.load('model_weights.pth', weights_only=True))
# Switch to evaluation mode before inference.
model.eval()

In [None]:
"""Save model with shapes"""
# Pickles the whole module object; loading it later requires the model class
# to be importable under the same path.
torch.save(model, 'model.pth')
# A fully pickled module cannot be restored with the weights-only loader that
# is the default since PyTorch 2.6, so opt out explicitly.
# SECURITY: weights_only=False runs the pickle machinery and can execute
# arbitrary code - only load files you created or fully trust.
model = torch.load('model.pth', weights_only=False)