# PyTorch Image Classification Tutorial for Beginners

This is a self-taught PyTorch image classification tutorial.

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import cv2, os

import albumentations as A 
from albumentations.pytorch import ToTensorV2

import random
import pandas as pd
import numpy as np # Data processing
import matplotlib.pyplot as plt # Data visualization
from tqdm import tqdm # Progress bar 

In [None]:
# Report the library versions in use (one per line) so a run can be reproduced.
print(torch.__version__, A.__version__, cv2.__version__, sep="\n")

In [None]:
# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device

## Step 1: Prepare and Explore Data

In [None]:
# Root folder of the training images; it holds one sub-folder per class.
root_dir = "/home/jacobian/Desktop/Master_degree_research/data/Layer2/data/train/"
sub_folders = ["GOOD", "BAD"]
labels = [0, 1]

# Collect one (path relative to root_dir, numeric label, class name) record
# for every PNG file found below each class folder.
data = []
for class_name, class_label in zip(sub_folders, labels):
    print(class_name, class_label)
    for dir_path, dir_names, file_names in os.walk(os.path.join(root_dir, class_name)):
        print(dir_path, dir_names, file_names)
        for file_name in file_names:
            # Test the extension itself: a substring test would also accept
            # names such as "image.png.bak".
            if file_name.endswith(".png"):
                # os.walk descends into nested folders, so build the relative
                # path from the folder the file was actually found in.
                relative_path = os.path.relpath(os.path.join(dir_path, file_name), root_dir)
                data.append((relative_path, class_label, class_name))

df = pd.DataFrame(data, columns=["file_name", "label", "classes"])

In [None]:
# Display the collected file table as the cell output.
df

In [None]:
import seaborn as sns

# Bar chart of how many rows df holds for each value of the "classes" column.
sns.countplot(data= df, x="classes", color='white', edgecolor='black', hatch=['.', '']);

It is always a good idea to plot a few samples

In [None]:
fig, ax = plt.subplots(2, 3, figsize=(10, 6))

# Show the first rows of df, one per grid cell. Enumerating the flattened axes
# advances the row index for every cell, so each cell shows a different image.
for idx, axis in enumerate(ax.flat):
    axis.axis("off")

    # Leave the remaining cells empty when df has fewer rows than the grid.
    if idx >= len(df):
        continue

    label = df.label.iloc[idx]
    file_path = os.path.join(root_dir, df.file_name.iloc[idx])

    # Read an image with OpenCV; imread returns None instead of raising when
    # the file is missing or cannot be decoded.
    image = cv2.imread(file_path)
    if image is None:
        raise FileNotFoundError(f"Could not read image file: {file_path}")

    # Convert the image from OpenCV's BGR order to RGB for matplotlib
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    # Resize image
    image = cv2.resize(image, (204, 204))

    axis.imshow(image)
    axis.set_title(f"Label: {label} ({'GOOD' if label == 0 else 'BAD'})")

plt.tight_layout()
plt.show()

Let's split the dataset into training and testing data

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows as test data; the fixed random_state makes the
# split repeatable.
train_df, test_df = train_test_split(df,
                                     test_size = 0.1,
                                     random_state=42)

## Step 2: Build a Baseline

1. A data pipeline for loading images
2. A **model** with a **loss function** and an **optimizer**
3. A training pipeline, including a cross-validation strategy


Deep learning involves a lot of experimentation, so we should make the code as modular as possible and drive it from a single configuration object for tuning.

In [None]:
# Configuration object: settings are attached to it as attributes further down
from types import SimpleNamespace

cfg = SimpleNamespace()

### Build a data pipeline for loading images

* **`Dataset`** class: Loads and preprocesses the dataset. You will need to customize this class for your purpose.
* **`DataLoader`** class: Loads batches of data samples and feeds them to the neural network.

In [None]:
def set_seed(seed: int = 1234):
    """Make random number generation repeatable for a given seed.

    Seeds Python's ``random`` module, NumPy's global generator and PyTorch
    (CPU plus the current and all CUDA devices), exports ``PYTHONHASHSEED``
    and puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed (int, optional): seed value applied to every generator. Default is 1234.
    """
    # cuDNN: request deterministic kernels and switch off the auto-tuner,
    # whose per-run algorithm choice can make results differ between runs.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Exported for processes started from here on; the hash seed of the
    # interpreter that is already running is fixed at start-up and is not
    # changed by this assignment.
    os.environ["PYTHONHASHSEED"] = str(seed)

    # Seed, in order: Python's random module, NumPy, PyTorch, the current
    # CUDA device and finally every CUDA device.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seeder in seeders:
        seeder(seed)



In [None]:
# Dataset settings read by CustomDataset: the image root folder and the
# side length that the default transform resizes images to.
cfg.root_dir = "/home/jacobian/Desktop/Master_degree_research/data/Layer2/data/train/"
cfg.image_size = 256

class CustomDataset(Dataset):
    """Image dataset backed by a DataFrame of relative file names and labels.

    Each item is read with OpenCV, converted to RGB and passed through an
    albumentations-style transform.

    Args:
        cfg: Configuration object; ``cfg.root_dir`` is the folder the file
            names are relative to and ``cfg.image_size`` is the side length
            used by the default transform.
        df: DataFrame with a ``file_name`` and a ``label`` column.
        transform: Callable invoked as ``transform(image=...)`` that returns a
            mapping with an ``"image"`` entry. When ``None``, images are
            resized to ``cfg.image_size`` x ``cfg.image_size`` and converted
            with ``ToTensorV2``.
        mode: Accepted for interface compatibility; currently unused.
    """

    def __init__(self, 
                 cfg,
                 df, transform=None,
                 mode = "val") -> None:
        super().__init__()
        self.root_dir = cfg.root_dir
        self.df = df
        self.file_name = df["file_name"].values
        self.labels = df["label"].values

        # Compare against None rather than truth-testing: a transform object
        # that defines __len__ could be falsy and be silently replaced.
        if transform is not None:
            self.transform = transform
        else:
            self.transform = A.Compose([
                A.Resize(cfg.image_size, cfg.image_size),
                ToTensorV2()
            ])

    def __len__(self):
        """Return the number of rows in the backing DataFrame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Load one sample and return it as ``(image, label)``.

        Raises:
            FileNotFoundError: If OpenCV cannot read the image file.
        """
        # Get file_path and label for index
        label = self.labels[idx]
        file_path = os.path.join(self.root_dir, self.file_name[idx])

        # Read image with OpenCV; imread returns None instead of raising when
        # the file is missing or cannot be decoded.
        image = cv2.imread(file_path)
        if image is None:
            raise FileNotFoundError(f"Could not read image file: {file_path}")

        # Convert the image to RGB color space.
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        # Apply augmentations
        augmented = self.transform(image=image)
        image = augmented["image"]

        # NOTE(review): no pixel normalization is applied here; with the
        # default transform the values presumably stay in the 0-255 range.
        # Confirm this is what the model expects.
        return image, label
    

Use a `DataLoader` to load batches of images into the neural network. It needs:
1. The instance of the `Dataset` you want to iterate over
2. The batch size (`cfg.batch_size`)
3. Other options, such as whether to shuffle the data

In [None]:
# Number of samples per batch, shared by every DataLoader in this notebook.
cfg.batch_size = 32

# Example dataset over the whole DataFrame; no transform is passed, so the
# dataset's default transform is used.
example_dataset = CustomDataset(cfg, df)

# num_workers=0 loads the data in the main process.
example_dataloader = DataLoader(example_dataset,
                                batch_size=cfg.batch_size,
                                shuffle=True,
                                num_workers=0)

Keep the batch size fixed throughout training. Use the largest batch size that fits in memory: start at 32 and double it (64, 128, ...) until you hit an out-of-memory error, then use the last size that worked.

In [None]:
# Fetch only the first batch to check the tensor shapes, then stop iterating.
for (image_batch, label_batch) in example_dataloader:
    print(image_batch.shape)
    print(label_batch.shape)
    break

The `DataLoader` returns an **image batch** and a **label batch**:
- `image_batch` is a tensor of shape (`32, 3, 256, 256`), i.e. (`batch_size, channels, image_height, image_width`)
- `label_batch` is a tensor of shape (`32`)

![Image](https://miro.medium.com/v2/resize:fit:1100/format:webp/1*AqNNI3sEFpv_fMbD31JfNw.png)


Let's randomly partition the training data into a training set and a validation set.

Then create a `Dataset` and a `DataLoader` for each of the training and validation data.

![Image](https://miro.medium.com/v2/resize:fit:720/format:webp/1*aV6AXit9w9SIOSDoKjpZ_Q.png)

In [None]:
# Split only the training portion: splitting the full df again would put the
# held-out test_df rows back into the training and validation sets.
# NOTE: this rebinds train_df, so re-running this cell without first
# re-running the test split shrinks the training set again.
X = train_df
y = train_df.label

# y_test keeps its original name; it holds the validation labels.
train_df, valid_df, y_train, y_test = train_test_split(X,
                                                       y,
                                                       test_size=0.2,
                                                       random_state=42)

In [None]:
# Both datasets are built without a transform argument, so they use
# CustomDataset's default transform.
train_dataset = CustomDataset(cfg, train_df)
valid_dataset = CustomDataset(cfg, valid_df)

# Training batches are drawn in shuffled order.
train_dataloader = DataLoader(train_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=True)

# Validation keeps the dataset order.
valid_dataloader = DataLoader(valid_dataset,
                              batch_size=cfg.batch_size,
                              shuffle=False)

### Prepare the model

[`timm`](https://timm.fast.ai/) is a deep learning library containing a collection of state-of-the-art computer vision models.

`timm` is similar to `torchvision.models`, but the original author suggests that `timm` makes it easier to switch backbones during experimentation.

In [None]:
import timm

# Number of output classes (GOOD / BAD) and the timm architecture name.
cfg.n_classes = 2
cfg.backbone = "resnet18"

# Build the backbone with pretrained weights and cfg.n_classes outputs.
model = timm.create_model(cfg.backbone,
                          pretrained=True,
                          num_classes = cfg.n_classes)

In [None]:
# Show the timm model names that match the 'resnet*' pattern.
timm.list_models('resnet*')

In [None]:
# Smoke test: push one random batch of the configured shape through the model.
# NOTE: this rebinds the notebook-level names `x` and `y`.
x = torch.randn(cfg.batch_size, 3, cfg.image_size, cfg.image_size)
y = model(x)
print(type(y))

### Prepare loss function and optimizer

* a loss function (criterion)
* an optimization algorithm (optimizer)
* optionally a learning rate scheduler


**Loss function** - Common loss function

| Loss Function | Explanation |
|---|---|
| Binary Cross-Entropy (BCE) Loss | Used for binary classification tasks. This loss function compares the model's predictions with the true values. For a single sample, it is defined as `-y_true * log(y_pred) - (1 - y_true) * log(1 - y_pred)`, where `y_true` is the true binary label (0 or 1), and `y_pred` is the predicted probability from the model. The BCE loss for a set of samples is the average of the BCE losses for each individual sample. It penalizes the model heavily when it makes confident and wrong predictions. |
| Categorical Cross-Entropy Loss | Used for multi-class classification tasks. This loss function is a generalization of the binary cross-entropy loss. It measures the dissimilarity between the predicted probability distribution and the true distribution. It is defined as `-sum(y_true * log(y_pred))` for each class, where `y_true` is the true label in one-hot encoded format, and `y_pred` is the predicted probability distribution from the model. The categorical cross-entropy loss for a set of samples is the average of the losses for each individual sample. It is suitable when each sample belongs to exactly one class. |
| Mean Squared Error (MSE) Loss | Used for regression tasks. It measures the average squared difference between the predicted and actual values. For a set of samples, it is defined as `(1/n) * sum((y_true - y_pred)^2)`, where `y_true` is the true value, `y_pred` is the predicted value from the model, and `n` is the number of samples. MSE loss penalizes the model more heavily for larger errors due to the squaring operation. |


In [None]:
# Use the categorical cross-entropy loss; the training and validation loops
# read it through this module-level name.

criterion = nn.CrossEntropyLoss()

**Optimizer** — The optimization algorithm minimizes the loss function (in our case, the cross-entropy loss). There are many different optimizers available. Let’s use a popular one: `Adam`.

In [None]:
# Initial learning rate handed to the optimizer.
cfg.learning_rate = 1e-4

# Adam over all model parameters, with weight decay disabled.
optimizer = torch.optim.Adam(
    params=model.parameters(),
    lr = cfg.learning_rate,
    weight_decay=0
)

[**Learning rate scheduler**](https://towardsdatascience.com/a-visual-guide-to-learning-rate-schedulers-in-pytorch-24bbb262c863)

There are many different learning rate schedulers available, but Kaggle Grandmasters recommend using cosine decay as a learning rate scheduler for fine-tuning

In [None]:
# Lowest learning rate the cosine schedule decays to, and the number of epochs.
cfg.lr_min = 1e-5
cfg.epochs = 5

# The scheduler is stepped once per batch, so the cosine half-period has to
# span every optimizer step of the run. len(train_dataloader) is the number of
# batches per epoch as counted by the loader itself. It equals
# ceil(len(dataset) / batch_size) when drop_last is False, and it gives the
# integer that T_max expects rather than a numpy float.
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
    optimizer=optimizer,
    T_max=len(train_dataloader) * cfg.epochs,
    eta_min=cfg.lr_min
)

`T_max` defines the half period and should be equal to the maximum number of iterations 
(`np.ceil(len(train_dataloader.dataset) /cfg.batch_size)*cfg.epochs`)

Result of learning rate will look as follows over training run

![Image](https://miro.medium.com/v2/resize:fit:700/1*igE55NzyCh3Z6MI587jYeg.jpeg)

### Metric
A metric for evaluating the model's overall performance.

I will use accuracy as the metric.

>A metric measures the model's performance after training.

>A loss function is used to optimize the model's parameters during training.

In [None]:
from sklearn.metrics import accuracy_score

def calculate_metric(y, y_pred):
    """Score predictions ``y_pred`` against targets ``y`` using accuracy."""
    return accuracy_score(y, y_pred)

### Set up a training pipeline

A model is typically trained iteratively; one full pass over the training data is called an epoch.

Training from scratch usually requires many epochs, while fine-tuning requires only a few (roughly 5 to 10) epochs.

1. `train_one_epoch()` trains the model on the full training data for one epoch
2. `validate_one_epoch()` validates the model for one epoch


In [None]:
# Device the training/validation loops move the model and batches to:
# the GPU when CUDA is available, otherwise the CPU.
cfg.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
def train_one_epoch(dataloader: torch.utils.data.DataLoader,
                   model: torch.nn.Module, 
                   optimizer: torch.optim.Optimizer, 
                   scheduler: torch.optim.lr_scheduler, 
                   cfg):
    '''
    Train the model for one pass over ``dataloader``.

    The loss is computed with the module-level ``criterion`` and the score
    with ``calculate_metric``. The scheduler is stepped once per batch.

    Parameters:
    dataloader (torch.utils.data.DataLoader): The data loader yielding (images, labels) batches.
    model (torch.nn.Module): The model to be trained.
    optimizer (torch.optim.Optimizer): The optimizer.
    scheduler (torch.optim.lr_scheduler): The learning rate scheduler.
    cfg: The configuration with training details; ``cfg.device`` is read.

    Returns:
    metric (float): The calculated metric value.
    loss (float): The average loss value over the epoch.

    Raises:
    ValueError: If the dataloader yields no batches.
    '''

    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(cfg.device)

    # Set the model to training mode.
    model.train()

    # Targets, predicted class indices and per-batch losses for the epoch.
    final_y = []
    final_y_pred = []
    final_loss = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Move the batch tensors to the same device as the model.
        X = batch[0].float().to(cfg.device)
        # Cast labels to long, as validate_one_epoch does: cross-entropy
        # with class-index targets expects an integer (long) tensor.
        y = batch[1].long().to(cfg.device)

        # Reset the gradients of the model parameters
        optimizer.zero_grad()

        # Enable gradient calculations even if the caller disabled them.
        with torch.set_grad_enabled(True):
            # Forward pass and loss for this batch.
            y_pred = model(X)
            loss = criterion(y_pred, y)

            # Back-propagate the error and update the model parameters.
            loss.backward()
            optimizer.step()

        # Update the learning rate
        scheduler.step()

        # Keep only what the metric needs. Taking the argmax on the tensor
        # avoids converting every logit into nested Python lists.
        final_y.extend(y.detach().cpu().tolist())
        final_y_pred.extend(y_pred.detach().argmax(dim=1).cpu().tolist())
        final_loss.append(loss.item())

    if not final_loss:
        raise ValueError("dataloader produced no batches")

    # Compute the average loss over the epoch.
    loss = np.mean(final_loss)

    # Calculate the metric (e.g., accuracy) based on targets and predictions.
    metric = calculate_metric(final_y, final_y_pred)

    return metric, loss

In [None]:
def validate_one_epoch(dataloader: torch.utils.data.DataLoader, 
                      model: torch.nn.Module, 
                      cfg):
    '''
    Evaluate the model for one pass over ``dataloader`` without updating it.

    The loss is computed with the module-level ``criterion`` and the score
    with ``calculate_metric``.

    Parameters:
    dataloader (torch.utils.data.DataLoader): The data loader yielding (images, labels) batches.
    model (torch.nn.Module): The model to be validated.
    cfg: The configuration with validation details; ``cfg.device`` is read.

    Returns:
    metric (float): The calculated metric value.
    loss (float): The average loss value over the epoch.

    Raises:
    ValueError: If the dataloader yields no batches.
    '''

    # Moving the model is loop-invariant, so do it once instead of per batch.
    model = model.to(cfg.device)

    # Set the model to evaluation mode. In this mode, layers like Dropout and BatchNorm 
    # will behave differently than in training mode.
    model.eval()

    # Targets, predicted class indices and per-batch losses for the epoch.
    final_y = []
    final_y_pred = []
    final_loss = []

    for batch in tqdm(dataloader, total=len(dataloader)):
        # Move the batch tensors to the same device as the model.
        X = batch[0].to(cfg.device).float()
        y = batch[1].to(cfg.device).long()

        # No gradients are needed for evaluation.
        with torch.no_grad():
            # Forward pass and loss for this batch.
            y_pred = model(X)
            loss = criterion(y_pred, y)

        # Keep only what the metric needs. Taking the argmax on the tensor
        # avoids converting every logit into nested Python lists.
        final_y.extend(y.cpu().tolist())
        final_y_pred.extend(y_pred.argmax(dim=1).cpu().tolist())
        final_loss.append(loss.item())

    if not final_loss:
        raise ValueError("dataloader produced no batches")

    # Compute the average loss over the epoch.
    loss = np.mean(final_loss)

    # Calculate the metric (e.g., accuracy) based on targets and predictions.
    metric = calculate_metric(final_y, final_y_pred)

    return metric, loss

* **Training Mode**: This is set by calling `model.train()`. In this mode, layers like Dropout and BatchNorm have an effect on the model's output. For Dropout, this means that during training, certain neurons will be randomly "dropped out", or turned off, which helps prevent overfitting. For BatchNorm, this means that the layer will normalize its inputs using both the current batch's mean and variance, as well as keeping track of running estimates of these statistics for use in testing.

* **Evaluation Mode**: This is set by calling `model.eval()`. In this mode, the Dropout layers will not "drop out" any neurons, and the BatchNorm layers will use the running estimates of mean and variance collected during training rather than the batch's mean and variance. This is because we want to use the full learned capabilities of the model for evaluation and testing, and we don't want the model's output to vary from one run to another.

### Adding data augmentation

If the training and validation metrics are significantly different, this indicates that the model is over-fitting to the training data. Over-fitting occurs when a model is trained on only a few examples and learns irrelevant details or noise from the training data.

You can use data augmentation to reduce over-fitting: it generates additional training data by randomly transforming existing images, which helps the model generalize better.

In [None]:
# Light augmentation pipeline: resize, random rotation within [-10, 10]
# degrees (p=0.6), random horizontal flip (p=0.6), then tensor conversion.
# NOTE(review): none of the datasets in the visible cells is built with this
# transform; pass it as CustomDataset(..., transform=transform_soft) to use it.
transform_soft = A.Compose([A.Resize(cfg.image_size, cfg.image_size),
                           A.Rotate(p=0.6, limit=[-10,10]),
                           A.HorizontalFlip(p=0.6),
                           ToTensorV2()])

The `p` parameter controls the probability of applying the augmentation.

In [None]:
from typing import Optional

# Base seed; fit() re-seeds with cfg.seed + epoch at the start of each epoch.
cfg.seed = 42

def fit(model: torch.nn.Module, 
        optimizer: torch.optim.Optimizer, 
        cfg,
        scheduler: torch.optim.lr_scheduler,
        train_dataloader: torch.utils.data.DataLoader, 
        valid_dataloader: Optional[torch.utils.data.DataLoader] =None):

    '''
    Run the training loop for ``cfg.epochs`` epochs, validating after each one when possible.

    Parameters:
    model (torch.nn.Module): The model to be trained.
    optimizer (torch.optim.Optimizer): The optimizer.
    cfg: The configuration with training details; ``cfg.epochs`` and ``cfg.seed`` are read here.
    scheduler (torch.optim.lr_scheduler): The learning rate scheduler.
    train_dataloader (torch.utils.data.DataLoader): The data loader for training.
    valid_dataloader (Optional[torch.utils.data.DataLoader]): The data loader for validation. If it is falsy (e.g. None), no validation is performed.

    Returns:
    acc_list (List[float]): The list of accuracy values for each training epoch.
    loss_list (List[float]): The list of loss values for each training epoch.
    val_acc_list (List[float]): The list of accuracy values for each validation epoch.
    val_loss_list (List[float]): The list of loss values for each validation epoch.
    model (torch.nn.Module): The trained model.
    '''

    # Per-epoch history for training and validation.
    acc_list, loss_list = [], []
    val_acc_list, val_loss_list = [], []

    for epoch in range(cfg.epochs):
        print(f"Epoch {epoch + 1}/{cfg.epochs}")

        # Re-seed with a different value each epoch.
        set_seed(cfg.seed + epoch)

        # One pass over the training data.
        acc, loss = train_one_epoch(train_dataloader,
                                   model,
                                   optimizer=optimizer,
                                   scheduler=scheduler,
                                   cfg=cfg)

        # Validation runs before any metrics are printed; None means skipped.
        val_result = validate_one_epoch(valid_dataloader, model, cfg) if valid_dataloader else None

        # Report and record the training metrics.
        print(f"Loss: {loss:.4f} Acc: {acc:.4f}")
        acc_list.append(acc)
        loss_list.append(loss)

        if val_result is None:
            continue

        # Report and record the validation metrics.
        val_acc, val_loss = val_result
        print(f"Val Loss: {val_loss:.4f} Val Acc: {val_acc:.4f}")
        val_acc_list.append(val_acc)
        val_loss_list.append(val_loss)

    # History lists for training and validation, plus the trained model.
    return acc_list, loss_list, val_acc_list, val_loss_list, model

In [None]:
# Run the full training loop; result is the tuple
# (acc_list, loss_list, val_acc_list, val_loss_list, model) returned by fit().
result = fit(model=model,
    optimizer=optimizer,
    cfg=cfg,
    scheduler=scheduler,
    train_dataloader=train_dataloader,
    valid_dataloader=valid_dataloader)