# Pytorch Lightning, Callbacks 

Objectives:
1. How to use pytorch lightning
2. What is callbacks and how to use it for training?
3. An end-to-end ML model for classification

In [1]:
import pytorch_lightning
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from torchvision import transforms
import pandas as pd
import os
import torch.nn as nn
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping

pytorch_lightning.__version__

'2.5.0.post0'

# Hyperparameters

In [2]:
BATCH_SIZE = 32
LR = 0.001

In [3]:
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [4]:
data = pd.read_csv('../Datasets/digit_train.csv')
data.head()

Unnamed: 0,label,pixel0,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,...,pixel774,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# Data Preparation

In [5]:
class DigitDataset(Dataset):
    def __init__(self, file_path, transform):
        self.data = pd.read_csv(file_path)
        self.transform = transform
        
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        example = self.data.iloc[idx]
        pixels = example.values[1:].astype('float32')
        label = int(example.values[0])
        
        pixels = torch.tensor(pixels).reshape(28, 28).unsqueeze(0)
        label = torch.tensor(label)

        if self.transform:
            pixels = self.transform(pixels)
        
        return pixels, label

In [6]:
pixel_transformation = transforms.Compose([
    transforms.Normalize(
        mean=torch.Tensor([0.1307]), 
        std=torch.Tensor([0.3081])
    )
])

In [7]:
dataset = DigitDataset(
    file_path='../Datasets/digit_train.csv',
    transform=pixel_transformation
)

In [8]:
train_size = int(0.7 * len(dataset))
val_size = int(0.15 * len(dataset))
test_size = len(dataset) - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(
    dataset=dataset, 
    lengths=[train_size, val_size, test_size]
)

In [9]:
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
for pixels, labels in train_loader:
    print(pixels.shape)
    print(labels.shape)
    break

torch.Size([32, 1, 28, 28])
torch.Size([32])


# Model Build

In [11]:
class DigitClassifier(pl.LightningModule):
    def __init__(self):
        super(DigitClassifier, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 10)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = self.fc1(x)
        x = torch.relu(x)
        x = self.fc2(x)
        x = torch.relu(x)
        x = self.fc3(x)
        return x
    
    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=LR)
        return optimizer
    
    def training_step(self, batch, batch_idx):
        """
        Suppose you have 320 examples and batch size if 32
        pytorch shuffle the examples randomly
        and prepare 320 / 32 = 10 batches
        After that torch provides the batch randomly to the training_step
        """
        data, target = batch
        output = self(data) # Calls the forward method
        loss = nn.CrossEntropyLoss()(output, target)
        self.log('loss', loss)
        return loss
    
    def validation_step(self, batch, batch_idx):
        data, target = batch
        output = self(data) # Calls the forward method
        val_loss = nn.CrossEntropyLoss()(output, target)
        """
        val_acc produces a binary mask for each example in the batch
        """
        val_acc = (output.argmax(dim=1) == target).float().mean()
        self.log('val_loss', val_loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)
        
    def test_step(self, batch, batch_idx):
        data, target = batch
        output = self(data) # Calls the forward method
        test_loss = nn.CrossEntropyLoss()(output, target)
        """
        val_acc produces a binary mask for each example in the batch
        """
        test_acc = (output.argmax(dim=1) == target).float().mean()
        self.log('test_loss', test_loss, prog_bar=True)
        self.log('test_acc', test_acc, prog_bar=True)

In [12]:
model = DigitClassifier()

# Train Model

## Understand callback
 What is callback?
- Any function that is executed after each step/epoch to control the training is a callback
- For example,
  - early stopping -> stops the training if validation loss doesn't improve over time'
  - checkpointing -> saves the model checkpoint

In [13]:
checkpoint_path = os.path.join(
    os.getcwd(), "../../Model/", "best_model.pth"
)

checkpoint_callback = ModelCheckpoint(
    monitor='val_acc',
    save_top_k=1,
    mode='max',
)

early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    verbose=True,
)

In [14]:
trainer = pl.Trainer(
    max_epochs=3,
    callbacks=[checkpoint_callback, early_stopping],
)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\logger_connector\logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `pytorch_lightning` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default


In [15]:
trainer.fit(
    model=model,
    train_dataloaders=train_loader,
    val_dataloaders=val_loader
)


  | Name | Type   | Params | Mode 
----------------------------------------
0 | fc1  | Linear | 100 K  | train
1 | fc2  | Linear | 8.3 K  | train
2 | fc3  | Linear | 650    | train
----------------------------------------
109 K     Trainable params
0         Non-trainable params
109 K     Total params
0.438     Total estimated model params size (MB)
3         Modules in train mode
0         Modules in eval mode


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'val_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'train_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved. New best score: 0.312


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.092 >= min_delta = 0.0. New best score: 0.220


Validation: |          | 0/? [00:00<?, ?it/s]

Metric val_loss improved by 0.013 >= min_delta = 0.0. New best score: 0.207
`Trainer.fit` stopped: `max_epochs=3` reached.


In [16]:
trainer.test(
    model=model,
    dataloaders=test_loader,
)

c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.24105729162693024, 'test_acc': 0.9446119666099548}]

In [17]:
best_model_path = checkpoint_callback.best_model_path
best_model = DigitClassifier.load_from_checkpoint(best_model_path)

trainer.test(
    best_model,
    dataloaders=test_loader,
)

Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.24105729162693024, 'test_acc': 0.9446119666099548}]

In [22]:
saved_model_path ='../Model/digit_classifier_model_release.pth'
torch.save(best_model, saved_model_path)

In [23]:
production_model = torch.load(saved_model_path)
trainer.test(production_model, dataloaders=test_loader)

  production_model = torch.load(saved_model_path)
c:\Users\ADMIN\AppData\Local\Programs\Python\Python312\Lib\site-packages\pytorch_lightning\trainer\connectors\data_connector.py:425: The 'test_dataloader' does not have many workers which may be a bottleneck. Consider increasing the value of the `num_workers` argument` to `num_workers=15` in the `DataLoader` to improve performance.


Testing: |          | 0/? [00:00<?, ?it/s]

[{'test_loss': 0.24105729162693024, 'test_acc': 0.9446119666099548}]