In [1]:
import torch
import torch.nn as nn
import torchvision
from torchvision.transforms import transforms
import torch.nn.functional as F
import matplotlib.pyplot as plt
import pytorch_lightning as pl
from pytorch_lightning import Trainer

In [2]:
# !pip install lightning[extra] tensorboard

In [3]:
### Hyper-parameters
input_size = 28 * 28    ## flattened MNIST image: 28 x 28 pixels = 784 features
hidden_size = 500       ## units in the single hidden layer
num_classes = 10        ## one output logit per digit class
num_epochs = 2          ## passed to the Trainer as max_epochs
batch_size = 100        ## samples per mini-batch in both data loaders
learning_rate = 1e-3    ## step size handed to the Adam optimizer

In [4]:
# Trade float32 matmul precision for speed: 'medium' allows PyTorch to use
# faster, lower-precision internal formats (Tensor Cores on supporting GPUs).
# This is a global setting, so it also applies to every cell executed later.
torch.set_float32_matmul_precision('medium')

### Fully connected neural network with one hidden layer

class LitNeuralNet(pl.LightningModule):
    """Fully connected MNIST classifier with a single hidden layer.

    The module owns its data: ``train_dataloader`` and ``val_dataloader``
    build the MNIST loaders, so ``Trainer.fit(model)`` needs no extra
    arguments. Batch size and learning rate are read from the notebook-level
    ``batch_size`` and ``learning_rate`` variables.

    Args:
        input_size: Number of features per flattened image.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

        # Per-batch validation losses; averaged and cleared in
        # on_validation_epoch_end.
        self.val_losses = []

    def forward(self, x):
        """Return the raw class logits for a batch of flattened images."""
        out = self.l1(x)
        out = self.relu(out)
        return self.l2(out)

    def _mnist_loader(self, train):
        """Build the MNIST DataLoader for the train (True) or test (False) split."""
        dataset = torchvision.datasets.MNIST(
            root='./mnist_folder',
            train=train,
            transform=transforms.ToTensor(),
            # Always allow the download: it is skipped when the files already
            # exist, and the validation loader may be requested before the
            # training loader (e.g. for the sanity check) on a fresh machine.
            download=True,
        )
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            num_workers=2,
            persistent_workers=True,
            shuffle=train,  # only the training split is shuffled
        )

    def _batch_loss(self, batch):
        """Flatten the images of ``batch`` and return the cross-entropy loss."""
        images, labels = batch
        # Images arrive as [batch, 1, 28, 28]; the linear layer expects
        # [batch, 784], so collapse everything after the batch dimension.
        images = images.reshape(images.size(0), -1)
        return F.cross_entropy(self(images), labels)

    def train_dataloader(self):
        """Return the shuffled DataLoader over the MNIST training split."""
        return self._mnist_loader(train=True)

    def val_dataloader(self):
        """Return the unshuffled DataLoader over the MNIST test split."""
        return self._mnist_loader(train=False)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        # The legacy ``'log'`` key in the returned dict is ignored by current
        # Lightning releases; metrics must be sent through ``self.log``.
        self.log('train_loss', loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and remember it."""
        loss = self._batch_loss(batch)
        # Stored so on_validation_epoch_end can average over the epoch.
        self.val_losses.append(loss)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        """Log the mean of the stored per-batch losses as ``avg_val_loss``."""
        if not self.val_losses:
            # torch.stack raises on an empty list; nothing to report.
            return

        avg_loss = torch.stack(self.val_losses).mean()
        self.log('avg_val_loss', avg_loss, prog_bar=True)

        # Start the next validation epoch with an empty buffer.
        self.val_losses.clear()

    def configure_optimizers(self):
        """Return an Adam optimizer over all model parameters."""
        return torch.optim.Adam(self.parameters(), lr=learning_rate)

In [5]:
if __name__ == '__main__':
    model = LitNeuralNet(input_size, hidden_size, num_classes)

    # accelerator='auto' selects the GPU when one is available and falls back
    # to the CPU otherwise, so this cell also runs on machines without CUDA.
    # Useful while debugging:
    #   fast_dev_run=True         -> runs a single batch through training and validation
    #   limit_train_batches=0.1   -> train on only 10% of the data
    trainer = Trainer(max_epochs=num_epochs, devices=1, accelerator='auto', logger=True)
    trainer.fit(model)

    # Advanced Trainer features (current argument names):
    #   devices=8, accelerator='gpu' -> train on 8 GPUs
    #   strategy='ddp'               -> DistributedDataParallel: data parallelism at the
    #                                   module level, can run across multiple machines
    #   precision='16-mixed'         -> 16-bit mixed precision
    #   accelerator='tpu'            -> TPU support
    #   deterministic=True           -> makes training reproducible
    #   gradient_clip_val            -> gradient clipping threshold (off by default)
    # Learning-rate finding and device statistics logging are provided by the
    # Tuner utility and the DeviceStatsMonitor callback respectively.


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params | Mode 
----------------------------------------
0 | l1   | Linear | 392 K  | train
1 | relu | ReLU   | 0      | train
2 | l2   | Linear | 5.0 K  | train
----------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

`Trainer.fit` stopped: `max_epochs=2` reached.


# Model with EarlyStopping, Learning Rate Scheduler, Model Checkpoint, and Callbacks

In [6]:
from torch.optim.lr_scheduler import ReduceLROnPlateau, StepLR
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor, Callback

# Set precision to leverage Tensor Cores for improved performance.
# Repeats the call already made in an earlier cell; the setting is global,
# so this is redundant but harmless.
torch.set_float32_matmul_precision('medium')

### Fully connected neural network with one hidden layer
class LitNeuralNet2(pl.LightningModule):
    """Fully connected MNIST classifier with a StepLR learning-rate schedule.

    Same single-hidden-layer network and MNIST loaders as the first model in
    this notebook; the difference is ``configure_optimizers``, which pairs
    Adam with a per-epoch ``StepLR`` decay. Batch size and learning rate are
    read from the notebook-level ``batch_size`` and ``learning_rate``
    variables.

    Args:
        input_size: Number of features per flattened image.
        hidden_size: Width of the hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.l1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu = nn.ReLU()
        self.l2 = nn.Linear(in_features=hidden_size, out_features=num_classes)

        # Per-batch validation losses; averaged and cleared in
        # on_validation_epoch_end.
        self.val_losses = []

    def forward(self, x):
        """Return the raw class logits for a batch of flattened images."""
        out = self.l1(x)
        out = self.relu(out)
        return self.l2(out)

    def _mnist_loader(self, train):
        """Build the MNIST DataLoader for the train (True) or test (False) split."""
        dataset = torchvision.datasets.MNIST(
            root='./mnist_folder',
            train=train,
            transform=transforms.ToTensor(),
            # Always allow the download: it is skipped when the files already
            # exist, and the validation loader may be requested before the
            # training loader (e.g. for the sanity check) on a fresh machine.
            download=True,
        )
        return torch.utils.data.DataLoader(
            dataset=dataset,
            batch_size=batch_size,
            num_workers=2,
            persistent_workers=True,
            shuffle=train,  # only the training split is shuffled
        )

    def _batch_loss(self, batch):
        """Flatten the images of ``batch`` and return the cross-entropy loss."""
        images, labels = batch
        # Images arrive as [batch, 1, 28, 28]; the linear layer expects
        # [batch, 784], so collapse everything after the batch dimension.
        images = images.reshape(images.size(0), -1)
        return F.cross_entropy(self(images), labels)

    def train_dataloader(self):
        """Return the shuffled DataLoader over the MNIST training split."""
        return self._mnist_loader(train=True)

    def val_dataloader(self):
        """Return the unshuffled DataLoader over the MNIST test split."""
        return self._mnist_loader(train=False)

    def training_step(self, batch, batch_idx):
        """Compute and log the training loss for one batch."""
        loss = self._batch_loss(batch)
        # The legacy ``'log'`` key in the returned dict is ignored by current
        # Lightning releases; metrics must be sent through ``self.log``.
        self.log('train_loss', loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Compute the validation loss for one batch and remember it."""
        loss = self._batch_loss(batch)
        # Stored so on_validation_epoch_end can average over the epoch.
        self.val_losses.append(loss)
        return {"val_loss": loss}

    def on_validation_epoch_end(self):
        """Log the mean of the stored per-batch losses as ``avg_val_loss``."""
        if not self.val_losses:
            # torch.stack raises on an empty list; nothing to report.
            return

        avg_loss = torch.stack(self.val_losses).mean()
        self.log('avg_val_loss', avg_loss, prog_bar=True)

        # Start the next validation epoch with an empty buffer.
        self.val_losses.clear()

    def configure_optimizers(self):
        """Return Adam plus a StepLR schedule that decays the LR every epoch."""
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)

        # Multiply the learning rate by 0.7 after every epoch.
        # A ReduceLROnPlateau scheduler could be used here instead; Lightning
        # would then also need a "monitor" entry (e.g. 'avg_val_loss') in the
        # lr_scheduler config.
        scheduler = StepLR(optimizer, step_size=1, gamma=0.7)

        return {
            "optimizer": optimizer,
            "lr_scheduler": {"scheduler": scheduler, "interval": "epoch"},
        }


# Instantiate the scheduler-enabled model from the notebook-level hyper-parameters.
model2 = LitNeuralNet2(input_size, hidden_size, num_classes)

In [7]:
# Custom callback example
class PrintCallback(Callback):
    """Minimal custom callback that prints a line when training starts and ends."""

    # Console messages emitted by the two hooks below.
    _START_MESSAGE = "Training is starting..."
    _END_MESSAGE = "Training is ending..."

    def on_train_start(self, trainer, pl_module):
        """Print the start-of-training message."""
        print(self._START_MESSAGE)

    def on_train_end(self, trainer, pl_module):
        """Print the end-of-training message."""
        print(self._END_MESSAGE)

In [8]:
## Define callbacks
# Stop training once 'avg_val_loss' has failed to improve for `patience`
# consecutive validation runs. NOTE: with patience=3 this cannot trigger
# unless num_epochs is raised above its current value of 2.
early_stopping_callback = EarlyStopping(
    monitor='avg_val_loss',
    patience=3,
    verbose=True,
    mode='min'
)

## define model checkpoint
# Keep the three checkpoints with the lowest 'avg_val_loss'.
checkpoint_callback = ModelCheckpoint(
    monitor='avg_val_loss',
    dirpath='my_model_checkpoints',
    filename='mnist-{epoch:02d}-{avg_val_loss:.2f}',
    save_top_k=3,
    mode='min'
)

# Record the learning rate once per epoch so the StepLR decay is visible in the logs.
lr_monitor = LearningRateMonitor(logging_interval='epoch')

print_callback = PrintCallback()

## Initialize the Trainer with callbacks
# accelerator='auto' selects the GPU when one is available and falls back to
# the CPU otherwise, so this cell also runs on machines without CUDA.
trainer = pl.Trainer(
    max_epochs=num_epochs,
    devices=1,
    accelerator='auto',
    logger=True,
    callbacks=[early_stopping_callback, checkpoint_callback, lr_monitor, print_callback]
)

## Start training
trainer.fit(model2)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs
c:\Users\jites\anaconda3\envs\torchenv\lib\site-packages\pytorch_lightning\callbacks\model_checkpoint.py:652: Checkpoint directory C:\Users\jites\Desktop\Project_folder\Jupyter_practice\pytorch_practice\my_model_checkpoints exists and is not empty.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name | Type   | Params | Mode 
----------------------------------------
0 | l1   | Linear | 392 K  | train
1 | relu | ReLU   | 0      | train
2 | l2   | Linear | 5.0 K  | train
----------------------------------------
397 K     Trainable params
0         Non-trainable params
397 K     Total params
1.590     Total estimated model params size (MB)


Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training is starting...


Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Metric avg_val_loss improved. New best score: 0.161


Validation: |          | 0/? [00:00<?, ?it/s]

Metric avg_val_loss improved by 0.058 >= min_delta = 0.0. New best score: 0.103
`Trainer.fit` stopped: `max_epochs=2` reached.


Training is ending...
