In [6]:
import lightning.pytorch as pl
import os
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.tensorboard import SummaryWriter, FileWriter
from torchvision.models.segmentation.deeplabv3 import deeplabv3_mobilenet_v3_large, DeepLabV3_MobileNet_V3_Large_Weights
from torchvision.models.segmentation.deeplabv3 import deeplabv3_resnet50, DeepLabV3_ResNet50_Weights
from torchvision.models.segmentation.deeplabv3 import deeplabv3_resnet101, DeepLabV3_ResNet101_Weights
from torch.optim import Adam, SGD, LBFGS, Adadelta, Adamax, Adagrad, ASGD
from torch.optim.lr_scheduler import CyclicLR, PolynomialLR, CosineAnnealingWarmRestarts
from torch.optim.lr_scheduler import ReduceLROnPlateau, ConstantLR, StepLR, CosineAnnealingLR
from lightning.pytorch.callbacks import ModelCheckpoint, StochasticWeightAveraging, EarlyStopping
from lightning.pytorch.callbacks import ModelSummary, LearningRateFinder, TQDMProgressBar, DeviceStatsMonitor
from lightning.pytorch.profilers import AdvancedProfiler, PyTorchProfiler
from lightning.pytorch.loggers import TensorBoardLogger
import sys
#from utils.losses import IoULoss, DiceLoss, TverskyLoss, FocalTverskyLoss, HybridLoss, FocalHybridLoss
from utils.datasets import CityscapesDataset #, MapillaryDataset
from torch.utils.data import DataLoader
from torchsummary import summary
#from utils.eval import MeanIoU
#from utils.models import  Unet, Residual_Unet, Attention_Unet, Unet_plus, DeepLabV3plus
from argparse import ArgumentParser
import yaml

In [7]:
# Read YAML file
print('Reading configuration from config yaml')

with open('config/Cityscapes.yaml', 'r') as config_file:
    config = yaml.safe_load(config_file)

# TODO: Add default values if a variable is not defined in the config file

LOGS_DIR = config.get('logs_dir')

model_config = config.get('model_config')
dataset_config = config.get('dataset_config')
train_config = config.get('train_config')

# Dataset Configuration
DATASET = dataset_config.get('name')
DATA_PATH = dataset_config.get('path')
VERSION = dataset_config.get('version')
NUM_TRAIN_IMAGES = dataset_config.get('num_train_images')
NUM_EVAL_IMAGES = dataset_config.get('num_eval_images')
SEED = dataset_config.get('seed')

# Model Configuration
MODEL_TYPE = model_config.get('architecture')
MODEL_NAME = model_config.get('name')
BACKBONE = model_config.get('backbone')
UNFREEZE_AT = model_config.get('unfreeze_at')
INPUT_SHAPE = model_config.get('input_shape')
OUTPUT_STRIDE = model_config.get('output_stride')
FILTERS = model_config.get('filters')
ACTIVATION = model_config.get('activation')
DROPOUT_RATE = model_config.get('dropout_rate')

# Training Configuration
# PRETRAINED_WEIGHTS = model_config['pretrained_weights']

BATCH_SIZE = train_config.get('batch_size') #
EPOCHS = train_config.get('epochs') #
AUGMENTATION = train_config.get('augment') #
PRECISION = str(train_config.get('precision')) #

# Stohastic weight averaging parameters
SWA = train_config.get('swa')
if SWA is not None:
    SWA_LRS = SWA.get('lr', 1e-3)
    SWA_EPOCH_START = SWA.get('epoch_start', 0.7)

DISTRIBUTE_STRATEGY = train_config.get('distribute').get('strategy') #
DEVICES = train_config.get('distribute').get('devices') #

Reading configuration from config yaml


In [8]:
class DeepLabV3(pl.LightningModule):
    def __init__(self, 
                 model: nn.Module,
                 train_config: dict = None
                 
                 ) -> None:        
        super().__init__()
        self.save_hyperparameters(ignore='model')
        
        loss = train_config.get('loss', 'CrossEntropy')
        
        self.model = model
        self.loss = self.get_loss(loss)
        self.optimizer_config = train_config.get('optimizer')
        self.lr_schedule_config = train_config.get('lr_schedule')
        self.batch_size = train_config.get('batch_size')
        
        #self.example_input_array = torch.Tensor(self.batch_size, 3, 1024, 2048)
        
    def get_lr_schedule(self, optimizer):
        lr = self.optimizer_config.get('learnin_rate', 1e-3)
        schedule = self.lr_schedule_config.get('name')
        
        # num of steps in cyclic lr should be cycle_epochs * steps_per_epoch
        # steps_per_epoch is defined depended on the length of the dataset
        # so maybe define the dataset inside the Lightning Module using 
        # the DataModule object
        
        if schedule in ['Polynomial', 'PolynomialLr', 'PolynomialLR', 'polynomial']:
            decay_epochs = self.lr_schedule_config.get('decay_epochs')
            power = self.lr_schedule_config.get('power')
            lr_schedule = PolynomialLR(
                optimizer=optimizer,
                total_iters=decay_epochs, #*steps_per_epoch,
                power=power,
                verbose=True
            )
            
        elif schedule in ['CyclicLR', 'Cyclic', 'CyclicLr', 'cyclic']:
            lr_schedule = CyclicLR(
                optimizer = optimizer,
                base_lr = lr,
                max_lr = self.lr_schedule_config.get('max_lr', 1e-2),
                # step_size_up=
                # step_size_down=
                gamma = self.lr_schedule_config.get('gamma', 1.0),
                verbose=  True
            )

        return lr_schedule
    
    
    def get_loss(self, loss: str):
        if loss in ['CrossEntropy', 'CrossEntropyLoss', 'crossentropy']:
            loss_fn = nn.CrossEntropyLoss()
        # elif loss in ['Dice, DiceLoss']:
        #     loss_fn = DiceLoss()
        # elif loss in ['Hybrid', 'HybridLoss']:
        #     loss_fn = HybridLoss()
        # elif loss in ['rmi', 'RMI', 'RmiLoss', 'RMILoss']:
        #     loss_fn = RMILoss()
        return loss_fn
    
    def training_step(self, train_batch, batch_idx):
        input, target = train_batch
        pred = self.model(input)['out']
        loss = self.loss(pred, target)
        # Logging to TensorBoard (if installed) by default
        self.log("train_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True)
        return loss
    
    def validation_step(self, val_batch, batch_idx):
        input, target = val_batch
        pred = self.model(input)['out']
        loss = self.loss(pred, target)
        self.log("val_loss", loss, on_epoch=True, on_step=False, prog_bar=True, sync_dist=True)
    
    
    def configure_optimizers(self):
        optimizer_name = self.optimizer_config.get('name', 'Adam')
        lr = self.optimizer_config.get('learnin_rate', 1e-3)
        weight_decay = self.optimizer_config.get('weight_decay', 0)
        momentum = self.optimizer_config.get('momentum', 0)
        
        optimizer_dict = {
            'Adam' : Adam(params=self.model.parameters(),
                          lr=lr,
                          weight_decay=weight_decay),
            'Adadelta' : Adadelta(params=self.model.parameters(),
                                  lr=lr,
                                  weight_decay=weight_decay),
            'SGD' : SGD(params=self.model.parameters(),
                        lr=lr,
                        momentum=momentum,
                        weight_decay=weight_decay)
        }

        optimizer = optimizer_dict[optimizer_name]
        return {
            'optimizer': optimizer,
            'lr_scheduler': self.get_lr_schedule(optimizer)
        }
    


In [9]:
train_ds = CityscapesDataset(root=DATA_PATH, 
                             split='train', 
                             mode=VERSION, 
                             target_type='semantic'
                             )
train_loader = DataLoader(dataset=train_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

val_ds = CityscapesDataset(root=DATA_PATH, 
                           split='val', 
                           mode=VERSION, 
                           target_type='semantic'
                           )
val_loader = DataLoader(dataset=val_ds, batch_size=BATCH_SIZE, shuffle=False)

In [10]:
model_checkpoint_path = f'saved_models/{MODEL_TYPE}/{MODEL_NAME}'
model_checkpoint_callback = ModelCheckpoint(dirpath=LOGS_DIR,
                                            filename=model_checkpoint_path,
                                            save_weights_only=False,
                                            monitor='val_loss',
                                            mode='min',
                                #    monitor='MeanIoU',
                                #    mode='max',
                                   verbose=True)

early_stopping_callback = EarlyStopping(patience=6,
                                        monitor='val_loss',
                                        # mode='max',
                                        min_delta=1e-6,
                                        verbose=True,
                                        strict=True,
                                        check_finite=True,
                                        log_rank_zero_only=True)


profiler = PyTorchProfiler(dirpath=LOGS_DIR, filename="perf-logs")
#profiler = AdvancedProfiler(dirpath=LOGS_DIR, filename="perf_logs")
#lr_finder_callback = LearningRateFinder()

In [11]:
callbacks = [model_checkpoint_callback, ModelSummary(max_depth=3), DeviceStatsMonitor()]

if SWA is not None:
    swa_callback = StochasticWeightAveraging(swa_lrs=SWA_LRS,
                                         swa_epoch_start=SWA_EPOCH_START)
    callbacks.append(swa_callback)

In [12]:
logger = TensorBoardLogger(save_dir=LOGS_DIR, name='Tensorboard_logs', version=f'{MODEL_TYPE}/{MODEL_NAME}')

In [16]:
model = DeepLabV3(
    model = deeplabv3_mobilenet_v3_large(num_classes=20),
    train_config=train_config
)

trainer = pl.Trainer(
    accelerator='gpu',
    devices=DEVICES,
    limit_train_batches=NUM_TRAIN_IMAGES, 
    limit_val_batches=NUM_EVAL_IMAGES,
    max_epochs=EPOCHS,
    precision=PRECISION,
    deterministic=False,
    callbacks=callbacks,
    default_root_dir=LOGS_DIR,
    logger=logger,
    #strategy=DISTRIBUTE_STRATEGY
    profiler='simple',
    #sync_batchnorm=True,
)

Trainer already configured with model summary callbacks: [<class 'lightning.pytorch.callbacks.model_summary.ModelSummary'>]. Skipping setting a default `ModelSummary` callback.
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [17]:
torch.set_float32_matmul_precision('high')

In [18]:
trainer.fit(model, train_loader, val_loader)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1]

   | Name               | Type                    | Params
----------------------------------------------------------------
0  | model              | DeepLabV3               | 11.0 M
1  | model.backbone     | IntermediateLayerGetter | 3.0 M 
2  | model.backbone.0   | Conv2dNormActivation    | 464   
3  | model.backbone.1   | InvertedResidual        | 464   
4  | model.backbone.2   | InvertedResidual        | 3.4 K 
5  | model.backbone.3   | InvertedResidual        | 4.4 K 
6  | model.backbone.4   | InvertedResidual        | 10.3 K
7  | model.backbone.5   | InvertedResidual        | 21.0 K
8  | model.backbone.6   | InvertedResidual        | 21.0 K
9  | model.backbone.7   | InvertedResidual        | 32.1 K
10 | model.backbone.8   | InvertedResidual        | 34.8 K
11 | model.backbone.9   | InvertedResidual        | 32.0 K
12 | model.backbone.10  | InvertedResidual    

Adjusting learning rate of group 0 to 1.0000e-03.
Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(
  one_hot_output.scatter_(0, torch.tensor(input, dtype=torch.int64), 1)


                                                                           

STAGE:2023-05-06 12:19:40 196158:196158 ActivityProfilerController.cpp:311] Completed Stage: Warm Up


Epoch 0:   8%|▊         | 4/50 [00:04<00:55,  1.21s/it, v_num=ets1]

STAGE:2023-05-06 12:19:48 196158:196158 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-06 12:19:48 196158:196158 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


Epoch 0:  98%|█████████▊| 49/50 [00:40<00:00,  1.21it/s, v_num=ets1]Adjusting learning rate of group 0 to 8.1000e-04.
Epoch 0: 100%|██████████| 50/50 [00:41<00:00,  1.21it/s, v_num=ets1]

STAGE:2023-05-06 12:20:25 196158:196158 ActivityProfilerController.cpp:311] Completed Stage: Warm Up




STAGE:2023-05-06 12:20:27 196158:196158 ActivityProfilerController.cpp:317] Completed Stage: Collection
STAGE:2023-05-06 12:20:27 196158:196158 ActivityProfilerController.cpp:321] Completed Stage: Post Processing


Epoch 0: 100%|██████████| 50/50 [00:49<00:00,  1.01it/s, v_num=ets1, val_loss=1.290, train_loss=0.884]

Epoch 0, global step 50: 'val_loss' was not in top 1


Epoch 1:  98%|█████████▊| 49/50 [00:39<00:00,  1.23it/s, v_num=ets1, val_loss=1.290, train_loss=0.884]Adjusting learning rate of group 0 to 6.4000e-04.
Epoch 1: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s, v_num=ets1, val_loss=1.180, train_loss=0.588]

Epoch 1, global step 100: 'val_loss' was not in top 1


Epoch 2:  98%|█████████▊| 49/50 [00:39<00:00,  1.23it/s, v_num=ets1, val_loss=1.180, train_loss=0.588]Adjusting learning rate of group 0 to 4.9000e-04.
Epoch 2: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s, v_num=ets1, val_loss=1.360, train_loss=0.478]

Epoch 2, global step 150: 'val_loss' was not in top 1


Epoch 3:  98%|█████████▊| 49/50 [00:39<00:00,  1.23it/s, v_num=ets1, val_loss=1.360, train_loss=0.478]Adjusting learning rate of group 0 to 3.6000e-04.
Epoch 3: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s, v_num=ets1, val_loss=1.130, train_loss=0.403]

Epoch 3, global step 200: 'val_loss' was not in top 1


Epoch 4:  98%|█████████▊| 49/50 [00:39<00:00,  1.23it/s, v_num=ets1, val_loss=1.130, train_loss=0.403]Adjusting learning rate of group 0 to 2.5000e-04.
Epoch 4: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s, v_num=ets1, val_loss=1.020, train_loss=0.340]

`Trainer.fit` stopped: `max_epochs=5` reached.


Epoch 4: 100%|██████████| 50/50 [00:48<00:00,  1.03it/s, v_num=ets1, val_loss=1.020, train_loss=0.340]
