In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Install Libraries

In [2]:
# ! pip install -U lightning ultralytics

## Organize Imports

In [3]:
import os
import gc
from pathlib import Path
import requests
import zipfile

In [4]:
import json

In [5]:
from PIL import Image

In [6]:
import matplotlib.pyplot as plt

In [7]:
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split

In [8]:
from torchmetrics.classification import Dice

In [9]:
from torchvision import tv_tensors
from torchvision.datasets import OxfordIIITPet, Cityscapes, wrap_dataset_for_transforms_v2
from torchvision.models import resnet34, resnet
from torchvision.models.detection import mask_rcnn, maskrcnn_resnet50_fpn, maskrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.models.detection.mask_rcnn import MaskRCNNPredictor
from torchvision.transforms import v2

In [10]:
import lightning as pl
from lightning.pytorch.callbacks import ModelCheckpoint
from lightning.pytorch.loggers import TensorBoardLogger

In [11]:
from ultralytics import YOLO

## Initialize Folders for Model and Data

In [12]:
def makedirs(dir_path: Path):
    """Create ``dir_path`` together with any missing parent directories.

    Calling it on a directory that already exists is a no-op.

    Args:
        dir_path: Directory to create.
    """
    dir_path.mkdir(parents=True, exist_ok=True)

In [13]:
# Root folders for datasets and model artifacts, relative to the working directory.
DATA = Path('data')
MODELS = Path('models')

# Per-model output folders.
unet_path = MODELS / 'unet'  # was 'unset' — a typo for the UNet artifacts folder
yolo_path = MODELS / 'yolo11'

# Per-dataset folders.
pets_path = DATA / 'pets'
carparts_path = DATA / 'carparts'
cracks_path = DATA / 'cracks'
packages_path = DATA / 'packages'

# Only these three folders are created up front; the other dataset folders
# are created on demand when their YAML file is downloaded.
makedirs(unet_path)
makedirs(yolo_path)
makedirs(pets_path)

## Initialize Device

In [14]:
def init_device():
    """Choose the compute device, print it, and return its name.

    Preference order: ``'mps'`` when the MPS backend is available, then
    ``'cuda'``, and ``'cpu'`` as the fallback.

    Returns:
        The device name as a string.
    """
    if torch.backends.mps.is_available():
        device = 'mps'
    elif torch.cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    print(f'Device: {device}')

    return device

In [15]:
# Resolve the device once; the YOLO training cells pass `device` to model.train().
device = init_device()
device

Device: mps


'mps'

# Section A: UNet Segmentation Training with Pretrained ResNet34 Backbone

In this section we will train a UNet model for semantic segmentation. The encoder uses a pretrained ResNet34 backbone. We will use the Oxford-IIIT Pet dataset (downloaded via TorchVision) and train our UNet model using PyTorch Lightning.

#### Prepare the Model

In [16]:
class ConvBlock(nn.Module):
    """Two 3x3 convolutions (padding 1), each followed by a ReLU.

    Args:
        in_ch: Number of input channels.
        out_ch: Number of channels produced by both convolutions.
    """

    def __init__(self, in_ch, out_ch):
        super().__init__()
        layers = [
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=1),
            nn.ReLU(),
            nn.Conv2d(out_ch, out_ch, kernel_size=3, padding=1),
            nn.ReLU(),
        ]
        # The attribute name and layer order determine the state-dict keys
        # ("conv.0.*", "conv.2.*"), so both are kept stable.
        self.conv = nn.Sequential(*layers)

    def forward(self, x):
        """Run the input through both conv + ReLU stages."""
        features = self.conv(x)
        return features

class UNetResNet34(nn.Module):
    """U-Net style segmentation network built on a pretrained ResNet34 encoder.

    The encoder reuses the ResNet34 stem and its four residual stages. The
    decoder repeatedly upsamples the deepest features to the size of the next
    skip connection, concatenates the two, and refines them with a ConvBlock.

    Args:
        n_classes: Number of output channels of the final 1x1 convolution.
    """

    def __init__(self, n_classes=1):
        super().__init__()
        backbone = resnet34(weights=resnet.ResNet34_Weights.DEFAULT)

        # Encoder stages, shallow to deep.
        self.enc0 = nn.Sequential(backbone.conv1, backbone.bn1, backbone.relu)
        self.enc1 = nn.Sequential(backbone.maxpool, backbone.layer1)
        self.enc2 = backbone.layer2
        self.enc3 = backbone.layer3
        self.enc4 = backbone.layer4

        # Decoder stages; input channels = upsampled features + skip features.
        self.dec3 = ConvBlock(512 + 256, 256)
        self.dec2 = ConvBlock(256 + 128, 128)
        self.dec1 = ConvBlock(128 + 64, 64)
        self.dec0 = ConvBlock(64 + 64, 32)
        self.final = nn.Conv2d(32, n_classes, kernel_size=1)

    def forward(self, x):
        """Return per-pixel sigmoid outputs with the same height/width as ``x``."""
        # Encoder pass, remembering every stage output for the skip connections.
        skips = []
        features = x
        for stage in (self.enc0, self.enc1, self.enc2, self.enc3):
            features = stage(features)
            skips.append(features)
        features = self.enc4(features)

        # Decoder pass, deepest skip first.
        decoders = (self.dec3, self.dec2, self.dec1, self.dec0)
        for decoder, skip in zip(decoders, reversed(skips)):
            # Resize to the skip's spatial size (F.interpolate's default mode).
            upsampled = F.interpolate(features, skip.size()[2:])
            features = decoder(torch.cat([upsampled, skip], dim=1))

        out = self.final(features)  # still at the resolution of the enc0 features

        # Bring the prediction back to the input resolution.
        out = F.interpolate(out, size=x.shape[2:], mode='bilinear', align_corners=False)

        return torch.sigmoid(out)

In [17]:
class LitUNet(pl.LightningModule):
    """Lightning wrapper that trains a segmentation model with BCE loss and logs Dice.

    Args:
        model: Network whose forward pass returns values in [0, 1];
            ``nn.BCELoss`` is applied directly to its output.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model
        # NOTE(review): BCELoss on probabilities may be rejected or numerically
        # fragile under float16 autocast on some backends — confirm with the
        # precision chosen for the Trainer. BCEWithLogitsLoss on raw logits is
        # the usual alternative, but it requires the model to return logits.
        self.loss_fn = nn.BCELoss()
        self.dice = Dice(num_classes=2, average='micro')

    def forward(self, x):
        """Delegate to the wrapped model."""
        return self.model(x)

    def _shared_step(self, batch, stage):
        """Compute loss and Dice for one batch and log them per epoch.

        Args:
            batch: ``(inputs, targets)`` pair from the dataloader.
            stage: Prefix for the logged metric names ("train", "val" or "test").

        Returns:
            The loss tensor for the batch.
        """
        x, y = batch
        y_hat = self(x)
        loss = self.loss_fn(y_hat, y)
        # Dice is computed on hard labels: predictions are thresholded at 0.5.
        dice_score = self.dice((y_hat > 0.5).int(), y.int())
        self.log(f"{stage}_loss", loss, prog_bar=True, on_step=False, on_epoch=True)
        self.log(f"{stage}_dice", dice_score, prog_bar=True, on_step=False, on_epoch=True)

        return loss

    def training_step(self, batch, batch_idx):
        """Log ``train_loss`` / ``train_dice`` and return the loss to optimise."""
        return self._shared_step(batch, "train")

    def validation_step(self, batch, batch_idx):
        """Log ``val_loss`` / ``val_dice`` and return the loss."""
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        """Log ``test_loss`` / ``test_dice`` and return the loss."""
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        """Return AdamW with a cosine-annealing warm-restart schedule."""
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3, weight_decay=1e-4)
        scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=5, T_mult=2)

        return [optimizer], [scheduler]

#### Initialize Data

In [18]:
class TargetTransform:
    """Callable that turns a label tensor into a binary float mask.

    The input is shifted down by one, clamped to the range [0, 1], and cast
    to float.
    """

    def forward(self, x):
        """Return ``(x - 1)`` clamped to [0, 1] as a float tensor."""
        shifted = x - 1
        binary = shifted.clamp(0, 1)
        return binary.float()

    def __call__(self, *args, **kwargs):
        return self.forward(*args, **kwargs)

In [19]:
# Number of DataLoader worker processes used by PetDataModule (0 = load in the main process).
workers=0

In [21]:
class JointTransform:
    """Apply one random augmentation to an image and its segmentation mask together.

    The target is wrapped as a ``tv_tensors.Mask`` before the pipeline runs, so
    the v2 transforms treat it as a mask: geometric operations (resize, flips,
    rotation, crop) use the same random parameters as for the image, while the
    photometric operations (colour jitter, grayscale) leave it untouched.

    Args:
        base_size: Size passed to ``v2.Resize``.
        crop_size: Size passed to ``v2.RandomCrop``.
    """

    def __init__(self, base_size, crop_size):
        self.transforms = v2.Compose([
            v2.Resize(base_size),
            v2.RandomHorizontalFlip(p=0.1),
            v2.RandomVerticalFlip(p=0.1),
            v2.RandomRotation(30),
            v2.RandomCrop(crop_size),
            v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            v2.RandomGrayscale(p=0.1),
            v2.ToTensor(),
        ])

    def __call__(self, image, target):
        """Return the augmented ``(image, target)`` pair.

        Previously the tuple returned by the pipeline was assigned to ``image``
        and the *un-augmented* target was converted, so image and mask no
        longer lined up.

        Args:
            image: Input image (PIL image or tensor).
            target: Segmentation mask (PIL image or tensor).

        Returns:
            ``(image, target)`` where ``image`` is the float image tensor and
            ``target`` is a plain float tensor holding the mask's label values
            (not rescaled).
        """
        mask = tv_tensors.Mask(target)
        image, mask = self.transforms(image, mask)
        # Drop the Mask wrapper so downstream code receives a plain tensor.
        target = mask.as_subclass(torch.Tensor).float()
        return image, target

In [48]:
# Defined at module level (outside PetDataModule) so instances can be pickled
# for DataLoader worker processes.
class _PetTrainTransform:
    """Joint train-time augmentation for an (image, segmentation target) pair.

    The target is wrapped as a ``tv_tensors.Mask`` so the geometric transforms
    use the same random parameters for image and mask, and the photometric
    transforms only touch the image. The mask is binarised with
    ``TargetTransform`` afterwards.
    """

    def __init__(self, resize_size=118, crop_size=100):
        self.augment = v2.Compose([
            v2.Resize(resize_size),
            v2.RandomHorizontalFlip(p=0.1),
            v2.RandomVerticalFlip(p=0.1),
            v2.RandomRotation(30),
            v2.RandomCrop(crop_size),
            v2.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            v2.RandomGrayscale(p=0.1),
            v2.ToTensor(),  # converts the PIL image only; the Mask passes through
        ])
        self.to_binary = TargetTransform()

    def __call__(self, image, target):
        mask = tv_tensors.Mask(target)
        image, mask = self.augment(image, mask)
        # Unwrap to a plain tensor before binarising.
        return image, self.to_binary(mask.as_subclass(torch.Tensor))


class PetDataModule(pl.LightningDataModule):
    """Oxford-IIIT Pet segmentation data for the UNet.

    Training uses the ``trainval`` split with joint random augmentation;
    validation uses the ``test`` split with a deterministic resize only.

    Fixes over the previous version:
      * The training pipeline ended in ``TargetTransform`` inside a joint
        ``v2.Compose``; it received ``(image, target)`` but accepts a single
        argument, so fetching the first training sample raised ``TypeError``.
      * The training image was never converted to a float tensor.
      * Validation images were randomly colour-jittered / grayscaled, making
        ``val_loss`` (which drives checkpoint selection) noisy.
      * The validation mask was resized with the default interpolation, which
        blends label values; it now uses nearest-neighbour.

    Args:
        batch_size: Batch size for both dataloaders.
        image_size: Side length of the square validation images.
        crop_size: Side length of the random training crops.
    """

    def __init__(self, batch_size=32, image_size=118, crop_size=100):
        super().__init__()
        self.batch_size = batch_size
        self.image_size = image_size
        self.crop_size = crop_size

    def setup(self, stage=None):
        """Build the train and validation datasets (downloads the data if missing)."""
        val_size = (self.image_size, self.image_size)
        transforms_img = v2.Compose([
            v2.Resize(val_size),
            v2.ToTensor(),
        ])
        transforms_target = v2.Compose([
            # Nearest-neighbour keeps the label values intact.
            v2.Resize(val_size, interpolation=v2.InterpolationMode.NEAREST),
            v2.PILToTensor(),
            TargetTransform(),
        ])
        self.train_ds = OxfordIIITPet(
            root=pets_path,
            split='trainval',
            target_types='segmentation',
            transforms=_PetTrainTransform(self.image_size, self.crop_size),
            download=True,
        )
        self.val_ds = OxfordIIITPet(
            root=pets_path,
            split='test',
            target_types='segmentation',
            transform=transforms_img,
            target_transform=transforms_target,
            download=True,
        )

    def train_dataloader(self):
        """Shuffled loader over the training split."""
        return DataLoader(
            self.train_ds,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=workers,
        )

    def val_dataloader(self):
        """Ordered loader over the validation split."""
        return DataLoader(
            self.val_ds,
            batch_size=self.batch_size,
            shuffle=False,
            num_workers=workers,
        )

#### Prepare Checkpoint Callback

In [49]:
# Keep only the single best checkpoint, ranked by the lowest `val_loss`
# (the metric logged in LitUNet.validation_step).
checkpoint_callback = ModelCheckpoint(
    monitor='val_loss',
    dirpath=unet_path / 'checkpoints/',
    filename='unet-{epoch:02d}-{val_loss:.2f}',
    save_top_k=1,
    mode='min',
)

#### Setup Training

In [50]:
# Single-channel UNet (n_classes defaults to 1) with the pretrained ResNet34 encoder.
model = UNetResNet34()

In [51]:
# Wrap the network in the Lightning module that defines loss, metrics and optimiser.
lit_model = LitUNet(model)

In [52]:
data_module = PetDataModule()

# Train for 10 epochs on one automatically selected accelerator with 16-bit
# mixed precision; the checkpoint callback keeps the best model by val_loss.
trainer = pl.Trainer(
    max_epochs=10, 
    callbacks=[checkpoint_callback],
    accelerator='auto',
    devices=1,
    precision='16-mixed',
)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (mps), used: True
TPU available: False, using: 0 TPU cores
HPU available: False, using: 0 HPUs


#### Train the Model

In [None]:
# Run training; the data module supplies the train and validation loaders.
trainer.fit(
    lit_model, 
    datamodule=data_module
)

  5%|█████████▊                                                                                                                                                                                            | 39.2M/792M [01:25<13:14, 948kB/s]

In [None]:
# Save the final training state. The relative path resolves against the current
# working directory, not against `unet_path` where the best checkpoint is kept.
trainer.save_checkpoint("last_checkpoint_aug.ckpt")

#### Visualize Predicted Masks

In [None]:
data_module.setup()
x, y = next(iter(data_module.val_dataloader()))

# Inference only: eval() switches layers such as batch norm to inference
# behaviour, and no_grad() avoids building an autograd graph. The batch is
# moved to wherever the model's parameters currently live.
lit_model.eval()
with torch.no_grad():
    preds = lit_model(x.to(lit_model.device)).cpu().numpy()

# One row per example: input image, ground-truth mask, thresholded prediction.
n_examples = min(3, len(x))
fig, axes = plt.subplots(n_examples, 3, figsize=(10, 5), squeeze=False)
for i in range(n_examples):
    axes[i, 0].imshow(x[i].permute(1, 2, 0).cpu().numpy())
    axes[i, 0].set_title('Input')
    axes[i, 1].imshow(y[i][0].cpu().numpy(), cmap='gray')
    axes[i, 1].set_title('Ground Truth')
    axes[i, 2].imshow(preds[i][0] > 0.5, cmap='gray')
    axes[i, 2].set_title('Prediction')
fig.tight_layout()
plt.show()

For a reference implementation, see the [U-Net model on PyTorch Hub](https://pytorch.org/hub/mateuszbuda_brain-segmentation-pytorch_unet/).

In [None]:
# Force a garbage-collection pass before moving on to the YOLO section.
gc.collect()

# Section B: Train YOLO11 for Instance Segmentation

In this section we will train YOLO11 instance-segmentation models from `ultralytics` on several datasets (car parts, cracks, and packages).

In [None]:
# NOTE: this rebinds `model`, which referred to the UNet in Section A, to a YOLO segmentation model.
model = YOLO("yolo11n-seg.pt") 

#### Download YAML Metadata File

In [None]:
def download_yaml(url: str, dest: Path, timeout: float = 30.0) -> Path:
    """Download a file into ``dest`` and return the local path.

    The file name is taken from the last path segment of the URL. GitHub
    ``/blob/`` links are rewritten to their ``/raw/`` form, because the blob
    link serves an HTML page rather than the file itself.

    Args:
        url: Address of the file to download.
        dest: Target directory; created (with parents) when missing.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        ``dest / <file name>``. On a non-200 response nothing is written, a
        message is printed, and the path is still returned.
    """
    dest.mkdir(exist_ok=True, parents=True)

    if url.startswith('https://github.com/') and '/blob/' in url:
        url = url.replace('/blob/', '/raw/', 1)

    file_name = url.split('/')[-1]
    file_path = dest / file_name
    response = requests.get(url, timeout=timeout)
    if response.status_code == 200:
        file_path.write_bytes(response.content)
        print(f'{file_name} has been downloaded successfully in {file_path}.')
    else:
        print(f'Failed to download file. Status code: {response.status_code}')

    return file_path

#### Prepare Data

In [None]:
# Fetch the car-parts dataset description into its data folder.
url = 'https://github.com/ultralytics/ultralytics/raw/main/ultralytics/cfg/datasets/carparts-seg.yaml'
yaml_path = download_yaml(url, carparts_path)

#### Train the Model

In [None]:
# NOTE(review): only the file *name* is passed as `data`, not the downloaded
# path — presumably Ultralytics resolves it against its bundled dataset
# configs, which would leave the downloaded copy unused; confirm.
# NOTE(review): `workers=11` is hard-coded — confirm it suits the machine.
results = model.train(
    data=yaml_path.name, 
    epochs=100, 
    imgsz=640,
    device=device,
    workers=11,
)

#### Visualize Prediction

In [None]:
# Predict with the model
results = model(source="https://ultralytics.com/images/bus.jpg", show=True)  # predict on an image

# Access the results
# for result in results:
#     xy = result.masks.xy  # mask in polygon format
#     xyn = result.masks.xyn  # normalized
#     masks = result.masks.data  # mask in matrix format (num_objects x H x W)

In [None]:
# `result` was never bound (the loop that defined it is commented out), so this
# cell raised NameError. Take the first (only) prediction result explicitly.
result = results[0]
img = result.plot()

In [None]:
# Results.plot() returns the image in BGR channel order, whereas matplotlib
# expects RGB; reverse the channel axis so colours are shown correctly.
plt.imshow(img[..., ::-1])
plt.show()
plt.close()

#### Train on Crack Dataset

In [None]:
# Use the `/raw/` URL: the `/blob/` URL returns GitHub's HTML page, which would
# be saved in place of the YAML file.
url = 'https://github.com/ultralytics/ultralytics/raw/main/ultralytics/cfg/datasets/crack-seg.yaml'
yaml_path = download_yaml(url, cracks_path)

In [None]:
# NOTE(review): this continues with the same `model` object that was trained on
# the previous dataset — confirm that is intended rather than a fresh model.
# NOTE(review): only the file name is passed as `data`, not the downloaded path — confirm.
results = model.train(
    data=yaml_path.name, 
    epochs=100, 
    imgsz=640,
    device=device,
    workers=11,
)

#### Train on Packages Dataset

In [None]:
# Use the `/raw/` URL: the `/blob/` URL returns GitHub's HTML page, which would
# be saved in place of the YAML file.
url = 'https://github.com/ultralytics/ultralytics/raw/main/ultralytics/cfg/datasets/package-seg.yaml'
yaml_path = download_yaml(url, packages_path)

In [None]:
# NOTE(review): this continues with the same `model` object that was trained on
# the previous datasets — confirm that is intended rather than a fresh model.
# NOTE(review): only the file name is passed as `data`, not the downloaded path — confirm.
results = model.train(
    data=yaml_path.name, 
    epochs=100, 
    imgsz=640,
    device=device,
    workers=11,
)