In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, Compose
import segmentation_models_pytorch as smp

from src.models.BaselineModel import BaselineModel
from src.evaluation.evaluate_result import evaluate_result
from src.datasets.UAVidSemanticSegmentationDataset import (
    UAVidSemanticSegmentationDataset,
)

from src.datasets.utils.ResizeToDivisibleBy32 import ResizeToDivisibleBy32

  from .autonotebook import tqdm as notebook_tqdm


## Prepare environment

In [2]:
torch.cuda.is_available()

True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
VAL_SIZE = 0.2
BATCH_SIZE = 1
SEED = 42
UAVID_DATASET_PATH = "data/UAVidSemanticSegmentationDataset"
IMAGE_SIZE = 576

In [5]:
train_dataset = UAVidSemanticSegmentationDataset(
    UAVID_DATASET_PATH,
    transforms=[Compose([Resize(IMAGE_SIZE), ResizeToDivisibleBy32()])],
)
print(len(train_dataset))

200


In [6]:
val_dataset = UAVidSemanticSegmentationDataset(
    UAVID_DATASET_PATH,
    split="valid",
    transforms=[Compose([Resize(IMAGE_SIZE), ResizeToDivisibleBy32()])],
)
print(len(val_dataset))

70


In [7]:
test_dataset = UAVidSemanticSegmentationDataset(
    UAVID_DATASET_PATH,
    split="test",
    transforms=[Compose([Resize(IMAGE_SIZE), ResizeToDivisibleBy32()])],
)
print(len(test_dataset))

10


## Sanity check data

In [8]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=True)

In [9]:
for images, masks in train_loader:
    print(images.shape)
    print(masks.shape)
    break

torch.Size([1, 3, 576, 1024])
torch.Size([1, 1, 576, 1024])


In [10]:
for images, masks in val_loader:
    print(images.shape)
    print(masks.shape)
    break

torch.Size([1, 3, 576, 1024])
torch.Size([1, 1, 576, 1024])


In [11]:
for images in test_loader:
    print(images.shape)
    break

torch.Size([1, 3, 576, 1024])


In [12]:
to_pil_transform = transforms.ToPILImage()

In [13]:
img = to_pil_transform(images.squeeze())

In [14]:
# img.show()

In [15]:
msk = to_pil_transform(masks.squeeze())

In [16]:
# msk.show()

# UNET

## Run example model on single image

In [17]:
model = smp.Unet(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=8,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [18]:
baseline_model = BaselineModel(classes=8).to(device)

In [19]:
torch.cuda.empty_cache()

In [20]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 1024])
torch.Size([1, 1, 576, 1024])


  return self._call_impl(*args, **kwargs)


In [21]:
print(output.shape)

torch.Size([1, 576, 1024])


In [22]:
print(output_baseline.shape)

torch.Size([1, 576, 1024])


In [23]:
unique, counts = np.unique(output.cpu(), return_counts=True)
print(dict(zip(unique, counts)))

{0: 421156, 1: 171, 2: 1, 3: 41, 4: 64813, 5: 94888, 6: 5675, 7: 3079}


In [24]:
evaluate_result(output, masks, mode="multiclass", num_classes=8)

{'iou': tensor(0.0950),
 'f1': tensor(0.1735),
 'accuracy': tensor(0.7934),
 'recall': tensor(0.1735)}

In [25]:
evaluate_result(output_baseline, masks, mode="multiclass", num_classes=8)

{'iou': tensor(0.0667),
 'f1': tensor(0.1251),
 'accuracy': tensor(0.7813),
 'recall': tensor(0.1251)}

## Run example model on dataset

In [26]:
model.eval()
metrics_dict = {
    "iou": [],
    "f1": [],
    "accuracy": [],
    "recall": [],
}

In [27]:
with torch.no_grad():
    for images, masks in train_loader:
        output = model(images.to(device))
        output = torch.argmax(output, dim=1)


        iter_metrics = evaluate_result(output, masks, mode="multiclass", num_classes=8)
        for key in metrics_dict.keys():
            metrics_dict[key].append(iter_metrics[key])

In [28]:
print(metrics_dict)

{'iou': [tensor(0.1571), tensor(0.0709), tensor(0.1364), tensor(0.1650), tensor(0.2344), tensor(0.1098), tensor(0.1290), tensor(0.1248), tensor(0.0845), tensor(0.0191), tensor(0.1105), tensor(0.0675), tensor(0.0961), tensor(0.0961), tensor(0.0704), tensor(0.0794), tensor(0.1482), tensor(0.1021), tensor(0.0582), tensor(0.0715), tensor(0.0797), tensor(0.0438), tensor(0.1321), tensor(0.1098), tensor(0.0524), tensor(0.1219), tensor(0.0703), tensor(0.1151), tensor(0.0544), tensor(0.0735), tensor(0.0958), tensor(0.0929), tensor(0.0289), tensor(0.0843), tensor(0.1763), tensor(0.0966), tensor(0.0767), tensor(0.0891), tensor(0.0537), tensor(0.1011), tensor(0.1655), tensor(0.1204), tensor(0.0713), tensor(0.1453), tensor(0.0658), tensor(0.0804), tensor(0.0494), tensor(0.0867), tensor(0.0377), tensor(0.1068), tensor(0.0613), tensor(0.1534), tensor(0.1060), tensor(0.1174), tensor(0.0909), tensor(0.0256), tensor(0.0819), tensor(0.1090), tensor(0.0652), tensor(0.0334), tensor(0.0724), tensor(0.1173),

# UNET++

## Run example model on single image

In [29]:
model = smp.UnetPlusPlus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=8,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [30]:
baseline_model = BaselineModel()

In [31]:
torch.cuda.empty_cache()

In [32]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 1120])
torch.Size([1, 1, 576, 1120])


In [33]:
print(output.shape)

torch.Size([1, 576, 1120])


In [34]:
print(output_baseline.shape)

torch.Size([1, 576, 1120])


In [35]:
to_pil_transform = transforms.ToPILImage()

In [36]:
img = to_pil_transform(images.squeeze())

In [37]:
# img.show()

In [38]:
msk = to_pil_transform(masks.squeeze())

In [39]:
# msk.show()

In [40]:
unique, counts = np.unique(output.to('cpu'), return_counts=True)
print(dict(zip(unique, counts)))

{0: 50678, 1: 888, 2: 339, 4: 433271, 5: 640, 6: 89677, 7: 69627}


In [41]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [42]:
# plt.imsave("assets/uavid-example-unet-plus-plus-output.jpeg", np.array(outp))

In [43]:
# outp.show()

In [44]:
evaluate_result(output, masks, mode="multiclass", num_classes=8)

{'iou': tensor(0.1375),
 'f1': tensor(0.2418),
 'accuracy': tensor(0.8105),
 'recall': tensor(0.2418)}

## DeepLabV3

## Run example model on single image

In [45]:
model = smp.DeepLabV3(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=8,
    activation="softmax"
).to(device)

In [46]:
baseline_model = BaselineModel()

In [47]:
torch.cuda.empty_cache()

In [48]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 1120])
torch.Size([1, 1, 576, 1120])


In [49]:
print(output.shape)

torch.Size([1, 576, 1120])


In [50]:
print(output_baseline.shape)

torch.Size([1, 576, 1120])


In [51]:
to_pil_transform = transforms.ToPILImage()

In [52]:
img = to_pil_transform(images.squeeze())

In [53]:
# img.show()

In [54]:
msk = to_pil_transform(masks.squeeze())

In [55]:
# msk.show()

In [56]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [57]:
plt.imsave("assets/uavid-example-deep-lab-v3-output.jpeg", np.array(outp))

In [58]:
# outp.show()

In [59]:
evaluate_result(output, masks, mode="multiclass", num_classes=8)

{'iou': tensor(0.1123),
 'f1': tensor(0.2019),
 'accuracy': tensor(0.8005),
 'recall': tensor(0.2019)}

## DeepLabV3+

## Run example model on single image

In [60]:
model = smp.DeepLabV3Plus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=8,
    activation="softmax"
).to(device)

In [61]:
baseline_model = BaselineModel()

In [62]:
torch.cuda.empty_cache()

In [63]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 1024])
torch.Size([1, 1, 576, 1024])


In [64]:
print(output.shape)

torch.Size([1, 576, 1024])


In [65]:
print(output_baseline.shape)

torch.Size([1, 576, 1024])


In [66]:
to_pil_transform = transforms.ToPILImage()

In [67]:
img = to_pil_transform(images.squeeze())

In [68]:
# img.show()

In [69]:
msk = to_pil_transform(masks.squeeze())

In [70]:
# msk.show()

In [71]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [72]:
plt.imsave("assets/uavid-example-deep-lab-v3-plus-output.jpeg", np.array(outp))

In [73]:
# outp.show()

In [74]:
evaluate_result(output, masks, mode="multiclass", num_classes=8)

{'iou': tensor(0.0114),
 'f1': tensor(0.0224),
 'accuracy': tensor(0.7556),
 'recall': tensor(0.0224)}