In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import Resize, Compose
import segmentation_models_pytorch as smp

from src.models.BaselineModel import BaselineModel
from src.evaluation.evaluate_result import evaluate_result
from src.datasets.AerialDroneSemanticSegmentationDataset import (
    AerialDroneSemanticSegmentationDataset,
)

from src.datasets.utils.ResizeToDivisibleBy32 import ResizeToDivisibleBy32

  from .autonotebook import tqdm as notebook_tqdm


## Prepare environment

In [2]:
torch.cuda.is_available()

True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
VAL_SIZE = 0.2
BATCH_SIZE = 1
SEED = 42
AERIAL_DRONE_DATASET_PATH = (
    "data/AerialDroneSemanticSegmentationDataset/dataset/semantic_drone_dataset"
)
IMAGE_SIZE = 576

In [5]:
train_dataset = AerialDroneSemanticSegmentationDataset(
    AERIAL_DRONE_DATASET_PATH,
    transforms=[Compose([Resize(IMAGE_SIZE), ResizeToDivisibleBy32()])],
)
print(len(train_dataset))

400


## Sanity check data

In [6]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [7]:
for images, masks in train_loader:
    print(images.shape)
    print(masks.shape)
    break

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


In [8]:
to_pil_transform = transforms.ToPILImage()

In [9]:
img = to_pil_transform(images.squeeze())

In [10]:
# img.show()

In [11]:
msk = to_pil_transform(masks.squeeze())

In [12]:
# msk.show()

## Run example model on single image

In [13]:
model = smp.Unet(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=23,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [14]:
baseline_model = BaselineModel(classes=23).to(device)

In [15]:
torch.cuda.empty_cache()

In [16]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    # output = (output > 0.5).float()

    output_baseline = baseline_model(images.to(device))
    # output_baseline = (output_baseline > 0.5).float()

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


  return self._call_impl(*args, **kwargs)


In [17]:
print(output.shape)

torch.Size([1, 23, 576, 864])


In [18]:
print(output_baseline.shape)

torch.Size([1, 23, 576, 864])


In [19]:
output = torch.argmax(output, dim=1)
output_baseline = torch.argmax(output_baseline, dim=1)

In [20]:
print(output.shape)
print(output_baseline.shape)

torch.Size([1, 576, 864])
torch.Size([1, 576, 864])


In [21]:
evaluate_result(output.squeeze(), masks.squeeze(), mode="multiclass", num_classes=23)

{'iou': tensor(0.0004),
 'f1': tensor(0.0008),
 'accuracy': tensor(0.9131),
 'recall': tensor(0.0008)}

In [22]:
evaluate_result(
    output_baseline.squeeze(), masks.squeeze(), mode="multiclass", num_classes=23
)

{'iou': tensor(0.0224),
 'f1': tensor(0.0439),
 'accuracy': tensor(0.9169),
 'recall': tensor(0.0439)}

## Run example model on dataset

In [23]:
model.eval()
metrics_dict = {
    "iou": [],
    "f1": [],
    "accuracy": [],
    "recall": [],
}

In [24]:
with torch.no_grad():
    for images, masks in train_loader:
        output = model(images.to(device))
        output = torch.argmax(output, dim=1)

        iter_metrics = evaluate_result(
            output.squeeze(), masks.squeeze(), mode="multiclass", num_classes=23
        )
        for key in metrics_dict.keys():
            metrics_dict[key].append(iter_metrics[key])

In [25]:
print(metrics_dict)

{'iou': [tensor(0.0006), tensor(0.0002), tensor(0.0004), tensor(0.0027), tensor(1.4066e-05), tensor(1.8085e-05), tensor(0.0005), tensor(0.0003), tensor(6.0285e-05), tensor(0.0033), tensor(0.0089), tensor(0.0020), tensor(0.0001), tensor(0.0003), tensor(0.0006), tensor(0.0002), tensor(0.0006), tensor(0.0003), tensor(0.0008), tensor(0.0005), tensor(0.0002), tensor(0.0002), tensor(0.0002), tensor(0.0007), tensor(0.0004), tensor(0.0002), tensor(0.0002), tensor(0.0006), tensor(0.0003), tensor(0.0004), tensor(0.0006), tensor(5.0235e-06), tensor(0.0004), tensor(0.0008), tensor(9.6460e-05), tensor(0.0004), tensor(0.0005), tensor(0.0002), tensor(0.0039), tensor(0.0003), tensor(0.0008), tensor(0.0002), tensor(0.0002), tensor(0.0026), tensor(0.0007), tensor(0.0002), tensor(0.0006), tensor(0.0002), tensor(5.7271e-05), tensor(5.8276e-05), tensor(0.0004), tensor(0.0002), tensor(0.0001), tensor(0.0003), tensor(0.0002), tensor(0.0005), tensor(0.0003), tensor(0.0025), tensor(0.0004), tensor(0.0017), ten

In [26]:
print("Mean metrics")
for key in metrics_dict.keys():
    print(key, np.mean(metrics_dict[key]))

Mean metrics
iou 0.0009743935
f1 0.0019387779
accuracy 0.91321206
recall 0.001938778


# UNET++

## Run example model on single image

In [27]:
model = smp.UnetPlusPlus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=23,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [28]:
baseline_model = BaselineModel()

In [29]:
torch.cuda.empty_cache()

In [30]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


In [31]:
print(output.shape)

torch.Size([1, 576, 864])


In [32]:
print(output_baseline.shape)

torch.Size([1, 576, 864])


In [33]:
to_pil_transform = transforms.ToPILImage()

In [34]:
img = to_pil_transform(images.squeeze())

In [35]:
# img.show()

In [36]:
msk = to_pil_transform(masks.squeeze())

In [37]:
# msk.show()

In [38]:
unique, counts = np.unique(output.to("cpu"), return_counts=True)
print(dict(zip(unique, counts)))

{0: 7613, 3: 46255, 4: 3, 5: 1, 6: 70, 10: 469, 14: 447, 16: 251, 18: 2977, 20: 276, 21: 439302}


In [39]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [40]:
plt.imsave("assets/aerial-drone-example-unet-plus-plus-output.jpeg", np.array(outp))

In [41]:
# outp.show()

In [42]:
evaluate_result(output, masks, mode="multiclass", num_classes=23)

{'iou': tensor(0.0052),
 'f1': tensor(0.0103),
 'accuracy': tensor(0.9139),
 'recall': tensor(0.0103)}

## DeepLabV3

## Run example model on single image

In [43]:
model = smp.DeepLabV3(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=23,
    activation="softmax",
).to(device)

In [44]:
baseline_model = BaselineModel()

In [45]:
torch.cuda.empty_cache()

In [46]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


In [47]:
print(output.shape)

torch.Size([1, 576, 864])


In [48]:
print(output_baseline.shape)

torch.Size([1, 576, 864])


In [49]:
to_pil_transform = transforms.ToPILImage()

In [50]:
img = to_pil_transform(images.squeeze())

In [51]:
# img.show()

In [52]:
msk = to_pil_transform(masks.squeeze())

In [53]:
# msk.show()

In [54]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [55]:
plt.imsave("assets/aerial-drone-example-deep-lab-v3-output.jpeg", np.array(outp))

In [56]:
# outp.show()

In [57]:
evaluate_result(output, masks, mode="multiclass", num_classes=23)

{'iou': tensor(0.0031),
 'f1': tensor(0.0062),
 'accuracy': tensor(0.9136),
 'recall': tensor(0.0062)}

## DeepLabV3+

## Run example model on single image

In [58]:
model = smp.DeepLabV3Plus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=23,
    activation="softmax",
).to(device)

In [59]:
baseline_model = BaselineModel()

In [60]:
torch.cuda.empty_cache()

In [61]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


In [62]:
print(output.shape)

torch.Size([1, 576, 864])


In [63]:
print(output_baseline.shape)

torch.Size([1, 576, 864])


In [64]:
to_pil_transform = transforms.ToPILImage()

In [65]:
img = to_pil_transform(images.squeeze())

In [66]:
# img.show()

In [67]:
msk = to_pil_transform(masks.squeeze())

In [68]:
# msk.show()

In [69]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [70]:
plt.imsave("assets/aerial-drone-example-deep-lab-v3-plus-output.jpeg", np.array(outp))

In [71]:
# outp.show()

In [72]:
evaluate_result(output, masks, mode="multiclass", num_classes=23)

{'iou': tensor(0.2109),
 'f1': tensor(0.3483),
 'accuracy': tensor(0.9433),
 'recall': tensor(0.3483)}