In [1]:
import numpy as np
import matplotlib.pyplot as plt
import torch
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.transforms import Compose, Resize
import segmentation_models_pytorch as smp

from src.models.BaselineModel import BaselineModel
from src.evaluation.evaluate_result import evaluate_result
from src.datasets.DubaiSemanticSegmentationDataset import (
    DubaiSemanticSegmentationDataset,
)

from src.datasets.utils.ResizeToDivisibleBy32 import ResizeToDivisibleBy32

  from .autonotebook import tqdm as notebook_tqdm


## Prepare environment

In [2]:
torch.cuda.is_available()

True

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [4]:
VAL_SIZE = 0.2
BATCH_SIZE = 1
SEED = 42
DUBAI_DATASET_PATH = "data/DubaiSemanticSegmentationDataset"
IMAGE_SIZE = 576

In [5]:
train_dataset = DubaiSemanticSegmentationDataset(
    DUBAI_DATASET_PATH,
    transforms=[Compose([Resize(IMAGE_SIZE), ResizeToDivisibleBy32()])],
)
print(len(train_dataset))

72


## Sanity check data

In [6]:
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

In [7]:
for images, masks in train_loader:
    print(images.shape)
    print(masks.shape)
    break

torch.Size([1, 3, 576, 608])
torch.Size([1, 1, 576, 608])


In [8]:
to_pil_transform = transforms.ToPILImage()

In [9]:
img = to_pil_transform(images.squeeze())

In [10]:
# img.show()

In [11]:
msk = to_pil_transform(masks.squeeze())

In [12]:
# msk.show()

# UNET

## Run example model on single image

In [13]:
model = smp.Unet(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=6,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [14]:
baseline_model = BaselineModel(classes=6).to(device)

In [15]:
torch.cuda.empty_cache()

In [16]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        # (batch_size, channels, height, width)
        # (B, C, H, W)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 608])
torch.Size([1, 1, 576, 608])


  return self._call_impl(*args, **kwargs)


In [17]:
print(masks.max())

tensor(5, dtype=torch.uint8)


In [18]:
print(masks.min())

tensor(0, dtype=torch.uint8)


In [19]:
print(output.shape)

torch.Size([1, 576, 608])


In [20]:
print(output_baseline.shape)

torch.Size([1, 576, 608])


In [21]:
unique, counts = np.unique(output.cpu(), return_counts=True)
print(dict(zip(unique, counts)))

{0: 329, 1: 19714, 2: 1381, 3: 139, 4: 328433, 5: 212}


In [22]:
unique, counts = np.unique(masks.cpu(), return_counts=True)
print(dict(zip(unique, counts)))

{0: 12172, 1: 220557, 2: 72658, 3: 26781, 4: 15547, 5: 2493}


In [23]:
# FIXME: wykonanie tego bez błędu == tensory mają takie same wymiary
evaluate_result(output, masks, mode="multiclass", num_classes=6)

{'iou': tensor(0.0395),
 'f1': tensor(0.0760),
 'accuracy': tensor(0.6920),
 'recall': tensor(0.0760)}

In [24]:
evaluate_result(output_baseline, masks, mode="multiclass", num_classes=6)

{'iou': tensor(0.0909),
 'f1': tensor(0.1667),
 'accuracy': tensor(0.7222),
 'recall': tensor(0.1667)}

## Run example model on dataset

In [25]:
model.eval()
metrics_dict = {
    "iou": [],
    "f1": [],
    "accuracy": [],
    "recall": [],
}

In [26]:
with torch.no_grad():
    for images, masks in train_loader:
        output = model(images.to(device))
        output = torch.argmax(output, dim=1)

        iter_metrics = evaluate_result(output, masks, mode="multiclass", num_classes=6)
        for key in metrics_dict.keys():
            metrics_dict[key].append(iter_metrics[key])

In [27]:
print(metrics_dict)

{'iou': [tensor(0.0114), tensor(0.0618), tensor(0.0171), tensor(0.0496), tensor(0.1154), tensor(0.0141), tensor(0.1329), tensor(0.3690), tensor(0.7344), tensor(0.0732), tensor(0.2918), tensor(0.5210), tensor(0.0146), tensor(0.1446), tensor(0.0695), tensor(0.3571), tensor(0.1984), tensor(0.0307), tensor(0.1094), tensor(0.2462), tensor(0.2313), tensor(0.0449), tensor(0.0699), tensor(0.0097), tensor(0.3149), tensor(0.0784), tensor(0.1874), tensor(0.1622), tensor(0.0245), tensor(0.0357), tensor(0.1011), tensor(0.0828), tensor(0.0063), tensor(0.0360), tensor(0.0860), tensor(0.1214), tensor(0.0326), tensor(0.0735), tensor(0.0126), tensor(0.1483), tensor(0.0915), tensor(0.3903), tensor(0.0985), tensor(0.0136), tensor(0.0193), tensor(0.0362), tensor(0.0175), tensor(0.0152), tensor(0.3412), tensor(0.0298), tensor(0.6657), tensor(0.0089), tensor(0.2835), tensor(0.0226), tensor(0.0442), tensor(0.4874), tensor(0.0096), tensor(0.0437), tensor(0.0395), tensor(0.0331), tensor(0.1646), tensor(0.0127),

# UNET++

## Run example model on single image

In [28]:
model = smp.UnetPlusPlus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=6,  # model output channels (number of classes in your dataset)
    activation="softmax",
).to(device)

In [29]:
baseline_model = BaselineModel()

In [30]:
torch.cuda.empty_cache()

In [31]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 864])
torch.Size([1, 1, 576, 864])


In [32]:
print(output.shape)

torch.Size([1, 576, 864])


In [33]:
print(output_baseline.shape)

torch.Size([1, 576, 864])


In [34]:
to_pil_transform = transforms.ToPILImage()

In [35]:
img = to_pil_transform(images.squeeze())

In [36]:
# img.show()

In [37]:
msk = to_pil_transform(masks.squeeze())

In [38]:
# msk.show()

In [39]:
unique, counts = np.unique(output.to("cpu"), return_counts=True)
print(dict(zip(unique, counts)))

{0: 17603, 1: 375312, 2: 458, 3: 14150, 4: 88587, 5: 1554}


In [40]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [41]:
plt.imsave("assets/dubai-example-unet-plus-plus-output.jpeg", np.array(outp))

In [42]:
# outp.show()

In [43]:
evaluate_result(output, masks, mode="multiclass", num_classes=6)

{'iou': tensor(0.2857),
 'f1': tensor(0.4445),
 'accuracy': tensor(0.8148),
 'recall': tensor(0.4445)}

## DeepLabV3

## Run example model on single image

In [44]:
model = smp.DeepLabV3(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=6,
    activation="softmax",
).to(device)

In [45]:
baseline_model = BaselineModel()

In [46]:
torch.cuda.empty_cache()

In [47]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 768])
torch.Size([1, 1, 576, 768])


In [48]:
print(output.shape)

torch.Size([1, 576, 768])


In [49]:
print(output_baseline.shape)

torch.Size([1, 576, 768])


In [50]:
to_pil_transform = transforms.ToPILImage()

In [51]:
img = to_pil_transform(images.squeeze())

In [52]:
# img.show()

In [53]:
msk = to_pil_transform(masks.squeeze())

In [54]:
# msk.show()

In [55]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [56]:
plt.imsave("assets/dubai-example-deep-lab-v3-output.jpeg", np.array(outp))

In [57]:
# outp.show()

In [58]:
evaluate_result(output, masks, mode="multiclass", num_classes=6)

{'iou': tensor(0.2249),
 'f1': tensor(0.3672),
 'accuracy': tensor(0.7891),
 'recall': tensor(0.3672)}

## DeepLabV3+

## Run example model on single image

In [59]:
model = smp.DeepLabV3Plus(
    encoder_name="resnet18",  # choose encoder, e.g. mobilenet_v2 or efficientnet-b7
    encoder_weights="imagenet",  # use `imagenet` pre-trained weights for encoder initialization
    in_channels=3,  # model input channels (1 for gray-scale images, 3 for RGB, etc.)
    classes=6,
    activation="softmax",
).to(device)

In [60]:
baseline_model = BaselineModel()

In [61]:
torch.cuda.empty_cache()

In [62]:
model.eval()
with torch.no_grad():
    for images, masks in train_loader:
        print(images.shape)
        print(masks.shape)
        break
    output = model(images.to(device))
    output = torch.argmax(output, dim=1)

    output_baseline = baseline_model(images.to(device))
    output_baseline = torch.argmax(output_baseline, dim=1)

torch.Size([1, 3, 576, 736])
torch.Size([1, 1, 576, 736])


In [63]:
print(output.shape)

torch.Size([1, 576, 736])


In [64]:
print(output_baseline.shape)

torch.Size([1, 576, 736])


In [65]:
to_pil_transform = transforms.ToPILImage()

In [66]:
img = to_pil_transform(images.squeeze())

In [67]:
# img.show()

In [68]:
msk = to_pil_transform(masks.squeeze())

In [69]:
# msk.show()

In [70]:
outp = to_pil_transform(output.int().squeeze().cpu())

In [71]:
plt.imsave("assets/dubai-example-deep-lab-v3-plus-output.jpeg", np.array(outp))

In [72]:
# outp.show()

In [73]:
evaluate_result(output, masks, mode="multiclass", num_classes=6)

{'iou': tensor(0.0016),
 'f1': tensor(0.0031),
 'accuracy': tensor(0.6677),
 'recall': tensor(0.0031)}