In [None]:
! pip install gdown

In [None]:
! pip install matplotlib

In [None]:
#! pip install -U fvcore

In [None]:
! find . -name "*.DS_Store" -type f -delete

## 0 - Imports

In [1]:
import os
import gdown
import zipfile
import time
import numpy as np

from PIL import Image

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torchvision.transforms as transforms
import torchvision.models as models
import torch.optim as optim

#from fvcore.nn import FlopCountAnalysis, flop_count_table

## 1 - CityScapes Dataset

In [None]:
"""
# Dataset's URL
url = 'https://drive.google.com/uc?id=1Qb4UrNsjvlU-wEsR9d7rckB0YS_LXgb2'

# Temporary directory
temp_dir = 'temp'

if not os.path.exists(temp_dir):
    os.makedirs(temp_dir)
    
# Datasets directory
extract_dir = 'data'

if not os.path.exists(extract_dir):
    os.makedirs(extract_dir)

zip_dir = temp_dir + '/Cityscape.zip'

# Download the dataset
gdown.download(url, zip_dir, quiet=False)

with zipfile.ZipFile(zip_dir, 'r') as zip_file:
  zip_file.extractall(extract_dir)

# Remove the temporary directory
try:
    os.rmdir(temp_dir)
    print(f"Temporary directory '{temp_dir}' removed successfully.")
except FileNotFoundError:
    print(f"Temporary directory '{temp_dir}' does not exist.")
except OSError as e:
    print(f"Error: {e}")
"""

In [2]:
"""
Use this only if gdown doesn't work
"""
# Unzip file

# Path to the zip file
zip_file = "Cityscapes.zip"

# Directory where you want to extract the contents
extract_dir = 'data'

# Create the extract directory if it doesn't exist
os.makedirs(extract_dir, exist_ok=True)

# Extract the contents of the zip file
with zipfile.ZipFile(zip_file, 'r') as zip_ref:
    zip_ref.extractall(extract_dir)

print("Extraction complete.")

Extraction complete.


In [3]:
class CityScapes(Dataset):
    def __init__(self, root_dir, split='train', image_transform=None, label_transform=None):
        super(CityScapes, self).__init__()
        """
        Args:
            root_dir (string): Directory with all the images and annotations.
            split (string): 'train' or 'val'.
            transform (callable, optional): Optional transform to be applied on a sample.
        """
        
        self.root_dir = root_dir
        self.split = split
        self.image_transform = image_transform
        self.label_transform = label_transform

        # Get the image and label directories
        self.image_dir = os.path.join(root_dir, 'images', split)
        self.label_dir = os.path.join(root_dir, 'gtFine', split)

        # Get a list of all image files
        self.image_files = []
        for city_dir in os.listdir(self.image_dir):
            city_image_dir = os.path.join(self.image_dir, city_dir)
            self.image_files.extend([os.path.join(city_image_dir, f) for f in os.listdir(city_image_dir) if f.endswith('.png')])

    def __len__(self):
        return len(self.image_files)
        
    def __getitem__(self, idx):
        img_name = self.image_files[idx]

        # Get the corresponding label image path
        label_name = img_name.replace('images', 'gtFine').replace('_leftImg8bit', '_gtFine_labelTrainIds')
        #color_name = img_name.replace('images', 'gtFine').replace('_leftImg8bit', '_gtFine_color')

        # Load image and label
        image = Image.open(img_name).convert('RGB')
        label = Image.open(label_name)
        #color = Image.open(color_name).convert('RGB')

        if self.image_transform:
            image = self.image_transform(image)
        if self.label_transform:
            label = self.label_transform(label)
            #color = self.label_transform(color)
        
        return image, label

In [4]:
# Define transforms for preprocessing
image_transform = transforms.Compose([
        transforms.Resize((256,512)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

label_transform = transforms.Compose([
        transforms.Resize((256,512)),
        transforms.ToTensor(),
    ])

root_dir = extract_dir + '/Cityscapes/Cityspaces'

# Create training and validation datasets
train_dataset = CityScapes(root_dir=root_dir, split='train', image_transform=image_transform, label_transform=label_transform)
val_dataset = CityScapes(root_dir=root_dir, split='val', image_transform=image_transform, label_transform=label_transform)

# Create data loaders
batch_size = 8
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def denormalize(image):
    image = image.to('cpu').numpy().transpose((1, 2, 0))
    mean = np.array([0.485, 0.456, 0.406])
    std = np.array([0.229, 0.224, 0.225])
    image = image * std + mean
    image = np.clip(image, 0, 1)
    return image

figure = plt.figure(figsize=(12, 8))
for i, (inputs, classes) in enumerate(val_loader):
    if i > 0:
        break
    figure.add_subplot(1,2,1)
    plt.imshow(denormalize(inputs[0].squeeze()), cmap="gray")
    figure.add_subplot(1,2,2)
    #plt.imshow(np.clip(classes[0].numpy().transpose((1, 2, 0)), 0, 1), cmap="gray")
    plt.imshow(classes[0].squeeze(), cmap="gray")

## 2 - Model DeepLabV2

In [7]:
affine_par = True


class Bottleneck(nn.Module):
    expansion = 4

    def __init__(self, inplanes, planes, stride=1, dilation=1, downsample=None):
        super(Bottleneck, self).__init__()
        # change
        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, stride=stride, bias=False)
        self.bn1 = nn.BatchNorm2d(planes, affine=affine_par)
        for i in self.bn1.parameters():
            i.requires_grad = False
        padding = dilation
        # change
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1,
                               padding=padding, bias=False, dilation=dilation)
        self.bn2 = nn.BatchNorm2d(planes, affine=affine_par)
        for i in self.bn2.parameters():
            i.requires_grad = False
        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
        self.bn3 = nn.BatchNorm2d(planes * 4, affine=affine_par)
        for i in self.bn3.parameters():
            i.requires_grad = False
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        residual = x
        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)
        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)
        out = self.conv3(out)
        out = self.bn3(out)
        if self.downsample is not None:
            residual = self.downsample(x)
        out += residual
        out = self.relu(out)

        return out


class ClassifierModule(nn.Module):
    def __init__(self, inplanes, dilation_series, padding_series, num_classes):
        super(ClassifierModule, self).__init__()
        self.conv2d_list = nn.ModuleList()
        for dilation, padding in zip(dilation_series, padding_series):
            self.conv2d_list.append(
                nn.Conv2d(inplanes, num_classes, kernel_size=3, stride=1, padding=padding,
                          dilation=dilation, bias=True))

        for m in self.conv2d_list:
            m.weight.data.normal_(0, 0.01)

    def forward(self, x):
        out = self.conv2d_list[0](x)
        for i in range(len(self.conv2d_list) - 1):
            out += self.conv2d_list[i + 1](x)
        return out


class ResNetMulti(nn.Module):
    def __init__(self, block, layers, num_classes):
        self.inplanes = 64
        super(ResNetMulti, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = nn.BatchNorm2d(64, affine=affine_par)
        for i in self.bn1.parameters():
            i.requires_grad = False
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, ceil_mode=True)  # change
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=1, dilation=2)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=1, dilation=4)
        self.layer6 = ClassifierModule(2048, [6, 12, 18, 24], [6, 12, 18, 24], num_classes)
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                m.weight.data.normal_(0, 0.01)
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()

    def _make_layer(self, block, planes, blocks, stride=1, dilation=1):
        downsample = None
        if (stride != 1
                or self.inplanes != planes * block.expansion
                or dilation == 2
                or dilation == 4):
            downsample = nn.Sequential(
                nn.Conv2d(self.inplanes, planes * block.expansion,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(planes * block.expansion, affine=affine_par))
        for i in downsample._modules['1'].parameters():
            i.requires_grad = False
        layers = []
        layers.append(
            block(self.inplanes, planes, stride, dilation=dilation, downsample=downsample))
        self.inplanes = planes * block.expansion
        for i in range(1, blocks):
            layers.append(block(self.inplanes, planes, dilation=dilation))

        return nn.Sequential(*layers)

    def forward(self, x):
        _, _, H, W = x.size()

        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer6(x)

        x = torch.nn.functional.interpolate(x, size=(H, W), mode='bilinear')

        if self.training == True:
            return x, None, None

        return x

    def get_1x_lr_params_no_scale(self):
        """
        This generator returns all the parameters of the net except for
        the last classification layer. Note that for each batchnorm layer,
        requires_grad is set to False in deeplab_resnet.py, therefore this function does not return
        any batchnorm parameter
        """
        b = []

        b.append(self.conv1)
        b.append(self.bn1)
        b.append(self.layer1)
        b.append(self.layer2)
        b.append(self.layer3)
        b.append(self.layer4)

        for i in range(len(b)):
            for j in b[i].modules():
                jj = 0
                for k in j.parameters():
                    jj += 1
                    if k.requires_grad:
                        yield k

    def get_10x_lr_params(self):
        """
        This generator returns all the parameters for the last layer of the net,
        which does the classification of pixel into classes
        """
        b = []
        if self.multi_level:
            b.append(self.layer5.parameters())
        b.append(self.layer6.parameters())

        for j in range(len(b)):
            for i in b[j]:
                yield i

    def optim_parameters(self, lr):
        return [{'params': self.get_1x_lr_params_no_scale(), 'lr': lr},
                {'params': self.get_10x_lr_params(), 'lr': 10 * lr}]


def get_deeplab_v2(num_classes=19, pretrain=True, pretrain_model_path='DeepLab_resnet_pretrained_imagenet.pth'):
    model = ResNetMulti(Bottleneck, [3, 4, 23, 3], num_classes)

    # Pretraining loading
    if pretrain:
        print('Deeplab pretraining loading...')
        saved_state_dict = torch.load(pretrain_model_path)

        new_params = model.state_dict().copy()
        for i in saved_state_dict:
            i_parts = i.split('.')
            new_params['.'.join(i_parts[1:])] = saved_state_dict[i]
        model.load_state_dict(new_params, strict=False)

    return model

# Define the model (DeepLabV2 with ResNet-101 backbone)
model = get_deeplab_v2(num_classes=19, pretrain=True, pretrain_model_path='deeplab_resnet_pretrained_imagenet.pth')

Deeplab pretraining loading...


## 3 - Training and Evaluation

In [None]:
"""
def evaluate_model(model, loader):
    # FLOPs and parameters
    
    # -----------------------------
    # Initizialize your model here
    # -----------------------------
    
    height = 
    width = 
    image = torch.zeros((3, height, width))
    
    flops = FlopCountAnalysis(model, image)
    num_params = flop_count_table(flops)
    

mIoU, flops, num_params = evaluate_model(model, val_loader)
"""

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [9]:
# Define optimizer and loss function
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9, weight_decay=5e-4)
criterion = torch.nn.CrossEntropyLoss()

# Lists to store latency and FPS values
latency = []
FPS = []

# Training loop
num_epochs = 2
for epoch in range(num_epochs):
    model.train()
    for batch_idx, (images, labels) in enumerate(train_loader):

        
        images, labels = images.to(device), labels.squeeze(1).to(device)
        print(images.size())
        print(f"IMAGES:\n{images}")
        print()
        print(labels.size())
        print(f"LABELS:\n{labels.squeeze(1)}")
        
        # Record start time
        start = time.time()
        
        optimizer.zero_grad()
        outputs = model(images)
        
        end = time.time() # Record end time

        # Calculate latency for this iteration
        latency_i = end - start
        latency.append(latency_i)
        
        # Calculate FPS for this iteration
        FPS_i = 1 / latency_i
        FPS.append(FPS_i)
    
        
        print(f"OUTPUTS:\n{outputs}")
        print()
        print(outputs[0].size())
        print(f"OUTPUTS[0]:\n{outputs[0]}")
        
        pred_labels = torch.argmax(torch.softmax(outputs[0], axis=1),axis=1)
        
        print(pred_labels.size())
        print()
        print(f"PREDS:\n{pred_labels}")
        print(f"PREDS:\n{pred_labels.float()}")
        
        pred_labels = pred_labels.float()
        
        # From paper: 
        # Our loss function is the sum of cross-entropy terms for each spatial position in the CNN output map 
        #loss = [criterion(pred_labels[i], labels[i]) for i in range(batch_size)].sum()
        loss = criterion(pred_labels, labels)
        
        loss.backward()
        optimizer.step()
        print(f'Epoch [{epoch+1}/{num_epochs}], Step [{batch_idx+1}/{len(train_loader)}], Loss: {loss.item()}')

torch.Size([8, 3, 256, 512])
IMAGES:
tensor([[[[-1.1247, -1.1247, -1.0904,  ..., -1.3302, -1.3815, -1.3987],
          [-1.0904, -1.0904, -1.1075,  ..., -1.3302, -1.3815, -1.3644],
          [-1.0733, -1.0733, -1.0562,  ..., -1.3130, -1.3987, -1.3302],
          ...,
          [-1.2617, -1.2617, -1.2617,  ..., -0.8507, -0.9020, -0.9363],
          [-1.2617, -1.2617, -1.2617,  ..., -0.8849, -0.8507, -0.8849],
          [-1.2445, -1.2617, -1.2445,  ..., -0.9020, -0.8849, -0.8849]],

         [[-0.7927, -0.8102, -0.7577,  ..., -1.0903, -1.1429, -1.1253],
          [-0.7402, -0.7402, -0.7577,  ..., -1.0903, -1.1604, -1.0903],
          [-0.7577, -0.7577, -0.7402,  ..., -1.1078, -1.1429, -1.0553],
          ...,
          [-0.9328, -0.9328, -0.9153,  ..., -0.4776, -0.5301, -0.5476],
          [-0.9328, -0.9328, -0.9328,  ..., -0.5126, -0.4951, -0.5301],
          [-0.9153, -0.9328, -0.9153,  ..., -0.5476, -0.5301, -0.5126]],

         [[-0.8284, -0.8458, -0.8284,  ..., -1.0724, -1.1073, -1.

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [None]:
print(pred_labels.size())
print(pred_labels[0].size())
print(pred_labels[0][0].size())
print(pred_labels[0][0][0].size())

print(type(pred_labels))
print(type(pred_labels[0]))
print(type(pred_labels[0][0]))
print(type(pred_labels[0][0][0]))
pred_labels[0][0][0].max()

In [None]:
# Evaluation

# Calculate mean and standard deviation of latency (in milliseconds)
meanLatency = np.mean(latency) * 1000
# stdLatency = np.std(latency) * 1000

# Calculate mean and standard deviation of FPS
# meanFPS = np.mean(FPS)
# stdFPS = np.std(FPS)

model.eval()
mIoU, flops, num_params = evaluate_model(model, val_loader)
print(f'mIoU: {mIoU}, Latency: {meanLatency}, FLOPs: {flops}, Number of Parameters: {num_params}')

In [None]:
print(outputs[0].size())
print(outputs[0][0].size())
print(outputs[0][0][0].size())
print(outputs[0][0][0][0].size())

print(type(outputs))
print(type(outputs[0]))
print(type(outputs[0][0]))
print(type(outputs[0][0][0]))
print(type(outputs[0][0][0][0]))
outputs[0][0][0][0].max()