# Download Dataset


In [1]:
# Download the KITTI dataset
!wget https://s3.eu-central-1.amazonaws.com/avg-kitti/data_semantics.zip

--2025-03-10 00:04:40--  https://s3.eu-central-1.amazonaws.com/avg-kitti/data_semantics.zip
Resolving s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)... 52.219.169.169, 3.5.134.27, 3.5.135.122, ...
Connecting to s3.eu-central-1.amazonaws.com (s3.eu-central-1.amazonaws.com)|52.219.169.169|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 327699796 (313M) [application/zip]
Saving to: ‘data_semantics.zip’


2025-03-10 00:04:54 (23.8 MB/s) - ‘data_semantics.zip’ saved [327699796/327699796]



# Import Libraries

In [2]:
!unzip data_semantics.zip

Archive:  data_semantics.zip
   creating: testing/
   creating: testing/image_2/
  inflating: testing/image_2/000000_10.png  
  inflating: testing/image_2/000001_10.png  
  inflating: testing/image_2/000002_10.png  
  inflating: testing/image_2/000003_10.png  
  inflating: testing/image_2/000004_10.png  
  inflating: testing/image_2/000005_10.png  
  inflating: testing/image_2/000006_10.png  
  inflating: testing/image_2/000007_10.png  
  inflating: testing/image_2/000008_10.png  
  inflating: testing/image_2/000009_10.png  
  inflating: testing/image_2/000010_10.png  
  inflating: testing/image_2/000011_10.png  
  inflating: testing/image_2/000012_10.png  
  inflating: testing/image_2/000013_10.png  
  inflating: testing/image_2/000014_10.png  
  inflating: testing/image_2/000015_10.png  
  inflating: testing/image_2/000016_10.png  
  inflating: testing/image_2/000017_10.png  
  inflating: testing/image_2/000018_10.png  
  inflating: testing/image_2/000019_10.png  
  inflating: testin

In [3]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import transforms
import torchvision.models.segmentation as segmentation
import matplotlib.pyplot as plt
from PIL import Image
import os
import cv2
import numpy as np
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm
import logging

# Preprocess masks

In [4]:
# Define a dictionary mapping RGB colors to class labels
# The key is the (R, G, B) tuple, and the value is the corresponding class label
color_map = {
    (128, 64, 128): 1,    # Road
    (35,142,107): 2,    # Sidewalk
    (70, 70, 70): 3,      # Building
    (60,20,220): 4,   # Wall
    (153, 153, 153): 5,   # Fence
    (153, 153, 190): 6,   # Vegetation
    (0,220,220): 7,     # Terrain
    (142,0,0): 8,       # Sky
    (100, 100, 150): 9,       # Person
    (152, 251, 152): 10,      # Car
    (180, 130, 70): 11,    # Bicycle
    (232, 35, 244): 12,    # Motorcycle
    (0,0,0): 0,   # Background (can be 0)
}


In [5]:
# Path to the folder with RGB masks
mask_folder = '/content/training/semantic_rgb'
output_mask_folder = '/content/preprocessed_masks'
os.makedirs(output_mask_folder)
n = 0
total = len(os.listdir(mask_folder))
# Loop through all mask images
for mask_filename in os.listdir(mask_folder):
    if mask_filename.endswith(".png"):
        mask_path = os.path.join(mask_folder, mask_filename)
        mask_image = cv2.imread(mask_path)

        # Create empty integer mask
        height, width, _ = mask_image.shape
        integer_mask = np.zeros((height, width), dtype=np.uint8)

        # Convert RGB to class labels
        for i in range(height):
            for j in range(width):
                rgb_value = tuple(mask_image[i, j])
                if rgb_value in color_map:
                    integer_mask[i, j] = color_map[rgb_value]
                else:
                    integer_mask[i, j] = 0  # Background

        # Save the integer mask
        output_mask_path = os.path.join(output_mask_folder, mask_filename)
        cv2.imwrite(output_mask_path, integer_mask)
        n+=1
        print(str(n) +'/'+ str(total))

1/200
2/200
3/200
4/200
5/200
6/200
7/200
8/200
9/200
10/200
11/200
12/200
13/200
14/200
15/200
16/200
17/200
18/200
19/200
20/200
21/200
22/200
23/200
24/200
25/200
26/200
27/200
28/200
29/200
30/200
31/200
32/200
33/200
34/200
35/200
36/200
37/200
38/200
39/200
40/200
41/200
42/200
43/200
44/200
45/200
46/200
47/200
48/200
49/200
50/200
51/200
52/200
53/200
54/200
55/200
56/200
57/200
58/200
59/200
60/200
61/200
62/200
63/200
64/200
65/200
66/200
67/200
68/200
69/200
70/200
71/200
72/200
73/200
74/200
75/200
76/200
77/200
78/200
79/200
80/200
81/200
82/200
83/200
84/200
85/200
86/200
87/200
88/200
89/200
90/200
91/200
92/200
93/200
94/200
95/200
96/200
97/200
98/200
99/200
100/200
101/200
102/200
103/200
104/200
105/200
106/200
107/200
108/200
109/200
110/200
111/200
112/200
113/200
114/200
115/200
116/200
117/200
118/200
119/200
120/200
121/200
122/200
123/200
124/200
125/200
126/200
127/200
128/200
129/200
130/200
131/200
132/200
133/200
134/200
135/200
136/200
137/200
138/200
139/

# KITTI dataset class

In [7]:
class KITTIdataset(Dataset):
    def __init__(self,image_dir, mask_dir, transform=None, mask_transform=None, image_size=(256, 256)):
        self.image_dir = image_dir
        self.mask_dir = mask_dir
        self.transform = transform
        self.mask_transform = mask_transform
        self.image_size = image_size

        self.image_names = sorted([f for f in os.listdir(image_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])
        self.mask_names = sorted([f for f in os.listdir(mask_dir) if f.endswith(('.png', '.jpg', '.jpeg'))])

        assert len(self.image_names) == len(self.mask_names), "Mismatch between image and mask count!"

    def __len__(self):
        return len(self.image_names)

    def __getitem__(self, idx):
        img_path = os.path.join(self.image_dir, self.image_names[idx])
        mask_path = os.path.join(self.mask_dir, self.mask_names[idx])

        image = Image.open(img_path)
        mask = Image.open(mask_path)

        if self.image_size:
            image = image.resize(self.image_size)
            mask = mask.resize(self.image_size)

        if self.transform:
            image = self.transform(image)
        if self.mask_transform:
            mask = self.mask_transform(mask)

        return image, mask

# Model class


In [8]:
class DeepLabV3Plus(nn.Module):
    def __init__(self, num_classes = 13):
        super(DeepLabV3Plus, self).__init__()
        self.model = segmentation.deeplabv3_resnet101(pretrained=True)
        self.model.classifier[4] = nn.Conv2d(256, num_classes, kernel_size=(1, 1), stride=(1, 1))

    def forward(self, x):
        return self.model(x)['out']

# Define instances and variables

In [11]:
image_dir = '/content/training/image_2'
mask_dir = '/content/preprocessed_masks'
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
EPOCHS = 50
BATCH_SIZE = 8
NUM_CLASSES = 13
LEARNING_RATE = 1e-3
IMAGE_SIZE = (256, 256)
CHECKPOINT_DIR = "./experiments/"

transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((256, 256)),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

mask_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Resize((256, 256))
])

dataset = KITTIdataset(image_dir=image_dir, mask_dir=mask_dir,
                       transform=transform, mask_transform=mask_transform)

num_img = len(dataset)
train_size = int(0.1*num_img)
val_size = int(0.1*num_img)
test_size = num_img - train_size - val_size

train_dataset, val_dataset, test_dataset = random_split(dataset, [train_size, val_size, test_size])

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

model = DeepLabV3Plus(NUM_CLASSES).to(DEVICE)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)

Downloading: "https://download.pytorch.org/models/deeplabv3_resnet101_coco-586e9e4e.pth" to /root/.cache/torch/hub/checkpoints/deeplabv3_resnet101_coco-586e9e4e.pth
100%|██████████| 233M/233M [00:02<00:00, 113MB/s]


In [14]:
import time
from tqdm import tqdm  # Progress bar

num_epochs = 10  # Adjust number of epochs
print_freq = 10  # Print loss every 10 batches

for epoch in range(num_epochs):
    model.train()  # Set the model to training mode
    running_loss = 0.0
    start_time = time.time()

    # tqdm progress bar for better visibility
    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{num_epochs}")

    for batch_idx, (images, masks) in progress_bar:
        images, masks = images.to(DEVICE), masks.to(DEVICE)

        optimizer.zero_grad()  # Zero the gradients
        outputs = model(images)['out']  # Forward pass
        loss = criterion(outputs, masks)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Optimize the weights

        running_loss += loss.item()

        # Print progress every `print_freq` batches
        if (batch_idx + 1) % print_freq == 0 or batch_idx == len(train_loader) - 1:
            avg_loss = running_loss / (batch_idx + 1)
            progress_bar.set_postfix(loss=f"{avg_loss:.4f}")

    # Calculate and print epoch summary
    epoch_loss = running_loss / len(train_loader)
    epoch_time = time.time() - start_time
    print(f"Epoch [{epoch+1}/{num_epochs}] - Loss: {epoch_loss:.4f} - Time: {epoch_time:.2f}s")

    # Save model at the end
torch.save(model.state_dict(), f"deeplabv3_epoch{epoch+1}.pth")

Epoch 1/10:   0%|          | 0/1 [00:43<?, ?it/s]


KeyboardInterrupt: 