In [None]:
import cv2
import matplotlib.pyplot as plt
from patchify import patchify
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import json
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import joblib

OpenCV loads images in BGR format, while Matplotlib expects RGB format when displaying images with `plt.imshow()`, so each image is converted from BGR to RGB before it is displayed.

In [None]:
# Load the first image of Tile 1 and reorder its channels from BGR (OpenCV) to RGB (Matplotlib).
image = cv2.cvtColor(
    cv2.imread(r"..\data\Semantic segmentation dataset\Tile 1\images\image_part_001.jpg"),
    cv2.COLOR_BGR2RGB,
)
print(image.shape)

# Same treatment for the matching mask.
mask = cv2.cvtColor(
    cv2.imread(r"..\data\Semantic segmentation dataset\Tile 1\masks\image_part_001.png"),
    cv2.COLOR_BGR2RGB,
)
print(mask.shape)

# Show image (left) and mask (right) side by side, without axes.
for position, picture in enumerate((image, mask), start=1):
    plt.subplot(1, 2, position)
    plt.imshow(picture)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Side length (in pixels) of the square patches that every image and mask is cut into.
IMAGE_PATCH_SIZE = 256

The next cell crops the image with NumPy slicing so that its height and width are exact multiples of `IMAGE_PATCH_SIZE`.

Another way to crop the image using the `PIL` package:
```py
image = Image.fromarray(image)
image = image.crop((0, 0, size_x, size_y))
image = np.array(image)
```

In [None]:
image = cv2.imread(r"..\data\Semantic segmentation dataset\Tile 1\images\image_part_001.jpg")
print(image.shape)

# Largest height (size_y) and width (size_x) that are whole multiples of IMAGE_PATCH_SIZE.
size_y, size_x = (dim - dim % IMAGE_PATCH_SIZE for dim in image.shape[:2])

# Drop the bottom/right remainder so the image splits into complete patches only.
image = image[:size_y, :size_x]
print(image.shape)

# step == patch size -> the patches do not overlap.
patch_shape = (IMAGE_PATCH_SIZE, IMAGE_PATCH_SIZE, 3)
patched_images = patchify(image, patch_shape, step=IMAGE_PATCH_SIZE)
print(patched_images.shape)

In [None]:
# Walk the patch grid row by row; each patch is stored at [row, col, 0].
grid_rows, grid_cols = patched_images.shape[:2]
for flat_idx in range(grid_rows * grid_cols):
    row, col = divmod(flat_idx, grid_cols)
    print(patched_images[row, col, 0].shape)

# Number of patches once the grid is flattened into a single leading axis.
flat_patches = patched_images.reshape(-1, IMAGE_PATCH_SIZE, IMAGE_PATCH_SIZE, 3)
len(flat_patches)

In [None]:
# Flatten the patch grid and collect the individual patches in a plain Python list.
l = list(patched_images.reshape(-1, IMAGE_PATCH_SIZE, IMAGE_PATCH_SIZE, 3))
len(l)

In [None]:
# Scaler with default settings; the dataset-building loop re-fits it on every image patch.
minMaxScaler = MinMaxScaler()

In [None]:
image_dataset = []
mask_dataset = []

# Images and masks are read in two separate passes that visit the tiles and parts in the same
# order, so image_dataset[k] and mask_dataset[k] end up describing the same patch.
for image_type, extension in [('images', 'jpg'), ('masks', 'png')]:
    for tile_idx in range(8):
        for image_idx in range(9):
            image_path = fr"..\data\Semantic segmentation dataset\Tile {tile_idx+1}\{image_type}\image_part_00{image_idx+1}.{extension}"
            image = cv2.imread(image_path)
            # cv2.imread does not raise on a missing/unreadable file, it returns None.
            # Fail right here with the offending path instead of an opaque cv2.cvtColor error.
            if image is None:
                raise FileNotFoundError(f"Could not read image file: {image_path}")
            image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

            # Crop to the largest size that is a whole multiple of the patch size.
            size_x = (image.shape[1] // IMAGE_PATCH_SIZE) * IMAGE_PATCH_SIZE
            size_y = (image.shape[0] // IMAGE_PATCH_SIZE) * IMAGE_PATCH_SIZE
            image = image[0:size_y, 0:size_x]

            # step == patch size -> non-overlapping patches.
            patched_images = patchify(image, (IMAGE_PATCH_SIZE, IMAGE_PATCH_SIZE, 3), step=IMAGE_PATCH_SIZE)
            for i in range(patched_images.shape[0]):
                for j in range(patched_images.shape[1]):
                    individual_patched_image = patched_images[i, j, 0]
                    if image_type == 'images':
                        # The scaler works on 2D input: flatten to (pixels, channels), scale each
                        # channel of this patch on its own, then restore the patch shape.
                        flat_pixels = individual_patched_image.reshape(-1, individual_patched_image.shape[-1])
                        individual_patched_image = minMaxScaler.fit_transform(flat_pixels).reshape(individual_patched_image.shape)
                        image_dataset.append(individual_patched_image)
                    elif image_type == 'masks':
                        # Masks keep their raw RGB values; they are mapped to class labels later.
                        mask_dataset.append(individual_patched_image)

In [None]:
# Number of image patches, then number of mask patches (the two are expected to match).
print(len(image_dataset), len(mask_dataset), sep='\n')

In [None]:
# Turn both lists of patches into NumPy arrays with a leading sample axis.
image_dataset, mask_dataset = (np.array(patches) for patches in (image_dataset, mask_dataset))

Let's display the first patch of the first image of Tile 1 (the image shown above), together with its mask.

In [None]:
# First image patch (left) next to its RGB mask patch (right), without axes.
for position, patch in enumerate((image_dataset[0], mask_dataset[0]), start=1):
    plt.subplot(1, 2, position)
    plt.imshow(patch)
    plt.axis('off')

plt.tight_layout()
plt.show()

In [None]:
# Read the class definitions shipped with the dataset and parse them from JSON.
with open(r"..\data\Semantic segmentation dataset\classes.json") as f_in:
    classes_file = json.loads(f_in.read())

print(type(classes_file))
classes_file

| class | actual colors on mask images | colors in classes.json file (contains mistakes) |
| ----- | ----- | ----- |
| Building: | #3C1098 | #D0021B |
| Land (unpaved area): | #8429F6 | #F5A623 |
| Road: | #6EC1E4 | #DE597F |
| Vegetation: | #FEDD3A | #417505 |
| Water: | #E2A929 | #50E3C2 |
| Unlabeled: | #9B9B9B | #9B9B9B |

In [None]:
# Colors actually used in the mask images, listed in the order of the entries in
# classes_file['classes']. Class names are taken from the table above.
# NOTE(review): assumes classes.json lists the classes in exactly this order — verify.
corrected_colors = [
    '#E2A929',  # Water
    '#8429F6',  # Land (unpaved area)
    '#6EC1E4',  # Road
    '#3C1098',  # Building
    '#FEDD3A',  # Vegetation
    '#9B9B9B',  # Unlabeled
]
for class_idx, color in enumerate(corrected_colors):
    classes_file['classes'][class_idx]['color'] = color

In [None]:
# Write the corrected class definitions back to classes.json, replacing the original file.
with open(r"..\data\Semantic segmentation dataset\classes.json", 'w') as f_out:
    # indent=4 is optional and only pretty-prints the JSON.
    f_out.write(json.dumps(classes_file, indent=4))

In [None]:
# Mapping label -> RGB color as an array of three ints, e.g. 0 is the label of the 'Water' class.
hex_colors = [entry['color'].lstrip('#') for entry in classes_file['classes']]
classes = {
    # Parse the hex digit pairs at offsets 0, 2 and 4 as the R, G and B components.
    label: np.array([int(hex_color[pos:pos + 2], 16) for pos in (0, 2, 4)])
    for label, hex_color in enumerate(hex_colors)
}

classes

Mask images (labels) are RGB images, where each class is represented by a unique color (e.g., Building = #3C1098 → dark purple). However, neural networks don't understand colors; they require integer class IDs, such as Water = 0. So, we need to convert RGB masks to label IDs.

In [None]:
def rgb_to_label(mask, class_colors=None, default_label=0):
    """Convert an RGB mask into a 2D array of integer class labels.

    Parameters
    ----------
    mask : np.ndarray
        Color mask of shape (H, W, 3).
    class_colors : dict, optional
        Mapping ``label -> RGB color`` (a length-3 sequence or array). Defaults to the
        notebook-level ``classes`` dictionary.
    default_label : int, optional
        Label given to pixels whose color matches none of the classes. The default of 0
        keeps the historical behavior, where such pixels silently become label 0.

    Returns
    -------
    np.ndarray
        Array of shape (H, W) and dtype uint8 holding one label per pixel.
    """
    if class_colors is None:
        class_colors = classes

    # Allocate the 2D label map directly; there is no need for a 3-channel buffer
    # that is sliced down to one channel afterwards.
    label_segment = np.full(mask.shape[:2], default_label, dtype=np.uint8)
    for label, rgb in class_colors.items():
        # A pixel belongs to a class only if all three channels equal the class color.
        label_segment[np.all(mask == rgb, axis=-1)] = label
    return label_segment

In [None]:
# Convert every RGB mask patch into a 2D label map and stack the results into one array.
labels = np.array([rgb_to_label(mask_patch) for mask_patch in mask_dataset])

print(labels.shape) # (n_samples, H, W)
labels[0]

In [None]:
# First image patch (left) next to its integer label map (right), without axes.
for position, patch in enumerate((image_dataset[0], labels[0]), start=1):
    plt.subplot(1, 2, position)
    plt.imshow(patch)
    plt.axis('off')

plt.tight_layout()
plt.show()

### Prepare for training

In [None]:
# Append a trailing channel axis of size 1: (n_samples, H, W) -> (n_samples, H, W, 1).
labels = labels[..., np.newaxis]
print(labels.shape)
labels[0]

The shape is now (n_samples, H, W, 1): a dummy channel axis has been added because Keras expects channels-last input.

For **Focal** and **Dice** losses, we must use one-hot encoded labels. To use `sparse_categorical_crossentropy` instead, skip the cell below and keep the integer labels.

In [None]:
# Take the class count from the class definitions instead of from the labels that happen to
# occur in the data: if a class never appears, len(np.unique(labels)) is too small and the
# largest label no longer fits into the one-hot axis.
num_classes = len(classes)
labels_categorical_dataset = to_categorical(labels, num_classes=num_classes)
labels_categorical_dataset.shape
# Now shape is (n_samples, H, W, num_classes) → one-hot encoded masks. Required for softmax + Focal, Dice, and categorical_crossentropy losses.

In [None]:
# Inspect the one-hot encoded labels of the first patch.
labels_categorical_dataset[0]

In [None]:
# Hold out 15% of the patches for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    image_dataset,
    labels_categorical_dataset,
    test_size=0.15,
    random_state=42,
)

In [None]:
# Shapes of the four splits, one per line: X_train, X_test, y_train, y_test.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep='\n')

In [None]:
# Bundle the four splits into one tuple and store them in a single file (compression level 9).
dataset_splits = (X_train, X_test, y_train, y_test)
joblib.dump(dataset_splits, r'..\data\dataset.joblib', compress=9)