# Prepare Datasets and Create the Project Structure

This notebook preprocesses the dataset and creates the folder structure used to store the data. It must be run before the `Data_Augmentation_Using_Generative_Adversarial_Networks.ipynb` notebook, which should be available in the same folder.

### Prerequisites

Ensure that the Cityscapes dataset is downloaded and placed in the directory named **dataset**. The dataset can be downloaded from [here](https://www.cityscapes-dataset.com/). You need to register to access the dataset. The following datasets are required for data augmentation:
1. [gtFine_trainvaltest.zip](https://www.cityscapes-dataset.com/file-handling/?packageID=1) (241MB): Fine annotations for training and validation datasets (3475 annotated images) and dummy annotations (ignore regions) for the test set (1525 images).
2. [leftImg8bit_trainvaltest.zip](https://www.cityscapes-dataset.com/file-handling/?packageID=3) (11GB): Left 8-bit images - training, validation, and test datasets (5000 images).

### Check Python version

In [None]:
import platform
# Compare the major/minor components as integers. As strings, '10' sorts
# before '7', so a string-tuple comparison rejects Python 3.10 and newer.
python_major_minor = tuple(int(part) for part in platform.python_version_tuple()[:2])
assert (python_major_minor >= (3, 7)), "The notebooks are tested on Python 3.7 and higher. Please updated your Python to evaluate the code"

### Check Notebook server has access to all required resources

In [None]:
from pathlib import Path

# The dataset directory is expected directly under the current working directory.
dataset_folder = Path.cwd() / "dataset"

# Fail early, naming the missing folder and the directory it was looked up in.
dataset_folder_found = dataset_folder.exists()
if not dataset_folder_found:
    missing_folder_message = "Add `{}` folder in the current directory (`{}`)".format(dataset_folder.name, Path.cwd())
    raise FileNotFoundError(missing_folder_message)

In [None]:
# Archive names that must already be present inside `dataset_folder`.
expected_zipped_datasets = ["gtFine_trainvaltest.zip", "leftImg8bit_trainvaltest.zip"]
# Full paths of the archives, collected in the same order as the names above.
expected_zipped_datasets_path = []

for zipped_dataset in expected_zipped_datasets:
    zipped_dataset = dataset_folder / zipped_dataset
    expected_zipped_datasets_path.append(zipped_dataset)
    if not zipped_dataset.exists():
        # The archive is looked up inside `dataset_folder`, so the message must
        # point the user there rather than at the current working directory.
        raise FileNotFoundError("Download and place `{}` in the dataset directory (`{}`)".format(zipped_dataset.name, dataset_folder))

### Unzip Datasets

In [None]:
import zipfile as zf

# Each archive is extracted into a folder inside `dataset_folder` that is named
# after the archive with the `.zip` text removed.
unzipped_datasets_name = []
for archive_name in expected_zipped_datasets:
    unzipped_datasets_name.append(str(archive_name).replace(".zip", ""))
unzipped_datasets_path = [dataset_folder / folder_name for folder_name in unzipped_datasets_name]

archive_count = len(expected_zipped_datasets_path)
for archives_done, (archive_path, extraction_path) in enumerate(zip(expected_zipped_datasets_path, unzipped_datasets_path), start=1):
    with zf.ZipFile(archive_path, 'r') as archive:
        archive.extractall(extraction_path)

    # Report progress after every archive has been fully extracted.
    print(f"Unzipped {archives_done/archive_count * 100:.2f}% Dataset.")

print("Unzipped Datasets Successfully!")

### Evaluate Datasets

In [None]:
import os
import glob

# Evaluate Overall Dataset

# Build the recursive patterns with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
segmentation_map_expr_overall = os.path.join(str(unzipped_datasets_path[0]), "**", "*_color.png")
segmentation_map_paths_overall = glob.glob(segmentation_map_expr_overall, recursive=True)
segmentation_map_paths_overall = sorted(segmentation_map_paths_overall)

photo_expr_overall = os.path.join(str(unzipped_datasets_path[1]), "**", "*_leftImg8bit.png")
photo_paths_overall = glob.glob(photo_expr_overall, recursive=True)
photo_overall = sorted(photo_paths_overall)

# Report the photo count from the photo list (not the segmentation-map list).
print("\nOverall Dataset: \nFound {} images of segmentation maps.\nFound {} photos.".format(len(segmentation_map_paths_overall), len(photo_overall)))

# Both listings are sorted, so equal lengths are required for them to be paired up later.
assert (len(segmentation_map_paths_overall) == len(photo_overall)), ("Dataset Incorrect! Number of semantic segmentation maps do not match the number of photos!")

In [None]:
# Evaluate Training Dataset

# Build the recursive patterns with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
segmentation_map_expr_train = os.path.join(str(unzipped_datasets_path[0]), "**", "train", "**", "*_color.png")
segmentation_map_paths_train = glob.glob(segmentation_map_expr_train, recursive=True)
segmentation_map_paths_train = sorted(segmentation_map_paths_train)

photo_expr_train = os.path.join(str(unzipped_datasets_path[1]), "**", "train", "**", "*_leftImg8bit.png")
photo_paths_train = glob.glob(photo_expr_train, recursive=True)
photo_train = sorted(photo_paths_train)

print("\nTraining Dataset: \nFound {} images of segmentation maps.\nFound {} photos.".format(len(segmentation_map_paths_train), len(photo_train)))

# Both listings are sorted, so equal lengths are required for them to be paired up later.
assert (len(segmentation_map_paths_train) == len(photo_train)), ("Dataset Incorrect! Number of semantic segmentation maps do not match the number of photos!")

In [None]:
# Evaluate Validation Dataset

# Build the recursive patterns with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
segmentation_map_expr_val = os.path.join(str(unzipped_datasets_path[0]), "**", "val", "**", "*_color.png")
segmentation_map_paths_val = glob.glob(segmentation_map_expr_val, recursive=True)
segmentation_map_paths_val = sorted(segmentation_map_paths_val)

photo_expr_val = os.path.join(str(unzipped_datasets_path[1]), "**", "val", "**", "*_leftImg8bit.png")
photo_paths_val = glob.glob(photo_expr_val, recursive=True)
photo_val = sorted(photo_paths_val)

print("\nValidation Dataset: \nFound {} images of segmentation maps.\nFound {} photos.".format(len(segmentation_map_paths_val), len(photo_val)))

# Both listings are sorted, so equal lengths are required for them to be paired up later.
assert (len(segmentation_map_paths_val) == len(photo_val)), ("Dataset Incorrect! Number of semantic segmentation maps do not match the number of photos!")

In [None]:
# Evaluate Test Dataset

# Build the recursive patterns with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
segmentation_map_expr_test = os.path.join(str(unzipped_datasets_path[0]), "**", "test", "**", "*_color.png")
segmentation_map_paths_test = glob.glob(segmentation_map_expr_test, recursive=True)
segmentation_map_paths_test = sorted(segmentation_map_paths_test)

photo_expr_test = os.path.join(str(unzipped_datasets_path[1]), "**", "test", "**", "*_leftImg8bit.png")
photo_paths_test = glob.glob(photo_expr_test, recursive=True)
photo_test = sorted(photo_paths_test)

print("\nTest Dataset: \nFound {} images of segmentation maps.\nFound {} photos.".format(len(segmentation_map_paths_test), len(photo_test)))

# Both listings are sorted, so equal lengths are required for them to be paired up later.
assert (len(segmentation_map_paths_test) == len(photo_test)), ("Dataset Incorrect! Number of semantic segmentation maps do not match the number of photos!")

### Display Images from Dataset

In [None]:
import matplotlib.image as img
import matplotlib.pyplot as plt
%matplotlib inline

# Preview the last three segmentation-map/photo pairs of the sorted overall listings.
list_of_segmentation_maps = segmentation_map_paths_overall[-3:]
list_of_photos = photo_overall[-3:]

# squeeze=False keeps `axarr` two-dimensional even when only one pair is
# available, so the [row, column] indexing below never hits a 1-D array.
f, axarr = plt.subplots(len(list_of_segmentation_maps), 2, figsize=(30,20), squeeze=False)

# One row per pair: segmentation map in the left column, photo in the right column.
for iterator, (segmentation_map_path, photo_path) in enumerate(zip(list_of_segmentation_maps, list_of_photos)):
    segmentation_map_data = img.imread(segmentation_map_path)
    photo_data = img.imread(photo_path)
    axarr[iterator, 0].imshow(segmentation_map_data)
    axarr[iterator, 1].imshow(photo_data)

### Create Project Structure

In [None]:
# Create one output folder per dataset split inside `dataset_folder`.
# exist_ok=True lets this cell be re-run without failing on folders that a
# previous run already created.
training_dataset = Path.joinpath(dataset_folder, "training_dataset")
os.makedirs(training_dataset, exist_ok=True)

# NOTE(review): "validatation" is misspelled, but the variable is used by later
# cells and the folder name may be expected by the follow-up notebook, so
# both are kept unchanged here - confirm before renaming.
validatation_dataset = Path.joinpath(dataset_folder, "validatation_dataset")
os.makedirs(validatation_dataset, exist_ok=True)

test_dataset = Path.joinpath(dataset_folder, "test_dataset")
os.makedirs(test_dataset, exist_ok=True)

print("Project Structure Created Successfully!")

### Process Cityscapes Dataset

In [None]:
# Function to evaluate if the images are matching pair
def evaluate_matching_pair(segmentation_map_path, photo_path):
    """Assert that a segmentation map and a photo share the same identifier.

    The identifier of each file is its base name with the type-specific
    marker removed (`_gtFine_color` for segmentation maps, `_leftImg8bit`
    for photos).

    Raises:
        AssertionError: if the two identifiers differ.
    """
    segmentation_map_name = os.path.basename(segmentation_map_path)
    photo_name = os.path.basename(photo_path)

    # Strip the markers so that only the shared part of the names is compared.
    segmentation_map_identifier = segmentation_map_name.replace("_gtFine_color", "")
    photo_identifier = photo_name.replace("_leftImg8bit", "")

    identifiers_match = segmentation_map_identifier == photo_identifier
    assert identifiers_match, ("Invalid Image Pair! {} and {} are not same!".format(segmentation_map_identifier, photo_identifier))

In [None]:
from PIL import Image

# Function to load resized images
def load_resized_images(image_path, size=(256, 256)):
    """Load an image, convert it to RGB and resize it.

    Args:
        image_path: Path of the image file to open.
        size: Target `(width, height)` passed to `resize`; defaults to the
            256x256 size used throughout this notebook.

    Returns:
        The converted and resized image.
    """
    # The context manager closes the source file as soon as the converted
    # copy has been produced instead of leaving that to garbage collection.
    with Image.open(image_path) as source_image:
        return source_image.convert('RGB').resize(size)

In [None]:
# Process Training Dataset

training_pair_count = len(segmentation_map_paths_train)

for pair_index, (map_file, photo_file) in enumerate(zip(segmentation_map_paths_train, photo_train)):

    # Stop on the first pair whose file names do not identify the same image.
    evaluate_matching_pair(map_file, photo_file)
    resized_map = load_resized_images(map_file)
    resized_photo = load_resized_images(photo_file)

    # 512x256 canvas: photo in the left half, segmentation map in the right half.
    combined_image = Image.new('RGB', (512, 256))
    combined_image.paste(resized_photo, (0, 0))
    combined_image.paste(resized_map, (256, 0))

    # Output files are named after the position of the pair in the sorted listing.
    combined_image.save(training_dataset / f"{pair_index}.jpg", format='JPEG', subsampling=0, quality=100)

    # Print progress for every tenth pair only.
    if pair_index % 10 == 0:
        progress_percent = (pair_index + 1) / training_pair_count * 100
        print(f"Processed {progress_percent:.2f}% Training Dataset.")

print("Training Data Processed Successfully!")

In [None]:
# Process Validation Dataset

validation_pair_count = len(segmentation_map_paths_val)

for pair_index, (map_file, photo_file) in enumerate(zip(segmentation_map_paths_val, photo_val)):

    # Stop on the first pair whose file names do not identify the same image.
    evaluate_matching_pair(map_file, photo_file)
    resized_map = load_resized_images(map_file)
    resized_photo = load_resized_images(photo_file)

    # 512x256 canvas: photo in the left half, segmentation map in the right half.
    combined_image = Image.new('RGB', (512, 256))
    combined_image.paste(resized_photo, (0, 0))
    combined_image.paste(resized_map, (256, 0))

    # Output files are named after the position of the pair in the sorted listing.
    combined_image.save(validatation_dataset / f"{pair_index}.jpg", format='JPEG', subsampling=0, quality=100)

    # Print progress for every tenth pair only.
    if pair_index % 10 == 0:
        progress_percent = (pair_index + 1) / validation_pair_count * 100
        print(f"Processed {progress_percent:.2f}% Validation Dataset.")

print("Validation Data Processed Successfully!")

In [None]:
# Process Test Dataset

test_pair_count = len(segmentation_map_paths_test)

for pair_index, (map_file, photo_file) in enumerate(zip(segmentation_map_paths_test, photo_test)):

    # Stop on the first pair whose file names do not identify the same image.
    evaluate_matching_pair(map_file, photo_file)
    resized_map = load_resized_images(map_file)
    resized_photo = load_resized_images(photo_file)

    # 512x256 canvas: photo in the left half, segmentation map in the right half.
    combined_image = Image.new('RGB', (512, 256))
    combined_image.paste(resized_photo, (0, 0))
    combined_image.paste(resized_map, (256, 0))

    # Output files are named after the position of the pair in the sorted listing.
    combined_image.save(test_dataset / f"{pair_index}.jpg", format='JPEG', subsampling=0, quality=100)

    # Print progress for every tenth pair only.
    if pair_index % 10 == 0:
        progress_percent = (pair_index + 1) / test_pair_count * 100
        print(f"Processed {progress_percent:.2f}% Test Dataset.")

print("Test Data Processed Successfully!")

### Display Processed Dataset

In [None]:
# Display Training Dataset Images

# Build the recursive pattern with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
training_dataset_expr = os.path.join(str(training_dataset), "**", "*.jpg")
training_dataset_paths = glob.glob(training_dataset_expr, recursive=True)
training_dataset_paths = sorted(training_dataset_paths)

# Show at most the first three images of the (string-sorted) listing.
list_of_training_data = training_dataset_paths[:3]

# squeeze=False keeps `axarr` two-dimensional even when a single image is
# found, so the [row, column] indexing below works for any positive count.
f, axarr = plt.subplots(len(list_of_training_data), 1, figsize=(15,20), squeeze=False)

for iterator, training_data_path in enumerate(list_of_training_data):
    training_data = img.imread(training_data_path)
    axarr[iterator, 0].imshow(training_data)

In [None]:
# Display Validation Dataset Images

# Build the recursive pattern with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
validation_dataset_expr = os.path.join(str(validatation_dataset), "**", "*.jpg")
validation_dataset_paths = glob.glob(validation_dataset_expr, recursive=True)
validation_dataset_paths = sorted(validation_dataset_paths)

# Show at most the first three images of the (string-sorted) listing.
list_of_validation_data = validation_dataset_paths[:3]

# squeeze=False keeps `axarr` two-dimensional even when a single image is
# found, so the [row, column] indexing below works for any positive count.
f, axarr = plt.subplots(len(list_of_validation_data), 1, figsize=(15,20), squeeze=False)

for iterator, validation_data_path in enumerate(list_of_validation_data):
    validation_data = img.imread(validation_data_path)
    axarr[iterator, 0].imshow(validation_data)

In [None]:
# Display Test Dataset Images

# Build the recursive pattern with os.path.join so the platform's own path
# separator is used; a hard-coded backslash only matches files on Windows.
test_dataset_expr = os.path.join(str(test_dataset), "**", "*.jpg")
test_dataset_paths = glob.glob(test_dataset_expr, recursive=True)
test_dataset_paths = sorted(test_dataset_paths)

# Show at most the first three images of the (string-sorted) listing.
list_of_test_data = test_dataset_paths[:3]

# squeeze=False keeps `axarr` two-dimensional even when a single image is
# found, so the [row, column] indexing below works for any positive count.
f, axarr = plt.subplots(len(list_of_test_data), 1, figsize=(15,20), squeeze=False)

for iterator, test_data_path in enumerate(list_of_test_data):
    test_data = img.imread(test_data_path)
    axarr[iterator, 0].imshow(test_data)

**Important:**

According to the official documentation of the dataset, the semantic segmentation maps for the test dataset contain only dummy annotations and can be ignored.

#### Wrapping Up

**Congratulations!!**

You have successfully prepared the dataset required and created the project structure to run this project. You can now run the project by running the `Data_Augmentation_Using_Generative_Adversarial_Networks.ipynb` notebook that should be available in the same folder.