# **Deep Learning Project - Pothole detection** <br/>
# **Preprocess**
**Data Science and Advanced Analytics with a specialization in Data Science**<br/>
**NOVA IMS**<br/>
Authors of this notebook: <br/>
* Mafalda Paço (20220619@novaims.unl.pt)<br/>
* Mª Margarida Graça (20220602@novaims.unl.pt)<br/>
* Marta Dinis (20220611@novaims.unl.pt)<br/>
* Nuno Dias (20220603@novaims.unl.pt)<br/>
* Patrícia Morais (20220638@novaims.unl.pt)<br/>


## Ready to use Dataset
https://drive.google.com/file/d/1KE507iE7Hwb7TiJINnvMYCXNIGrEgPvt/view?usp=share_link

## **Summary**
In this notebook, we performed the necessary preprocessing on our dataset.
We deleted the duplicates found in the Data Exploration notebook and converted all the images to the same format and color mode. Lastly, we applied data augmentation to the training images and saved all images (original and augmented) in a new folder, DATA, which is also packaged as DATA.zip.

## **Data Import**

Necessary library imports.

In [None]:
import os
import shutil

import time
import random
import zipfile
from PIL import Image

import numpy as np

import tensorflow as tf
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras import layers

In [None]:
# Set the machine
gdrive = True
# Set the connection string
path = "/content/drive/MyDrive/DL/Project/"
main_folder, training_folder, testing_folder = "DATA_original/", "train/", "test/"
# If using Google Drive
if gdrive:
    # Setup drive
    from google.colab import drive
    drive.mount('/content/drive')
    # Transfer zip dataset to the current virtual machine
    t0 = time.time()
    shutil.copyfile(path + 'DATA_original.zip', 'DATA_original.zip')
    # Extract files into the current working directory; the context manager
    # closes the archive even if the extraction raises
    with zipfile.ZipFile('DATA_original.zip') as zip_:
        zip_.extractall()
    print("File transfer completed in %0.3f seconds" % (time.time() - t0))
    # The data now lives next to the notebook, so relative paths are enough
    path = ""

Mounted at /content/drive
File transfer completed in 7.903 seconds


After doing the Data Exploration, we noticed that our dataset contains some duplicates. The function below removes them, creating a folder without duplicates in our environment.

In [None]:
def get_all_image_paths(folder_path):
    """Return the image paths found exactly one directory level below *folder_path*.

    Only the immediate sub-directories of *folder_path* are inspected: entries
    sitting directly in *folder_path* and entries nested deeper are ignored.
    A name counts as an image when it ends, case-insensitively, with
    '.png', '.jpg', '.jpeg' or '.gif'.

    Args:
        folder_path: Directory whose sub-directories hold the images.

    Returns:
        A list of paths built as ``folder_path/<sub-directory>/<file name>``,
        in the order reported by ``os.listdir``.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    collected = []
    for entry in os.listdir(folder_path):
        child_dir = os.path.join(folder_path, entry)
        # Skip anything that is not a sub-directory
        if not os.path.isdir(child_dir):
            continue
        collected.extend(
            os.path.join(child_dir, name)
            for name in os.listdir(child_dir)
            if name.lower().endswith(image_extensions)
        )
    return collected

def remove_duplicates_and_save(input_folder, output_folder):
    """Copy one file per distinct image from *input_folder* into *output_folder*.

    Images are listed with ``get_all_image_paths`` and compared by their
    decoded content: two files are duplicates when they have the same mode,
    the same size and the same raw pixel bytes. The first file seen for a
    given content is copied; later ones are skipped. The name of the directory
    that contains each image is recreated under *output_folder*.

    Duplicates are tracked per call, so files are only compared with other
    files under the same *input_folder*.

    Args:
        input_folder: Directory scanned for images.
        output_folder: Directory that receives the de-duplicated copies;
            sub-directories are created on demand.
    """
    # Local import so the function does not depend on the notebook's import cell
    import hashlib

    seen_digests = set()

    for image_path in get_all_image_paths(input_folder):
        with Image.open(image_path) as img:
            # A SHA-256 digest avoids relying on the 64-bit, per-process salted
            # builtin hash(); mode and size are included so that equal byte
            # strings describing differently shaped images are not merged.
            digest = hashlib.sha256(f"{img.mode}:{img.size}".encode())
            digest.update(img.tobytes())
        image_key = digest.hexdigest()

        if image_key in seen_digests:
            continue
        seen_digests.add(image_key)

        # Create subfolder if it doesn't exist (basename/dirname works with
        # any path separator, unlike splitting on '/')
        subfolder = os.path.basename(os.path.dirname(image_path))
        new_subfolder_path = os.path.join(output_folder, subfolder)
        os.makedirs(new_subfolder_path, exist_ok=True)

        # Copy the image to the new folder
        new_image_path = os.path.join(new_subfolder_path, os.path.basename(image_path))
        shutil.copy(image_path, new_image_path)

# Input: the extracted DATA_original folder under /content/.
# Output: a DATA_no_DUPLICATES_1 folder inside the Google Drive project folder.
path = "/content/drive/MyDrive/DL/Project/"
input_base_folder = os.path.join('/content/', "DATA_original")
output_base_folder = os.path.join(path, "DATA_no_DUPLICATES_1")

# Each split is processed by its own call, so duplicates are only detected
# within "train" and within "test", not across the two.
for folder in ("train", "test"):
    input_folder = os.path.join(input_base_folder, folder)
    output_folder = os.path.join(output_base_folder, folder)
    remove_duplicates_and_save(input_folder=input_folder, output_folder=output_folder)


Now that we have removed the duplicates, we convert all images to the '.jpeg' format and the 'RGB' color mode so we can use them in our model without errors, creating a new folder with the images in the right format.

In [None]:
def get_all_image_paths(folder_path):
    """Return the paths of all image files found anywhere under *folder_path*.

    The directory tree is traversed with ``os.walk``. A file counts as an
    image when its name ends, case-insensitively, with '.png', '.jpg',
    '.jpeg' or '.gif'.

    Args:
        folder_path: Root directory of the search.

    Returns:
        A list of matching file paths, in ``os.walk`` traversal order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    matches = []
    for current_dir, _subdirs, names in os.walk(folder_path):
        matches.extend(
            os.path.join(current_dir, name)
            for name in names
            if name.lower().endswith(image_extensions)
        )
    return matches

def convert_image_to_jpeg_rgb(image_path, output_path):
    """Save the image at *image_path* as an RGB JPEG at *output_path*.

    Images whose mode is not 'RGB' are converted to 'RGB' before saving.

    Args:
        image_path: Path of the image to read.
        output_path: Path of the JPEG file to write.
    """
    # The context manager releases the source file handle as soon as the
    # converted copy has been written, instead of leaving it to the garbage
    # collector while the caller loops over the whole dataset.
    with Image.open(image_path) as img:
        rgb_img = img if img.mode == 'RGB' else img.convert('RGB')
        rgb_img.save(output_path, format='JPEG')

# Source (de-duplicated images) and destination (uniformly formatted images)
input_path = "/content/drive/MyDrive/DL/Project/DATA_no_DUPLICATES_1"
output_path = "/content/drive/MyDrive/DL/Project/DATA_FORMATTED_2"

# Make sure the destination root exists
os.makedirs(output_path, exist_ok=True)

# Every image under the source folder, at any depth
image_paths = get_all_image_paths(input_path)

# Mirror the source tree in the destination, replacing each extension by '.jpeg'.
# NOTE(review): source files that differ only by extension map to the same
# '.jpeg' target, so the one converted last overwrites the other.
for image_path in image_paths:
    relative_path = os.path.relpath(image_path, input_path)
    relative_stem = os.path.splitext(relative_path)[0]
    new_output_path = os.path.join(output_path, relative_stem) + '.jpeg'
    os.makedirs(os.path.dirname(new_output_path), exist_ok=True)
    convert_image_to_jpeg_rgb(image_path, new_output_path)



Since we have a small dataset, we apply data augmentation to all training images, which also helps to improve performance and reduce overfitting. All images are then put in a new '.zip' file called DATA, ready to use.

In [None]:
import os
import random

def get_all_image_paths(folder_path):
    """Recursively list the image files stored under *folder_path*.

    Files are matched on their name only: it must end, ignoring case, with
    '.png', '.jpg', '.jpeg' or '.gif'.

    Args:
        folder_path: Root directory walked with ``os.walk``.

    Returns:
        A list with the path of every matching file, in traversal order.
    """
    image_extensions = ('.png', '.jpg', '.jpeg', '.gif')
    return [
        os.path.join(current_dir, name)
        for current_dir, _subdirs, names in os.walk(folder_path)
        for name in names
        if name.lower().endswith(image_extensions)
    ]

def apply_augmentation(image_path, output_path, augmentation_pipeline):
    """Write an augmented JPEG copy of the image at *image_path*.

    The image is loaded with ``load_img``, converted to an array, passed
    through *augmentation_pipeline* as a batch of one, and saved to
    *output_path* in JPEG format.

    Args:
        image_path: Path of the image to augment.
        output_path: Path of the JPEG file to write.
        augmentation_pipeline: Callable applied to the batched image array;
            its result must support ``tf.squeeze(..., 0)``.
    """
    image = load_img(image_path)
    image_array = img_to_array(image)
    # The pipeline is called on a batch, so add a leading batch axis
    image_array = tf.expand_dims(image_array, 0)

    # NOTE(review): Keras random preprocessing layers only transform their
    # input in training mode - confirm this call augments on the TF version
    # in use (otherwise call the pipeline with training=True).
    augmented_image = augmentation_pipeline(image_array)
    augmented_image = tf.squeeze(augmented_image, 0).numpy()

    # Values outside [0, 255] are not representable in uint8 and would be
    # corrupted by the cast, so clamp them first.
    augmented_image = np.clip(augmented_image, 0, 255).astype("uint8")

    img = Image.fromarray(augmented_image)
    img.save(output_path, format='JPEG')

# Source (formatted images) and destination (final dataset) roots
input_path = "/content/drive/MyDrive/DL/Project/DATA_FORMATTED_2"
output_folder = "/content/drive/MyDrive/DL/Project/DATA"

# Create one output folder per split
train_output_folder = os.path.join(output_folder, "train")
test_output_folder = os.path.join(output_folder, "test")
for split_output_folder in (train_output_folder, test_output_folder):
    os.makedirs(split_output_folder, exist_ok=True)

# Collect the image paths of each split
train_image_paths = get_all_image_paths(os.path.join(input_path, "train"))
test_image_paths = get_all_image_paths(os.path.join(input_path, "test"))

# Shuffle both lists in place (this only changes the processing order)
random.shuffle(train_image_paths)
random.shuffle(test_image_paths)

# Random transformations used to augment the training set; Sequential applies
# them in the order listed.
augmentation_layers = [
    # Flip with the layer's default mode
    layers.RandomFlip(),
    # Rotation; the factor is a fraction of a full turn
    layers.RandomRotation(factor=0.2),
    # Zoom of up to 10% along each axis
    layers.RandomZoom(height_factor=0.1, width_factor=0.1),
    # Contrast and brightness jitter
    layers.RandomContrast(factor=0.25),
    layers.RandomBrightness(factor=0.2),
    # Shift of up to 10% of the image size in either direction
    layers.RandomTranslation(height_factor=(-0.1, 0.1), width_factor=(-0.1, 0.1)),
]
augmentation_pipeline = tf.keras.Sequential(augmentation_layers, name="my_augmentation_pipeline")

# Each training image is written twice under the train output folder: an
# unchanged copy and an augmented version whose name ends in '_augmented.jpeg'.
train_input_folder = os.path.join(input_path, "train")
for image_path in train_image_paths:
    # Keep the layout the image had below the train input folder
    relative_path = os.path.relpath(image_path, train_input_folder)
    new_output_path = os.path.join(train_output_folder, relative_path)
    new_augmented_path = os.path.splitext(new_output_path)[0] + '_augmented.jpeg'

    os.makedirs(os.path.dirname(new_output_path), exist_ok=True)

    # Original first, then its augmented counterpart
    shutil.copyfile(image_path, new_output_path)
    apply_augmentation(image_path, new_augmented_path, augmentation_pipeline)

# Test images are copied as they are: no augmentation is applied to them.
test_input_folder = os.path.join(input_path, "test")
for image_path in test_image_paths:
    # Keep the layout the image had below the test input folder
    relative_path = os.path.relpath(image_path, test_input_folder)
    new_output_path = os.path.join(test_output_folder, relative_path)

    os.makedirs(os.path.dirname(new_output_path), exist_ok=True)
    shutil.copyfile(image_path, new_output_path)

# Pack everything under the output folder into DATA.zip; inside the archive
# every entry is placed under a top-level "DATA" directory.
with zipfile.ZipFile('/content/drive/MyDrive/DL/Project/DATA.zip', 'w', zipfile.ZIP_DEFLATED) as zf:
    for folder_name, _, filenames in os.walk(output_folder):
        for filename in filenames:
            file_path = os.path.join(folder_name, filename)
            archive_name = os.path.join("DATA", os.path.relpath(file_path, output_folder))
            zf.write(file_path, arcname=archive_name)