# Data Cleaning

This notebook aims to clean and classify the image dataset for preparation to train the learning model.

### - Imports

In [1]:
from pathlib import Path
import cv2
import matplotlib.pyplot as plt

### - Loading

In [2]:
dataset = Path("data/India/")
num_datapoints = dataset.glob(pattern="*.jpg")
print(f"Number of datapoints in dataset = {len(list(num_datapoints))}")

Number of datapoints in dataset = 7062


### - Experimentation with existing YOLO annotations

In [3]:
def mark_and_show_yolo_annots(filename: str) -> None:
    """
    Reads in an image from the YOLO dataset, creating and displaying bounding boxes on it.
    
    The bounding box information is taken from the accompanying YOLO annotation file.
    """
    img = cv2.imread(f"./data/India/{filename}.jpg")
    dh, dw, _ = img.shape   

    file_data = ""

    with open(f"./data/India/YOLO_Darknet/{filename}.txt", 'r') as f:
        file_data = f.readlines()

    for data in file_data:
        _, x, y, width, height = map(float, data.split(' '))

        l, r = int((x - width / 2) * dw),   int((x + width / 2) * dw)
        t, b = int((y - height /  2) * dh), int((y + height / 2) * dh)

        l = max(0, l)
        r = min(r, dw - 1)
        t = max(0, t)
        b = min(b, dh - 1)

        cv2.rectangle(img, (l, t), (r, b), (255, 0, 0), 1)

    plt.imshow(img)
    plt.show()

In [None]:
for i in range(1, 25, 5):
    mark_and_show_yolo_annots(f"SS21_13 {i:04}")

We can observe from these, that the purpose of these annotations was to identify and mark out humans in the field of vision of the vehicle. While this has merit, and a wide range of applications can be based on these, it is not of particular interest to our study of obstacles and environment analysis as a whole on the road, but just a sub section of such.

### - VGG network for classifying images

In [4]:
# Let's import tensorflow to use the pre-existing VGG 19 model

import tensorflow as tf
import shutil

2022-10-03 17:44:52.086769: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [5]:
def create_model_from_vgg() -> tf.keras.models.Model:
    # Define the layers that we specifically want.
    # Content layer where we obtain our feature maps
    content_layers = ["block5_conv2"]

    # The styling layers we want
    style_layers = [
        "block1_conv1",
        "block2_conv1",
        "block3_conv1",
        "block4_conv1",
        "block5_conv1",
    ]

    # Store the model and make it's layers untrainable
    vgg = tf.keras.applications.vgg19.VGG19(include_top=True, weights="imagenet")
    vgg.trainable = False

    # Obtain style and content output layers and then merge them
    style_layer_outputs = [vgg.get_layer(name).output for name in style_layers]
    content_layer_outputs = [vgg.get_layer(name).output for name in content_layers]
    all_model_outputs = style_layer_outputs + content_layer_outputs

    return vgg

In [6]:
transfer_model = create_model_from_vgg()

transfer_model.summary()

2022-10-03 17:45:05.372077: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE3 SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-10-03 17:45:05.630582: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 411041792 exceeds 10% of free system memory.
2022-10-03 17:45:05.822643: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 411041792 exceeds 10% of free system memory.
2022-10-03 17:45:05.916538: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 411041792 exceeds 10% of free system memory.


Model: "vgg19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 224, 224, 3)]     0         
                                                                 
 block1_conv1 (Conv2D)       (None, 224, 224, 64)      1792      
                                                                 
 block1_conv2 (Conv2D)       (None, 224, 224, 64)      36928     
                                                                 
 block1_pool (MaxPooling2D)  (None, 112, 112, 64)      0         
                                                                 


2022-10-03 17:45:07.907057: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 411041792 exceeds 10% of free system memory.


 block2_conv1 (Conv2D)       (None, 112, 112, 128)     73856     
                                                                 
 block2_conv2 (Conv2D)       (None, 112, 112, 128)     147584    
                                                                 
 block2_pool (MaxPooling2D)  (None, 56, 56, 128)       0         
                                                                 
 block3_conv1 (Conv2D)       (None, 56, 56, 256)       295168    
                                                                 
 block3_conv2 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv3 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_conv4 (Conv2D)       (None, 56, 56, 256)       590080    
                                                                 
 block3_pool (MaxPooling2D)  (None, 28, 28, 256)       0         
          

In [7]:
# Here we try to create a dataset for training our network above.

data_dir = Path("./data/India/")

# We get a generator to generate our images
images = data_dir.glob("SS21_13 0[0-9][0-9][0-9].jpg")
out_dir = Path("./data/vgg_dataset/")

# Commented out so we don't accidentally run the operation again.
# for img in images:
#     shutil.copy(src=img, dst=out_dir)

In [10]:
# Now we can load the images previously, and process to a suitable dataset

batch_size = 32
height, width = 224, 224

# Use TF to generate the dataset from the output directory.
# The images are resized to 64*64 as that is the input of the model we created for training.

train_ds = tf.keras.utils.image_dataset_from_directory(
    out_dir,
    seed=420,
    image_size=(height, width),
    batch_size=batch_size,
    labels=None,
    crop_to_aspect_ratio=True,
)

Found 999 files belonging to 1 classes.


In [11]:
# This is consistent since we did not feed any classes into the dataset (determining the classes is the aim)
print(f"Classes: {train_ds.class_names}")

Classes: ['']


In [12]:
# We can see that we have batches of 32 images, each of which has been resized to 64x64 dims

for image_batch in train_ds:
    _, h, w, channles = image_batch.get_shape()
    print(f"Height={h}, Width={w}, Color channels={channles}")
    break

Height=224, Width=224, Color channels=3


In [13]:
# Now, before training and prediction, we just need to scale the RGB channel values of our images accordingly

norm_layer = tf.keras.layers.Rescaling(1./255)

norm_ds = train_ds.map(lambda x: (norm_layer(x)))

#### - Model Fitting and Predictions

In [15]:
y_pred = transfer_model.predict(x=norm_ds, batch_size=32, use_multiprocessing=True)



In [19]:
labels = tf.keras.applications.vgg19.decode_predictions(y_pred, top=3)

print(labels[0:10])

[[('n03788365', 'mosquito_net', 0.09929915), ('n04209239', 'shower_curtain', 0.03335749), ('n03291819', 'envelope', 0.029101176)], [('n03788365', 'mosquito_net', 0.14031307), ('n04209239', 'shower_curtain', 0.044426326), ('n03291819', 'envelope', 0.029852847)], [('n03788365', 'mosquito_net', 0.11054545), ('n04209239', 'shower_curtain', 0.03444298), ('n03291819', 'envelope', 0.029767605)], [('n03788365', 'mosquito_net', 0.07276636), ('n03291819', 'envelope', 0.034083467), ('n04209239', 'shower_curtain', 0.03138446)], [('n03788365', 'mosquito_net', 0.107587725), ('n04209239', 'shower_curtain', 0.03427578), ('n03291819', 'envelope', 0.024215296)], [('n03788365', 'mosquito_net', 0.08296403), ('n04209239', 'shower_curtain', 0.032409992), ('n03291819', 'envelope', 0.031521685)], [('n03788365', 'mosquito_net', 0.17276318), ('n04209239', 'shower_curtain', 0.03240079), ('n03291819', 'envelope', 0.02808046)], [('n03788365', 'mosquito_net', 0.064780906), ('n04209239', 'shower_curtain', 0.03221407

Looking at the results above, we can see that the top 3 predictions for our dataset by the VGG19 based model are 
1. mosquito nets
2. envelopes
3. shower curtains

This is incorrect, as we know from knowledge of our dataset, and hence we can conclude that using features extracted from a pre-trained model is not very effective here since the problem space of the model is too general.