# Kaggle dataset preprocessing

#### Imports

In [None]:
#pip install opencv-python

In [6]:
import cv2
import os
import numpy as np
import pandas as pd


In [5]:
import tensorflow as tf
from tensorflow.keras.utils import image_dataset_from_directory
from tensorflow.keras.utils import to_categorical

import warnings
warnings.filterwarnings("ignore")

## Loading dataset

🚨 This will only be needed for the training because for the predicting the image will be loaded/stored differently

### Loading as lists and then converting to arrays (Susi's approach)

❓ @Susi :
- I think we need to "generalized" the whole data path => I defined it as a constant (will be in params)
- I think that the CNN input are tensors... if we are using this kind of model, we should built a tensor (number of images, height, width, channels)
- Using skimage library function to load images allows us to directly resize them => interesting to resize from the loading?

In [4]:
def load_images_with_labels(root_folder):
    images_with_labels = []
    for object_folder in os.listdir(root_folder):
        object_path = os.path.join(root_folder, object_folder)
        if os.path.isdir(object_path):
            for filename in os.listdir(object_path):
                img_path = os.path.join(object_path, filename)
                if os.path.isfile(img_path):
                    img = cv2.imread(img_path)
                    if img is not None:
                        label = object_folder
                        images_with_labels.append((img, label))
    return images_with_labels

In [5]:
# Define root folder containing subfolders for each object category
    # current_directory = os.getcwd()
    # root_folder = os.path.dirname(current_directory) + '/raw_data/Garbage classification'
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".lewagon", "waste_sorter_smart_bin", "data")
root_folder = os.path.join(LOCAL_DATA_PATH,'raw','Garbage classification')

In [6]:
root_folder

'/Users/Ainhoa/.lewagon/waste_sorter_smart_bin/data/raw/Garbage classification'

In [7]:
os.listdir(root_folder)

['paper', '.DS_Store', 'metal', 'cardboard', 'trash', 'glass', 'plastic']

In [8]:
# Load images with labels
images_with_labels = load_images_with_labels(root_folder)

In [9]:
type(images_with_labels) , type(images_with_labels[0]), type(images_with_labels[0][0]), type(images_with_labels[0][1])

(list, tuple, numpy.ndarray, str)

In [10]:
len(images_with_labels) , len(images_with_labels[0]), images_with_labels[0][0].shape

(2527, 2, (384, 512, 3))

### Loading directly as tensors from directory (Ainhoa's approach)

❓ @Susi : As the Kaggle dataset images are classified in folders named by its class, maybe is more convinient to load them as Tensorflow Datasets as we saw in the CNN challenges so that it is easier to handle them when training (making batches, etc). Also this might allow as to preprocess the images as tensors (more efficient) with tensorflow.keras functions like resizing.

BUT I didn't manage to make it work so, for the moment, I followed your approach

In [None]:
# current directory from where the notebook is run
!pwd

In [None]:
# data directory where the Garbage classification images folder is stored
data_dir = '../raw_data/Garbage classification'

# check by listing all the elements in the directory
os.listdir(data_dir)

In [None]:
# Load data as Tensorflow Dataset
batch_size = None

data = image_dataset_from_directory(
  root_folder, # Folder directory
  labels = "inferred", # inferred from sub folder name
  label_mode = "categorical",
  seed=None,
  image_size=(224, 224), # we can resize directly the images
  batch_size=batch_size)

# With Tensorflow Dataset object we can also define directly the training/validating

In [None]:
# check class names
data.class_names

In [None]:
data.element_spec

In [None]:
#data.file_paths

In [None]:
type(data)

In [None]:
for X,y in data:
    print(X.shape)

In [None]:
X.shape

## Basic Preprocessing

🚨 The preprocessing is going to depend on the architecture of the model (some include already the preprocessing) and, if we use transfer learning, we might have to use the specific preprocessing used in the pretrained model.
This is the basic preprocessing for images and labels.


### Susi's code

❓ @Susi :
- I don't like a lot to do the preprocessing image by image with lists (I think it would be faster with arrays/tensors) but I haven't find a way to resize tensors yet so I leave it like this for the moment.
- I wasn't sure how to deal with 'string numpy.arrays' for the labels that you created. So I changed a bit the code and included the conversion from string-labels to numerical-categories.

In [12]:
def preprocess_image(img, target_size):
    # Resize image
    img_resized = cv2.resize(img, target_size)

    # Normalize pixel values
    img_normalized = img_resized / 255.0

    return img_normalized

In [None]:
# Define target size for resizing
target_size = (224, 224)

# Preprocess each image in the dataset
preprocessed_data = []
for img, label in images_with_labels:
    preprocessed_img = preprocess_image(img, target_size)
    preprocessed_data.append((preprocessed_img, label))

In [None]:
len(preprocessed_data)

In [None]:
# Convert preprocessed_data to NumPy arrays
X = np.array([data[0] for data in preprocessed_data])

In [None]:
y = np.array([data[1] for data in preprocessed_data])

### Ainhoa's code

In [41]:
target_size = (224, 224)

# Basic preprocess : Resize and normalize images and convert categories to numbers
preprocessed_images = []
labels_list = []
for img, label in images_with_labels:
    labels_list.append(label)
    preprocessed_img = preprocess_image(img, target_size)
    preprocessed_images.append(preprocessed_img)

# Create a Pandas Series
labels_series = pd.Series(labels_list)

# Define a dictionary mapping each fruit to its length
categories_map = {'glass': 1, 'paper': 2, 'cardboard': 3, 'plastic': 4, 'metal': 5, 'trash': 0}

# Use the map() method to apply the mapping to each element of the Series
categories_series = labels_series.map(categories_map)

In [42]:
# Convert to arrays
images_array = np.array(preprocessed_images)
labels_array = np.array(categories_series)

In [43]:
# Target encoding
to_categorical(labels_array, num_classes=6)

array([[0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       ...,
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.]], dtype=float32)

## VGG16 preprocessing

Testing VGG16 preprocessing input function

In [27]:
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.applications.vgg16 import preprocess_input

In [15]:
# Define target size for resizing
target_size = (224, 224)

# Define root folder containing subfolders for each object category
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".lewagon", "waste_sorter_smart_bin", "data")
root_folder = os.path.join(LOCAL_DATA_PATH,'raw','Garbage classification')

In [11]:
CATEGORIES_MAP = {
    'glass': 1,
    'paper': 2,
    'cardboard': 3,
    'plastic': 4,
    'metal': 5,
    'trash': 0
    }

In [17]:
def load_images_with_labels(root_folder, target_size=(244,244)):
    ''' 
    Load images from local data folder
    Convert them to arrays
    Create a list with all the images and the equivalent with all the labels
    '''
    images_list = []
    labels_list = []
    for object_folder in os.listdir(root_folder):
        object_path = os.path.join(root_folder, object_folder)
        if os.path.isdir(object_path):
            for filename in os.listdir(object_path):
                img_path = os.path.join(object_path, filename)
                if os.path.isfile(img_path):
                    img = tf.keras.utils.load_img(img_path, target_size=target_size)
                    img_array = tf.keras.utils.img_to_array(img)
                    if img_array is not None:
                        label = object_folder
                        labels_list.append(label)
                        images_list.append(img_array)
    return images_list, labels_list

In [14]:
def preprocess_labels(labels_list):
    ''' Converts the names of the labels in integer (category classes)'''
    labels_series = pd.Series(labels_list)
    categories_series = labels_series.map(CATEGORIES_MAP)
    return categories_series

In [40]:
images_with_labels = load_images_with_labels(root_folder)

In [45]:
images_with_labels[0][0].shape

(244, 244, 3)

In [24]:
labels_list = images_with_labels[1]
categories_series = preprocess_labels(labels_list)
images_list = images_with_labels[0]

In [37]:
# Convert lists to arrays
images_array = np.array(images_list)
labels_array = np.array(categories_series)

In [28]:
images_array.shape , labels_array.shape

((2527, 224, 224, 3), (2527,))

In [38]:
preprocessed_images = tf.keras.applications.vgg16.preprocess_input(images_array)

In [39]:
preprocessed_images[0]

array([[[ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        ...,
        [ 8.5060997e+01,  8.7221001e+01,  8.9320000e+01],
        [ 8.4060997e+01,  8.6221001e+01,  8.8320000e+01],
        [ 8.3060997e+01,  8.5221001e+01,  8.7320000e+01]],

       [[ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        ...,
        [ 8.4060997e+01,  8.6221001e+01,  8.8320000e+01],
        [ 8.3060997e+01,  8.5221001e+01,  8.7320000e+01],
        [ 8.3060997e+01,  8.5221001e+01,  8.7320000e+01]],

       [[ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        [ 1.1606100e+02,  1.1522100e+02,  1.1232000e+02],
        ...,
        [ 8.3060997e+01,  8.5221001e+01,  8.7320000e+01],
        [ 8.3060997e+01,  8.5

## Final code for the .py files in "logic"

🚨 Here I sum up the final code I put in the .py files

In [None]:
# Functions
def load_images_with_labels(root_folder, target_size=(244,244)):
    ''' 
    Load images from local data folder
    Convert them to arrays
    Create a list with all the images and the equivalent with all the labels
    '''
    images_list = []
    labels_list = []
    for object_folder in os.listdir(root_folder):
        object_path = os.path.join(root_folder, object_folder)
        if os.path.isdir(object_path):
            for filename in os.listdir(object_path):
                img_path = os.path.join(object_path, filename)
                if os.path.isfile(img_path):
                    img = tf.keras.utils.load_img(img_path, target_size=target_size)
                    img_array = tf.keras.utils.img_to_array(img)
                    if img_array is not None:
                        label = object_folder
                        labels_list.append(label)
                        images_list.append(img_array)
    return images_list, labels_list

def preprocess_labels(labels_list):
    ''' Converts the names of the labels in integer (category classes)'''
    labels_series = pd.Series(labels_list)
    categories_series = labels_series.map(CATEGORIES_MAP)
    return categories_series

In [None]:
# Define root folder containing subfolders for each object category
LOCAL_DATA_PATH = os.path.join(os.path.expanduser('~'), ".lewagon", "waste_sorter_smart_bin", "data")
root_folder = os.path.join(LOCAL_DATA_PATH,'raw','Garbage classification')

# Load images with labels
images_with_labels = load_images_with_labels(root_folder)

In [None]:
# Make lists
labels_list = images_with_labels[1]
images_list = images_with_labels[0]

# Transform categories to numbers
categories_series = preprocess_labels(labels_list)

# Convert to arrays
images_array = np.array(preprocessed_images)
labels_array = np.array(categories_series)

# Target encoding
to_categorical(labels_array, num_classes=6)

# VGG16 preprocessing
preprocessed_images = tf.keras.applications.vgg16.preprocess_input(images_array)

## CNN model

🚨 I think we should either decide in a first base model and give it to the person that will do the modeling part, either talk with the team and decide all together which kind of model we will use (at least, the first layer type, so that we can adapt the preprocessing to it)

In [None]:
# Define the CNN model
def create_model(input_shape, num_classes):
    model = models.Sequential([
        layers.Conv2D(32, (3, 3), activation='relu', input_shape=input_shape),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dense(num_classes, activation='softmax')
    ])
    return model

# Define input shape and number of classes
input_shape = (224, 224, 3)
num_classes = len(set(y))

# Create the model
model = create_model(input_shape, num_classes)

# Compile the model
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X, y)
print(f"Test Accuracy: {accuracy}")
