In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk '/kaggle/input' recursively and print the full path of every file found
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# **1. Dataset**

# 1.1 Profiling

In [None]:
# List the entries of the training folder (IPython `ls` automagic)
ls ../input/ifsp-d3apl-2023-face-recognition/train/train/

In [None]:
import os

# Root folder of the training images; each entry inside it is treated as one class
dataset_folder = '../input/ifsp-d3apl-2023-face-recognition/train/train/'

# Alphabetically ordered names of the class folders
class_folders = os.listdir(dataset_folder)
class_folders.sort()

print(class_folders)
n_class_folders = len(class_folders)
print(f'Number of class: {n_class_folders}')

In [None]:
# Class distribution: report how many entries each class folder contains
for class_folder in class_folders:
    full_class_folder = os.path.join(dataset_folder, class_folder)
    class_img_filenames = os.listdir(full_class_folder)

    n_class_imgs = len(class_img_filenames)
    print(f'Number of Images for Class "{class_folder}": {n_class_imgs}')

In [None]:
import os
import glob

# Map each class folder name to the number of entries matched inside it.
# The '*' pattern does not match hidden (dot-prefixed) entries.
file_counts = {}

for class_folder in class_folders:
    full_class_folder = os.path.join(dataset_folder, class_folder)
    matched_entries = glob.glob(os.path.join(full_class_folder, '*'))
    file_counts[class_folder] = len(matched_entries)

# (class, count) pairs, most populated classes first
sorted_counts = sorted(file_counts.items(), key=lambda item: item[1], reverse=True)

print(sorted_counts)


In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.utils import load_img
from tensorflow.keras.utils import img_to_array
import os

# Set the paths for the original and oversampled image folders
original_folder = dataset_folder
oversampled_folder = '../working/oversampled'

# Number of augmented images generated for each original image
n_augmented_per_image = 5

# Create the oversampled folder if it doesn't exist
os.makedirs(oversampled_folder, exist_ok=True)

# Random augmentations applied to every generated image
datagen = ImageDataGenerator(
    rescale=1./255,  # Normalize pixel values to [0, 1]
    rotation_range=20,  # Randomly rotate images within the range of 20 degrees
    width_shift_range=0.1,  # Randomly shift the width of images by 10%
    height_shift_range=0.1,  # Randomly shift the height of images by 10%
    shear_range=0.2,  # Apply random shear transformations
    zoom_range=0.2,  # Apply random zoom transformations
    horizontal_flip=True,  # Randomly flip images horizontally
    fill_mode='nearest'  # Fill any newly created pixels after rotation or shifting
)

# Generate augmented copies of every image, keeping the one-folder-per-class layout
for class_folder in class_folders:
    full_class_folder = os.path.join(original_folder, class_folder)

    # Augmented images of a class go to their own subfolder so the label is not lost
    class_output_folder = os.path.join(oversampled_folder, class_folder)
    os.makedirs(class_output_folder, exist_ok=True)

    # List the images of THIS class (do not rely on a list left over from another cell)
    class_img_filenames = os.listdir(full_class_folder)

    for class_img_filename in class_img_filenames:
        img_path = os.path.join(full_class_folder, class_img_filename)

        # Load the image and add a leading batch dimension: (1, height, width, channels)
        img = img_to_array(load_img(img_path))
        img = img.reshape((1,) + img.shape)

        # Prefix the generated files with the original filename (without extension)
        save_prefix = os.path.splitext(class_img_filename)[0]

        augmented_batches = datagen.flow(
            img,
            batch_size=1,
            save_to_dir=class_output_folder,
            save_prefix=save_prefix,
            save_format='jpg',
        )

        # The generator is endless and saves one image per batch it yields;
        # zip() with a range stops it after exactly `n_augmented_per_image` batches
        for _ in zip(range(n_augmented_per_image), augmented_batches):
            pass
        

# 1.2 Preprocessing the dataset

In [None]:
# Maximum number of images kept per class when undersampling the dataset
max_n_samples_per_class = 80

In [None]:
import random

dataset_folder = '../input/ifsp-d3apl-2023-face-recognition/train/train/'
class_folders = sorted(os.listdir(dataset_folder))

# OPTIONAL: fixed seed so the same images are selected on every run
random.seed(42)

# Parallel lists: full pathname of each selected image and its class label
img_full_paths = []
img_classes = []

for class_folder in class_folders:
    # the folder name is used as the class label
    img_class = class_folder
    print(f'Class: {img_class}')

    # get the full class folder pathname
    full_class_folder = os.path.join(dataset_folder, class_folder)
    print(full_class_folder)

    # get all image filenames (without their parent dir) for the current class;
    # sorted so the seeded sampling below does not depend on the listing order
    class_img_filenames = sorted(os.listdir(full_class_folder))
    print(len(class_img_filenames))

    # Undersampling: keep at most `max_n_samples_per_class` randomly chosen images.
    # random.sample raises ValueError when asked for more items than are available,
    # so the sample size is capped for classes with fewer images.
    n_samples = min(max_n_samples_per_class, len(class_img_filenames))
    class_img_filenames = random.sample(class_img_filenames, n_samples)
    print(f'Number of images: {len(class_img_filenames)}')

    for img_filename in class_img_filenames:
        full_img_path = os.path.join(full_class_folder, img_filename)

        img_full_paths.append(full_img_path)
        img_classes.append(img_class)

    print()

In [None]:
# Total number of selected image pathnames
print(len(img_full_paths))

In [None]:
# Total number of labels (one label is appended per selected image pathname)
print(len(img_classes))

In [None]:
# creating a dataframe to store the image full pathnames and their corresponding classes
import pandas as pd

# One row per selected image: its full pathname and its class label
dataset_columns = {
    'image_pathname': img_full_paths,
    'class': img_classes,
}
dataset_df = pd.DataFrame(dataset_columns)

dataset_df

In [None]:
# Number of selected samples per class
dataset_df['class'].value_counts()

# 1.3 Saving the preprocessed dataset

In [None]:
# Save the dataset index (image pathnames + classes) as CSV, without the row index column
dataset_df.to_csv('../working/faces_dataset_balanced.csv', index=False)

# 1.4 Inspect an image

In [None]:
import cv2

In [None]:
# Pathname stored in the row labelled 0 of the dataset
dataset_df.loc[0, 'image_pathname']

In [None]:
# read an image with OpenCV (channels come in BGR order)
img = cv2.imread(dataset_df.loc[0, 'image_pathname'])
print(type(img))
# for a color image: (height, width, n_channels)
img.shape

In [None]:
# channel BLUE (index 0, since OpenCV stores the channels in BGR order)
img[:, :, 0]

In [None]:
# channel GREEN (index 1 in the BGR order used by OpenCV)
img[:, :, 1]

In [None]:
# channel RED (index 2 in the BGR order used by OpenCV)
img[:, :, 2]

In [None]:
# Smallest and largest pixel values found in the image
img.min(), img.max()

In [None]:
import matplotlib.pyplot as plt

# Display the array as loaded: matplotlib reads the channels as RGB,
# so a BGR image from OpenCV appears with red and blue swapped
plt.imshow(img)

In [None]:
# OpenCV loads images in BGR order, while matplotlib expects RGB
img_RGB = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Show the converted image (displaying `img` here would still show swapped colors)
plt.imshow(img_RGB)

In [None]:
# Load the same color image directly as a single-channel grayscale image
gray_img_path = dataset_df.loc[0, 'image_pathname']
gray_img = cv2.imread(gray_img_path, cv2.IMREAD_GRAYSCALE)

print(gray_img.shape)

plt.imshow(gray_img, cmap='gray')

In [None]:
# Load the image stored in the row labelled 6000 and convert it from BGR to RGB for display
sample_img_path = dataset_df.loc[6000, 'image_pathname']
img = cv2.cvtColor(cv2.imread(sample_img_path), cv2.COLOR_BGR2RGB)
plt.imshow(img)

In [None]:
# Shape of the image currently held in `img`
img.shape

# 1.5 Create the training dataset

In [None]:
import tensorflow as tf
# TensorFlow version available in this environment
tf.__version__

In [None]:
import tensorflow as tf
# Number of GPU devices visible to TensorFlow
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# Display the dataset index (image pathnames and classes)
dataset_df

In [None]:
# Distinct class labels, in order of first appearance
dataset_df["class"].unique()

In [None]:
# Alphabetically ordered list of the distinct class labels
unique_classes = dataset_df["class"].unique()
class_names = sorted(unique_classes)

# Size of the output layer of the classifier
n_classes = len(class_names)

print(f'Number of classes: {n_classes}')
print(f'Classes: {class_names}')

In [None]:
# number of samples per class in the whole dataset (before any split)
dataset_df['class'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Stratify on the class labels so both partitions keep the same class proportions
labels = dataset_df['class']

# Hold out 20% of the samples as the test set; the seed makes the split reproducible
dataset_df_full_train, dataset_df_test = train_test_split(
    dataset_df,
    test_size=0.2,
    stratify=labels,
    random_state=42,
)

In [None]:
# (n_rows, n_columns) of the full training partition (train + validation)
dataset_df_full_train.shape

In [None]:
# First rows of the full training partition
dataset_df_full_train.head()

In [None]:
# (n_rows, n_columns) of the test partition
dataset_df_test.shape

In [None]:
# Stratify again, now on the labels of the full training partition
labels_full_train = dataset_df_full_train['class']

# Keep 80% of the full training partition for training; the remainder is the validation set
dataset_df_train, dataset_df_val = train_test_split(
    dataset_df_full_train,
    train_size=0.8,
    stratify=labels_full_train,
    random_state=42,
)

# checking class balancing in the training set
dataset_df_train['class'].value_counts()

In [None]:
# checking class balancing in the validation set (number of samples per class)
dataset_df_val['class'].value_counts()

In [None]:
# checking class balancing in the test set
dataset_df_test['class'].value_counts()

# **2. Training the model**

In [None]:
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Conv2D, MaxPool2D, Flatten, Dense


def build_cnn(input_shape=(64, 64, 3), n_classes=83):
    """Assemble a small (uncompiled) CNN classifier.

    Architecture: two Conv2D(32 filters, 4x4, ReLU) + MaxPool2D(2x2) stages,
    followed by Flatten, Dense(256, ReLU) and a softmax output layer.

    Args:
        input_shape: shape of one input image, passed to the first Conv2D layer.
        n_classes: number of units of the softmax output layer.

    Returns:
        The Sequential model; compiling it is left to the caller.
    """
    # feature extraction
    feature_extractor = [
        Conv2D(filters=32, kernel_size=(4,4), activation='relu', input_shape=input_shape),
        MaxPool2D(pool_size=(2,2)),
        Conv2D(filters=32, kernel_size=(4,4), activation='relu'),
        MaxPool2D(pool_size=(2,2)),
        Flatten(),
    ]

    # Fully-Connected Neural Network ==> MLP
    classifier_head = [
        Dense(256, activation='relu'),
        Dense(n_classes, activation='softmax'),
    ]

    return Sequential(feature_extractor + classifier_head)

In [None]:

# Shape of one input image fed to the network
input_shape = (64, 64, 3)

model = build_cnn(input_shape=input_shape, n_classes=n_classes)

# SGD optimizer; the sparse loss is used because the labels are integer-encoded
opt = tf.keras.optimizers.SGD(learning_rate=0.01)
model.compile(
    optimizer=opt,
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Print the layers of the model with their output shapes and parameter counts
model.summary()

In [None]:
from tensorflow.keras.utils import plot_model
# draw the architecture (vertical layout), annotated with tensor shapes and layer activations
plot_model(model, show_shapes=True, show_layer_activations=True)

# 2.1 Preprocessing the images

In [None]:
# Pathname stored in the row labelled 0 of the dataset
dataset_df.loc[0, 'image_pathname']

In [None]:
import cv2
import matplotlib.pyplot as plt

# Example image used to illustrate the preprocessing steps
example_img_path = '../input/ifsp-d3apl-2023-face-recognition/train/train/Adam Sandler/73.jpg'

# OpenCV reads in BGR order; convert to RGB so matplotlib shows the right colors
img = cv2.cvtColor(cv2.imread(example_img_path), cv2.COLOR_BGR2RGB)

plt.imshow(img)

In [None]:
# aspect ratio = width / height
# an image array is indexed as (height, width, channels), so width is shape[1] and height is shape[0]
aspect_ratio = img.shape[1] / img.shape[0]
aspect_ratio

In [None]:
# Target size passed to cv2.resize as `dsize`, which is given as (width, height)
new_img_dims = (64, 64)

# resizing with bilinear interpolation (the original aspect ratio is not preserved)
res_img = cv2.resize(img, new_img_dims, interpolation=cv2.INTER_LINEAR)

plt.imshow(res_img)

In [None]:
# aspect ratio = width / height
# an image array is indexed as (height, width, channels), so width is shape[1] and height is shape[0]
# NOTE(review): this repeats the computation on the original `img`; if the ratio of the
# resized image was intended, use `res_img` instead — confirm
aspect_ratio = img.shape[1] / img.shape[0]
aspect_ratio


In [None]:
import numpy as np

# preprocess the image dataset and return the feature matrix and the label array: X, y
def preprocess_faces_dataset(dataset_df, label_encoder, new_img_dims=(64,64), verbose=1000):
    """Preprocess an image dataset and return its feature matrix and label array.

    Every image is read with OpenCV, converted from BGR to RGB, resized to
    `new_img_dims` with bilinear interpolation and scaled by 1/255.

    Args:
        dataset_df: DataFrame with the columns 'image_pathname' and 'class'.
        label_encoder: fitted encoder whose `transform` maps the class labels
            to their numeric codes.
        new_img_dims: target size passed to `cv2.resize` as `dsize`, i.e.
            (width, height).
        verbose: print a progress line every `verbose` images; a falsy value
            (0 or None) disables the progress output.

    Returns:
        (X, y): `X` is the array of preprocessed images, shaped
        (n_imgs, height, width, n_channels) and divided by 255.0; `y` is the
        result of `label_encoder.transform` on the 'class' column.

    Raises:
        FileNotFoundError: if an image cannot be read from its pathname.
    """
    n_imgs = dataset_df.shape[0]
    image_list = []  # list of preprocessed images (numpy arrays)

    for index, img_path in enumerate(dataset_df['image_pathname']):
        img = cv2.imread(img_path)  # BGR

        # cv2.imread does not raise on a missing/corrupt file: it returns None,
        # which would otherwise fail later with an unrelated OpenCV error
        if img is None:
            raise FileNotFoundError(f'Could not read image: {img_path}')

        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # RGB

        # image resizing
        # for gray or color images, the linear interpolation sounds good
        img = cv2.resize(img, new_img_dims, interpolation=cv2.INTER_LINEAR)
        image_list.append(img)

        # progress report every `verbose` iterations (skipped when verbose is falsy,
        # which also avoids a modulo-by-zero)
        if verbose and index % verbose == 0:
            print(f'{index + 1}/{n_imgs} - {img_path}')

    # feature matrix
    # shape = (n_imgs, height, width, n_channels)
    X = np.array(image_list)

    # feature scaling
    X = X / 255.0

    # encoding the classes
    y = label_encoder.transform(dataset_df['class'])

    return X, y

In [None]:
# training a Label Encoder from the train set
from sklearn.preprocessing import LabelEncoder

# Fit the encoder on the training labels only (fit returns the fitted encoder itself)
label_encoder = LabelEncoder().fit(dataset_df_train['class'])

# Class labels learned by the encoder
label_encoder.classes_

In [None]:
# transform/map a few string class labels to their numeric codes using the fitted encoder
label_encoder.transform(['Alec Baldwin', 'Claudia Schiffer', 'Zac Efron'])

In [None]:
# Build the feature matrix and the encoded labels of the training set
X_train, y_train = preprocess_faces_dataset(
    dataset_df_train,
    label_encoder,
    new_img_dims=(64, 64),
)

In [None]:
# Sanity checks on the preprocessed training set: shapes and the encoded classes present
print(f'X_train.shape: {X_train.shape}')
print(f'y_train (classes): {np.unique(y_train)}')
print(f'y_train.shape: {y_train.shape}')

# value range of the features after the division by 255 done during preprocessing
print(f'Min. value of X_train: {X_train.min()}')
print(f'Max. value of X_train: {X_train.max()}\n')

In [None]:
import matplotlib.pyplot as plt
# Show the first preprocessed training image
plt.imshow(X_train[0])

In [None]:

# Build the feature matrix and the encoded labels of the validation set
X_val, y_val = preprocess_faces_dataset(
    dataset_df_val,
    label_encoder,
    new_img_dims=(64, 64),
)

In [None]:
# Sanity checks on the preprocessed validation set: shapes and the encoded classes present
print(f'X_val.shape: {X_val.shape}')
print(f'y_val (classes): {np.unique(y_val)}')
print(f'y_val.shape: {y_val.shape}')

# value range of the features after the division by 255 done during preprocessing
print(f'Min. value of X_val: {X_val.min()}')
print(f'Max. value of X_val: {X_val.max()}\n')

In [None]:
import matplotlib.pyplot as plt
# Show the first preprocessed validation image
plt.imshow(X_val[0])

In [None]:
# Build the feature matrix and the encoded labels of the test set
X_test, y_test = preprocess_faces_dataset(
    dataset_df_test,
    label_encoder,
    new_img_dims=(64, 64),
)

In [None]:
import matplotlib.pyplot as plt
# Show the first preprocessed test image
plt.imshow(X_test[0])

# 2.3. Saving the preprocessed data

In [None]:
import os

# Folder that receives the CSV indexes and the preprocessed arrays
out_dir = '../working/preprocessed'

if not os.path.exists(out_dir):
    os.makedirs(out_dir)

# Index of the full training partition (train + validation)
dataset_df_full_train.to_csv(os.path.join(out_dir, 'full_train.csv'), index=False)

# For each split: its CSV index, its feature matrix and its encoded labels
splits_to_save = (
    ('train', dataset_df_train, X_train, y_train),
    ('validation', dataset_df_val, X_val, y_val),
    ('test', dataset_df_test, X_test, y_test),
)

for split_name, split_df, split_X, split_y in splits_to_save:
    split_df.to_csv(os.path.join(out_dir, f'{split_name}.csv'), index=False)
    np.save(os.path.join(out_dir, f'{split_name}_data_64x64x3.npy'), split_X)
    np.save(os.path.join(out_dir, f'{split_name}_labels.npy'), split_y)

# 2.5 Training the model

In [None]:
# Stop training once the monitored metric (Keras default: val_loss) has not improved
# for 10 consecutive epochs, then restore the weights of the best epoch
early_stopping_cb = tf.keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)

In [None]:
# Train for at most 100 epochs, evaluating on the validation set after each epoch;
# the early-stopping callback may end the training sooner
history = model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), callbacks=[early_stopping_cb])

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Per-epoch training history (one row per epoch, one column per recorded metric)
history_df = pd.DataFrame(history.history)

In [None]:
# Learning curves: training vs. validation loss per epoch
loss_ax = history_df[['loss', 'val_loss']].plot(figsize=(8, 5))
loss_ax.grid(True)
loss_ax.set_xlabel('Epochs')
loss_ax.set_ylabel('Score')

# Learning curves: training vs. validation accuracy per epoch
accuracy_ax = history_df[['accuracy', 'val_accuracy']].plot(figsize=(8, 5))
accuracy_ax.grid(True)
accuracy_ax.set_xlabel('Epochs')
accuracy_ax.set_ylabel('Score')

In [None]:

# Loss and accuracy of the trained model on the held-out test set
model.evaluate(X_test, y_test)

In [None]:
# Softmax outputs for the test set: one row per image, one column per class
y_test_proba = model.predict(X_test)
y_test_proba

In [None]:
# Predicted class = index of the highest probability in each row
y_test_pred = np.argmax(y_test_proba, axis=1)
y_test_pred

In [None]:
from sklearn.metrics import classification_report

# Class labels in the order of their numeric codes
class_names = label_encoder.classes_

# Per-class precision, recall and F1 on the test set
test_report = classification_report(y_test, y_test_pred, target_names=list(class_names))
print(test_report)

In [None]:
# Map the numeric codes (true and predicted) back to their class names
y_test_class_name = label_encoder.inverse_transform(y_test)
y_test_pred_class_name = label_encoder.inverse_transform(y_test_pred)

In [None]:
# Boolean mask: True where the predicted class differs from the true class
misclassification_mask = y_test_class_name != y_test_pred_class_name

In [None]:
# Misclassified test samples whose true class is "Alec Baldwin"
# NOTE(review): the name `sheep_error_mask` looks like a leftover from another notebook — consider renaming
sheep_error_mask = misclassification_mask & (y_test_class_name == "Alec Baldwin")

# Indices of the first three such errors
np.argwhere(sheep_error_mask)[:3]

In [None]:
# Show one test image with its true and predicted class names in the title
# NOTE(review): the index is hard-coded; presumably taken from the error indices listed by the previous cell — confirm
img_idx = 18

plt.imshow(X_test[img_idx])
plt.title(f'True: {y_test_class_name[img_idx]}, Predicted: {y_test_pred_class_name[img_idx]}')