# Start


In [None]:
import os
import random
import shutil

import cv2
import kagglehub
import keras_tuner as kt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import tensorflow as tf
from IPython.core.magic import register_cell_magic
from keras.layers import (
    Conv2D,
    Dense,
    Dropout,
    Flatten,
    InputLayer,
    MaxPooling2D,
    Rescaling,
)
from keras.models import Sequential
from keras.preprocessing import image
from PIL import Image
from skimage.metrics import structural_similarity as ssim
from tensorflow.keras.models import load_model

## Import data


In [None]:
def import_raw_data():
    # Load dataset
    path = kagglehub.dataset_download("arashnic/faces-age-detection-dataset")

    # Define path to files
    faces_path = os.path.join(path, "faces")
    train_path = os.path.join(faces_path, "Train")

    # Create directory for raw data
    export_path = "../data/raw/faces"
    os.makedirs(export_path, exist_ok=True)

    # Copy files to raw data directory
    shutil.copytree(train_path, os.path.join(export_path, "images"), dirs_exist_ok=True)
    shutil.copy(os.path.join(faces_path, "train.csv"), os.path.join(export_path, "train.csv"))

    export_path = "../data/raw"


if not os.path.exists("../data/raw/faces"):
    import_raw_data()

-   Faces_02 has no csv file for classification so we are gonna skip it


## Definitons


### Determinism


In [None]:
# Making model more deterministic (not perfect but better than nothing)
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
tf.random.set_global_generator(tf.random.Generator.from_seed(SEED))

# This is perfect deterministic
# tf.config.experimental.enable_op_determinism()
# os.environ["TF_DETERMINISTIC_OPS"] = "1"

### Helper


In [None]:
df_train = pd.read_csv("../data/raw/faces/train.csv")

In [None]:
@register_cell_magic
def ignore(line, cell):
    pass

### EDA


In [None]:
# Get low quality images
def get_low_quality_images(
    images, min_width=15, min_height=15, min_aspect_ratio=0.2, max_aspect_ratio=4, min_laplacian=10, min_ssim=0.5
):
    low_quality_images = []

    for _, row in images.iterrows():
        image_id = row["ID"]
        image_width = row["width"]
        image_height = row["height"]
        aspect_ratio = row["aspect_ratio"]
        laplacian = row["laplacian"]
        ssim_score = row["ssim_score"]

        if (
            image_width < min_width
            or image_height < min_height
            or aspect_ratio < min_aspect_ratio
            or aspect_ratio > max_aspect_ratio
            or laplacian < min_laplacian
            or ssim_score < min_ssim
        ):
            low_quality_images.append(image_id)

    print(f"Number of low quality images: {len(low_quality_images)}")
    print(f"Percentage of low quality images: {len(low_quality_images) / len(images) * 100:.2f}%")
    images = images[~images["ID"].isin(low_quality_images)]

    return images

In [None]:
def multiply_images(df):
    max_id = max([int(id.replace(".jpg", "")) for id in df["ID"]])
    next_id = max_id + 1

    number_of_images = df["Class"].value_counts()

    number_of_young = number_of_images["YOUNG"]
    number_of_middle = number_of_images["MIDDLE"]
    number_of_old = number_of_images["OLD"]

    maximum_images = max(number_of_young, number_of_middle, number_of_old)

    multiply_young = round(maximum_images / number_of_young, 0)
    multiply_middle = round(maximum_images / number_of_middle, 0)
    multiply_old = round(maximum_images / number_of_old, 0)

    print(f"Number of total images: {len(df)}")
    print(f"Number of young images: {number_of_young}")
    print(f"Number of middle images: {number_of_middle}")
    print(f"Number of old images: {number_of_old}")
    print("-----------------------------------")
    print(f"Number of young images to multiply: {multiply_young}")
    print(f"Number of middle images to multiply: {multiply_middle}")
    print(f"Number of old images to multiply: {multiply_old}")

    # Create copies
    young_df = df[df["Class"] == "YOUNG"].copy()
    middle_df = df[df["Class"] == "MIDDLE"].copy()
    old_df = df[df["Class"] == "OLD"].copy()

    # Create additional copies
    young_additional = pd.concat([young_df] * int(multiply_young - 1)) if multiply_young > 1 else pd.DataFrame()
    middle_additional = (
        pd.concat([middle_df] * int(multiply_middle - 1)) if multiply_middle > 1 else pd.DataFrame()
    )
    old_additional = pd.concat([old_df] * int(multiply_old - 1)) if multiply_old > 1 else pd.DataFrame()

    # Combine all additional copies
    additional_df = pd.concat([young_additional, middle_additional, old_additional])

    # Create new_name column for additional copies
    additional_df["new_name"] = [f"{next_id + i}.jpg" for i in range(len(additional_df))]

    # Add new_name column to original DataFrame (same as ID for original entries)
    df = df.assign(new_name=df["ID"])

    # Combine original and additional DataFrames
    balanced_df = pd.concat([df, additional_df])
    balanced_df.reset_index(drop=True, inplace=True)

    print("-----------------------------------")
    print(f"Number of total images after multiplication: {len(balanced_df)}")
    print(f"Number of young images after multiplication: {len(balanced_df[balanced_df['Class'] == 'YOUNG'])}")
    print(f"Number of middle images after multiplication: {len(balanced_df[balanced_df['Class'] == 'MIDDLE'])}")
    print(f"Number of old images after multiplication: {len(balanced_df[balanced_df['Class'] == 'OLD'])}")

    return balanced_df

In [None]:
# Resize images and save them into categorical directories
def process_images(images, target_size, color_mode, directory, rgb_skip):
    if color_mode == "rgb" and rgb_skip:
        output_dir = f"../data/processed/{directory}/{color_mode}_skip/{target_size[0]}x{target_size[1]}"
    else:
        output_dir = f"../data/processed/{directory}/{color_mode}/{target_size[0]}x{target_size[1]}"

    young_dir = os.path.join(output_dir, "young")
    middle_dir = os.path.join(output_dir, "middle")
    old_dir = os.path.join(output_dir, "old")

    os.makedirs(output_dir, exist_ok=True)
    os.makedirs(young_dir, exist_ok=True)
    os.makedirs(middle_dir, exist_ok=True)
    os.makedirs(old_dir, exist_ok=True)

    for _, row in images.iterrows():
        img_name = row["ID"]
        img_new_name = row["new_name"]
        img_path = os.path.join("../data/raw/faces/images", img_name)

        img = cv2.imread(img_path)

        # Check if the image is grayscale
        is_grayscale = True
        b, g, r = cv2.split(img)
        if not (np.array_equal(r, g) and np.array_equal(g, b)):
            is_grayscale = False

        # Skip grayscale images if color_mode is "rgb"
        if rgb_skip and is_grayscale and color_mode == "rgb":
            continue

        img = cv2.resize(img, target_size)
        img = Image.fromarray(img)

        if row["Class"].upper() == "YOUNG":
            img.save(os.path.join(young_dir, img_new_name))
        elif row["Class"].upper() == "MIDDLE":
            img.save(os.path.join(middle_dir, img_new_name))
        elif row["Class"].upper() == "OLD":
            img.save(os.path.join(old_dir, img_new_name))


def process_all_dimensions_colors(df, directory):
    target_sizes = [(64, 64), (96, 96), (128, 128)]
    color_modes = ["grayscale", "rgb"]
    rgb_skip = [False, True]
    for target_size in target_sizes:
        for color_mode in color_modes:
            for skip in rgb_skip:
                print(f"Processing images with target size {target_size} and color mode {color_mode}")
                process_images(df, target_size, color_mode, directory, skip)

### Model


In [None]:
BATCH_SIZE = 64
EPOCHS = 100


def get_dataset(size, color_mode, clearing, rgb_skip):
    if rgb_skip and color_mode == "rgb":
        ds = tf.keras.preprocessing.image_dataset_from_directory(
            f"../data/processed/{clearing}/{color_mode}_skip/{size}x{size}",
            shuffle=True,
            image_size=(size, size),
            batch_size=BATCH_SIZE,
            color_mode=color_mode,
        )
    else:
        ds = tf.keras.preprocessing.image_dataset_from_directory(
            f"../data/processed/{clearing}/{color_mode}/{size}x{size}",
            shuffle=True,
            image_size=(size, size),
            batch_size=BATCH_SIZE,
            color_mode=color_mode,
        )

    # Split dataset
    data_size = len(ds)
    train_split = 0.7
    val_split = 0.2
    test_split = 0.1

    train_size = int(train_split * data_size)
    val_size = int(val_split * data_size)

    train_ds = ds.take(train_size)
    val_ds = ds.skip(train_size).take(val_size)
    test_ds = ds.skip(train_size).skip(val_size)

    # Cache, shuffle and prefetch
    buffer_size = len(train_ds) * BATCH_SIZE
    buffer_size

    number_of_images = train_ds.cardinality().numpy() * BATCH_SIZE
    AUTOTUNE = tf.data.AUTOTUNE
    train_ds = train_ds.cache().shuffle(number_of_images, seed=SEED).prefetch(buffer_size=AUTOTUNE)
    val_ds = val_ds.cache().prefetch(buffer_size=AUTOTUNE)

    return train_ds, val_ds, test_ds


def get_input_shape(train_ds):
    for batch, _ in train_ds.take(1):
        return batch.shape

In [None]:
data_augmentation = tf.keras.Sequential(
    [
        tf.keras.layers.RandomFlip("horizontal"),
        tf.keras.layers.RandomRotation(0.1),
        tf.keras.layers.RandomZoom(0.1),
        tf.keras.layers.RandomContrast(0.1),
        tf.keras.layers.RandomBrightness(0.1),
    ]
)

early_stopping = tf.keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, restore_best_weights=True)

### Model Evaluation


In [None]:
def get_best_record(file):
    history = pd.read_csv(f"../model_history/{file}")
    best_val_accuracy_row = history[history["val_accuracy"] == max(history["val_accuracy"])].tail(1)
    return best_val_accuracy_row


def create_best_records_table(cvs_files):
    # Create an empty DataFrame with file names as the index
    best_records_df = pd.DataFrame()

    # Iterate over each file and append the best record to the DataFrame
    best_records_list = []
    for file in cvs_files:
        best_record = get_best_record(file)
        if best_record["val_accuracy"].item() < 0.8:
            continue
        best_record["file"] = file  # Add a column for the file name
        best_records_list.append(best_record)

    # Concatenate all best records into a single DataFrame
    best_records_df = pd.concat(best_records_list, ignore_index=True)

    # Set the index of the DataFrame to the file names
    best_records_df.set_index("file", inplace=True)

    # Display the DataFrame
    best_records_df = best_records_df.sort_values(by="val_accuracy", ascending=False)
    return best_records_df

In [None]:
def show_accuracy(history):
    plt.plot(history.history["accuracy"], label="Training Accuracy")
    plt.plot(history.history["val_accuracy"], label="Validation Accuracy")
    plt.title("Model accuracy")
    plt.ylabel("Accuracy")
    plt.xlabel("Epoch")
    plt.legend(["Train", "Validation"], loc="upper left")
    plt.show()


def show_loss(history):
    plt.plot(history.history["loss"], label="Training Loss")
    plt.plot(history.history["val_loss"], label="Validation Loss")
    plt.title("Model loss")
    plt.ylabel("Loss")
    plt.xlabel("Epoch")
    plt.legend(["Train", "Validation"], loc="upper left")
    plt.show()


def show_evaluation(model, val_ds, history):
    loss, acc = model.evaluate(val_ds, batch_size=BATCH_SIZE)
    print(f"Loss: {round(loss, 3)}, Acc: {round(acc*100, 3)}%")
    show_accuracy(history)
    show_loss(history)


def show_evaluation_df(history, name):
    best_val_accuracy_row = history[history["val_accuracy"] == max(history["val_accuracy"])]
    best_val_accuracy_epoch = best_val_accuracy_row.index[0]
    total_epochs = len(history)

    fig, axes = plt.subplots(1, 2, figsize=(15, 5))
    fig.suptitle(name)

    # Plot accuracy
    axes[0].plot(history["accuracy"], label="Training Accuracy")
    axes[0].plot(history["val_accuracy"], label="Validation Accuracy")
    axes[0].set_title("Model accuracy")
    axes[0].set_ylabel("Accuracy")
    axes[0].set_xlabel("Epoch")
    axes[0].legend(["Train", "Validation"], loc="upper left")

    # Plot loss
    axes[1].plot(history["loss"], label="Training Loss")
    axes[1].plot(history["val_loss"], label="Validation Loss")
    axes[1].set_title("Model loss")
    axes[1].set_ylabel("Loss")
    axes[1].set_xlabel("Epoch")
    axes[1].legend(["Train", "Validation"], loc="upper left")

    plt.show()
    print(f"Best validation accuracy: {round(best_val_accuracy_row['val_accuracy'].values[0]*100, 3)}%")
    print(f"Best validation accuracy epoch: {best_val_accuracy_epoch}")
    print(f"Total epochs: {total_epochs}")

### Other


In [None]:
def calculate_balance_index(df):
    class_counts = df["Number of images"]
    total_images = class_counts.sum()
    ideal_count = total_images / len(class_counts)
    balance_index = 100 - (class_counts - ideal_count).abs().sum() / (2 * total_images) * 100
    return balance_index

# 4.1


## A


-   EDA a data preprocessing pre Vami vybrané charakteristiky z datasetu


### EDA


#### EDA - simple


In [None]:
df_train.head()

In [None]:
df_train.info()

In [None]:
df_train.describe()

In [None]:
df_train.duplicated().sum()

In [None]:
df_train.isnull().sum()

In [None]:
image_dir = "../data/raw/faces/images"
num_files = len([name for name in os.listdir(image_dir) if os.path.isfile(os.path.join(image_dir, name))])
print(f"Number of images: {num_files}")

In [None]:
df = pd.DataFrame(
    {
        "Class": df_train["Class"].value_counts().index,
        "Number of images": df_train["Class"].value_counts().values,
    }
)
ax = sns.barplot(x="Class", y="Number of images", data=df)
for i, v in enumerate(df["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")

In [None]:
# Calculate the percentage of each class
class_counts = df["Number of images"]
class_labels = df["Class"]

# Plot the pie chart
plt.figure(figsize=(5, 5))
plt.pie(class_counts, labels=class_labels, autopct="%1.1f%%")
plt.title("Percentage of Each Class")
plt.show()

#### Findings


-   There are no nulls
-   There are no duplicates
-   Number of images is same as number of data in cvs file
-   There is quite big disbalance in number of images for each class


#### EDA - image size and quality


In [None]:
image_dir = "../data/raw/faces/images"

if os.path.exists("../data/raw/faces/image_info.csv"):
    df_image_info = pd.read_csv("../data/raw/faces/image_info.csv")
else:
    df_image_info = df_train.copy()

    for image_id in df_image_info["ID"]:
        img = cv2.imread(os.path.join(image_dir, image_id))

        is_grayscale = True
        b, g, r = cv2.split(img)
        if not (np.array_equal(r, g) and np.array_equal(g, b)):
            is_grayscale = False

        laplacian = cv2.Laplacian(img, cv2.CV_64F).var()

        compressed = cv2.resize(img, (img.shape[1] // 2, img.shape[0] // 2))
        compressed = cv2.resize(compressed, (img.shape[1], img.shape[0]))
        ssim_score, _ = ssim(img, compressed, channel_axis=2, full=True)

        original_size = img.shape[:2]
        compressed = cv2.resize(img, (64, 64))
        compressed = cv2.resize(compressed, (original_size[1], original_size[0]))
        ssim_score_64, _ = ssim(img, compressed, channel_axis=2, full=True)

        compressed = cv2.resize(img, (128, 128))
        compressed = cv2.resize(compressed, (original_size[1], original_size[0]))
        ssim_score_128, _ = ssim(img, compressed, channel_axis=2, full=True)

        df_image_info.loc[df_image_info["ID"] == image_id, "width"] = img.shape[0]
        df_image_info.loc[df_image_info["ID"] == image_id, "height"] = img.shape[1]
        df_image_info.loc[df_image_info["ID"] == image_id, "aspect_ratio"] = img.shape[0] / img.shape[1]
        df_image_info.loc[df_image_info["ID"] == image_id, "image_size"] = img.shape[0] * img.shape[1]
        df_image_info.loc[df_image_info["ID"] == image_id, "is_grayscale"] = is_grayscale
        df_image_info.loc[df_image_info["ID"] == image_id, "laplacian"] = laplacian
        df_image_info.loc[df_image_info["ID"] == image_id, "ssim_score"] = ssim_score
        df_image_info.loc[df_image_info["ID"] == image_id, "ssim_score_64"] = ssim_score_64
        df_image_info.loc[df_image_info["ID"] == image_id, "ssim_score_128"] = ssim_score_128

    os.makedirs("../data/raw/faces", exist_ok=True)
    df_image_info.to_csv("../data/raw/faces/image_info.csv", index=False)

df_image_info

-   Here we tried to scale images to 64 and 128 and look at image similarity
-   It looks like 128 is in 99% cases better than 64 (even for very low size images)
-   Our initial thought was that 64 would have better score for lower resolution images but it looks like it is not the case
-   Therefore we are going to look only at ssim_score, which is calculated by reducing size by 50%, so it tells us how much information is lost


In [None]:
df_image_info.describe()

In [None]:
number_of_grayscale_images = df_image_info["is_grayscale"].sum()
number_of_color_images = len(df_image_info) - number_of_grayscale_images

df = pd.DataFrame(
    {
        "Grayscale": ["Yes", "No"],
        "Number of images": [number_of_grayscale_images, number_of_color_images],
    }
)

plt.figure(figsize=(5, 5))
ax = sns.barplot(x="Grayscale", y="Number of images", data=df)
for i, v in enumerate(df["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")

plt.title("Number of Grayscale and Color Images")
plt.show()

plt.figure(figsize=(5, 5))
plt.pie(df["Number of images"], labels=df["Grayscale"], autopct="%1.1f%%")
plt.title("Percentage of Grayscale and Color Images")
plt.show()

In [None]:
_, axes = plt.subplots(1, 2, figsize=(15, 5))
sns.histplot(df_image_info["width"], ax=axes[0])
axes[0].set_title("Width")
sns.histplot(df_image_info["height"], ax=axes[1])
axes[1].set_title("Height")
plt.show()

Analysis of image size (64,128) are our main focus


In [None]:
number_of_images_64 = len(df_image_info[(df_image_info["width"] <= 64) & (df_image_info["height"] <= 64)])
number_of_images_128 = len(df_image_info[(df_image_info["width"] >= 128) & (df_image_info["height"] >= 128)])
number_of_images_between = len(
    df_image_info[
        (df_image_info["width"] > 64)
        & (df_image_info["width"] < 128)
        & (df_image_info["height"] > 64)
        & (df_image_info["height"] < 128)
    ]
)

df = pd.DataFrame(
    {
        "Number of images": [number_of_images_64, number_of_images_between, number_of_images_128],
        "Resolution": ["<=64", "64-128", ">=128"],
    }
)

plt.figure(figsize=(5, 5))
ax = sns.barplot(x="Resolution", y="Number of images", data=df)
for i, v in enumerate(df["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")

plt.title("Number of images by resolution - both dimensions")
plt.show()

In [None]:
number_of_images_64 = len(df_image_info[(df_image_info["width"] <= 64) | (df_image_info["height"] <= 64)])
number_of_images_128 = len(df_image_info[(df_image_info["width"] >= 128) | (df_image_info["height"] >= 128)])
number_of_images_between = len(
    df_image_info[
        ((df_image_info["width"] > 64) & (df_image_info["height"] > 64))
        | ((df_image_info["width"] < 128) & (df_image_info["height"] < 128))
    ]
)

df = pd.DataFrame(
    {
        "Number of images": [number_of_images_64, number_of_images_between, number_of_images_128],
        "Resolution": ["<=64", "64-128", ">=128"],
    }
)

plt.figure(figsize=(5, 5))
ax = sns.barplot(x="Resolution", y="Number of images", data=df)
for i, v in enumerate(df["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")

plt.title("Number of images by resolution - one dimension")
plt.show()

In [None]:
size_64 = 64 * 64
size_128 = 128 * 128

number_of_images_64 = len(df_image_info[df_image_info["image_size"] <= size_64])
number_of_images_128 = len(df_image_info[df_image_info["image_size"] >= size_128])
number_of_images_between = len(
    df_image_info[(df_image_info["image_size"] > size_64) & (df_image_info["image_size"] < size_128)]
)

df = pd.DataFrame(
    {
        "Number of images": [number_of_images_64, number_of_images_between, number_of_images_128],
        "Resolution": ["<=64x64", "64x64-128x128", ">=128x128"],
    }
)

plt.figure(figsize=(5, 5))
ax = sns.barplot(x="Resolution", y="Number of images", data=df)
for i, v in enumerate(df["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")

plt.title("Number of images by resolution - both dimensions")
plt.show()

In [None]:
_, axes = plt.subplots(1, 2, figsize=(15, 5))
sns.boxplot(x=df_image_info["width"], ax=axes[0])
axes[0].set_title("Width")
sns.boxplot(x=df_image_info["height"], ax=axes[1])
axes[1].set_title("Height")
plt.show()

In [None]:
_, axes = plt.subplots(2, 2, figsize=(15, 10))
sns.histplot(df_image_info["laplacian"], ax=axes[0, 0])
axes[0, 0].set_title("Laplacian")
sns.histplot(df_image_info["ssim_score"], ax=axes[0, 1])
axes[0, 1].set_title("SSIM")

sns.histplot(df_image_info["laplacian"], ax=axes[1, 0])
axes[1, 0].set_title("Laplacian - Low values")
axes[1, 0].set_xlim(0, 200)
sns.histplot(df_image_info["ssim_score"], ax=axes[1, 1])
axes[1, 1].set_title("SSIM - Low values")
axes[1, 1].set_xlim(0.7, 0.9)
axes[1, 1].set_ylim(0, 60)
plt.show()

In [None]:
sns.scatterplot(x="ssim_score", y="image_size", data=df_image_info)
plt.title("SSIM vs Image Size")
plt.show()

#### Findings


BASIC

-   There are a lot of different sizes of images.
-   Images don't have sma x,y dimensions.
-   When analyzing image size for both dimensions there are more 64x64 or lower images than 128x128 images and higher.
-   When analyzing image size for only one dimension there are significantly more images that range from 64-128 pixels.
-   When analyzing image size by multiplying x and y dimensions there are more images with size 64x64 than 128x128.
-   This means it is slightly better to resize images to 64x64, this will also mean that model is faster.
-   Most of the images that have low ssim_score are images that have low resolution.
-   There is 5.8% of grayscale images, but they are not low resolution images.

PLAN

-   We are going to resize images to 64x64.
-   Later we are also going to look at 96x96 and 128x128.
-   We also got feedback to remove high-res images, but upon further analysis we decided to keep them as they have important features preserved when resizing.
-   We are algo going to multiply images in other datasets to make it more balanced.

REMOVAL

-   At first we are gonna remove only big outliers, then we are gonna remove more, and see results if model is training better.
-   We are going are going to remove small images, under 15x15 or even 20x20 based on how does model perform.
-   We are going to remove images that have very low Laplacian variance, under 15 or even 20-30 based on how does model perform.
-   Since we are removing low resolution images, we don't necesery need to remove images that have low ssim_score, but we are going to try it and see how does model perform. (0.5, later even 0.8)
-   Aspect ration under 0.2 or above 4 (there are none under 0.2, and few above 4), later <0.3 and >3.33
-   None / all grayslake images if model trains on rgb.


### Data preprocessing


#### Low quality images


In [None]:
df_images = pd.read_csv("../data/raw/faces/image_info.csv")
df_images = get_low_quality_images(df_images)

#### Mutiply images


In [None]:
balanced_df = multiply_images(df_images)

In [None]:
df_old = pd.DataFrame(
    {
        "Class": df_train["Class"].value_counts().index,
        "Number of images": df_train["Class"].value_counts().values,
    }
)

df_new = pd.DataFrame(
    {
        "Class": balanced_df["Class"].value_counts().index,
        "Number of images": balanced_df["Class"].value_counts().values,
    }
)

_, axes = plt.subplots(1, 2, figsize=(15, 5))
ax = sns.barplot(x="Class", y="Number of images", data=df_old, ax=axes[0])
for i, v in enumerate(df_old["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")
ax.set_title("Original dataset")

ax = sns.barplot(x="Class", y="Number of images", data=df_new, ax=axes[1])
for i, v in enumerate(df_new["Number of images"]):
    ax.text(i, v, str(v), ha="center", va="bottom")
ax.set_title("Balanced dataset")
plt.show()

In [None]:
# Calculate the percentage of each class
class_counts_old = df_old["Number of images"]
class_labels_old = df_old["Class"]
class_counts_new = df_new["Number of images"]
class_labels_new = df_new["Class"]

# Plot the pie chart
_, axes = plt.subplots(1, 2, figsize=(15, 5))
axes[0].pie(class_counts_old, labels=class_labels_old, autopct="%1.1f%%")
axes[0].set_title("Original dataset")
axes[1].pie(class_counts_new, labels=class_labels_new, autopct="%1.1f%%")
axes[1].set_title("Balanced dataset")
plt.show()

In [None]:
balanced_df.to_csv("../data/raw/faces/balanced_train.csv", index=False)

-   We see we improved the balance of images in dataset.


#### Process images


This takes a lot of time. only_final_images is set to True, so it will only process images for final model.


In [None]:
only_final_images = True

if os.path.exists("../data/processed"):
    shutil.rmtree("../data/processed")

os.makedirs("../data/processed", exist_ok=True)

if only_final_images:
    df_images = pd.read_csv("../data/raw/faces/image_info.csv")

    df_1 = get_low_quality_images(
        df_images,
        min_width=15,
        min_height=15,
        min_aspect_ratio=0.2,
        max_aspect_ratio=4,
        min_laplacian=10,
        min_ssim=0.5,
    )

    balanced_df = multiply_images(df_images)

    process_images(balanced_df, (64, 64), "rgb", "final_images", True)
else:
    df_images = pd.read_csv("../data/raw/faces/image_info.csv")
    print("Getting low quality images -1st clearing")
    df_1 = get_low_quality_images(
        df_images,
        min_width=15,
        min_height=15,
        min_aspect_ratio=0.2,
        max_aspect_ratio=4,
        min_laplacian=10,
        min_ssim=0.5,
    )
    print("Getting low quality images -2nd clearing")
    df_2 = get_low_quality_images(
        df_images,
        min_width=20,
        min_height=20,
        min_aspect_ratio=0.2,
        max_aspect_ratio=4,
        min_laplacian=10,
        min_ssim=0.5,
    )
    print("Getting low quality images -3rd clearing")
    df_3 = get_low_quality_images(
        df_images,
        min_width=20,
        min_height=20,
        min_aspect_ratio=0.3,
        max_aspect_ratio=3.33,
        min_laplacian=25,
        min_ssim=0.8,
    )

    balanced_df = multiply_images(df_images)
    balanced_df_1 = multiply_images(df_1)
    balanced_df_2 = multiply_images(df_2)
    balanced_df_3 = multiply_images(df_3)

    print("Processing images - no clearing")
    process_all_dimensions_colors(balanced_df, "no_clearing")
    print("Processing images - 1st clearing")
    process_all_dimensions_colors(balanced_df_1, "1_clearing")
    print("Processing images - 2nd clearing")
    process_all_dimensions_colors(balanced_df_2, "2_clearing")
    print("Processing images - 3rd clearing")
    process_all_dimensions_colors(balanced_df_3, "3_clearing")

## B


-   Zdôvodnite výber ML/DL metód vzhľadom na Vami vybraný dataset pre 4.2


Our chosen model is CCN, using Sequential from Tensorflow


# 4.2


## A


-   Modeluje Vami tie vybrané charakteristiky pomocou vhodných ML/DL
    metód. Výsledok modelovania je najlepší model.


#### Main Model


We were already trying different models, just what felt right. We don't have records of this testing. We were doing comparisons of model manually, adding/removing layers, changing BATCH_SIZE. We also tried keras tuner, but it takes too long to run to get some meaningful results (we did not get better model, because we had to lower number of epochs, so it ran in reasonable time).

This is the best model we got in first week of testing. We are going to do another small test (looking especially at learning rate).

The learning rate test showed best learning rate 0.0001


In [None]:
train_ds, val_ds, test_ds = get_dataset(64, "rgb", "final_images", True)
train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y))
input_shape = get_input_shape(train_ds)

In [None]:
model = Sequential(
    [
        # Preprocessing layers
        InputLayer(shape=input_shape[1:]),
        Rescaling(1.0 / 255),
        # Input and first conv block
        Conv2D(32, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Second conv block
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Third conv block
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Flatten and dense layers
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.1),
        Dense(64, activation="relu"),
        Dropout(0.1),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(3, activation="softmax"),
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    batch_size=BATCH_SIZE,
    validation_batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

In [None]:
show_evaluation(model, val_ds, history)

#### Clearing comparision


The testing was done in Testing-GPU with deterministic results. We are going to do another small test (looking especially at learning rate).

Here we tested 1_clearing dataset on all color modes and resolutions.

We exported models history to csv files, so we can compare them.


In [None]:
cvs_files = os.listdir("../model_history")
best_records_df = create_best_records_table(cvs_files)

for file in best_records_df.index:
    history = pd.read_csv(f"../model_history/{file}")
    show_evaluation_df(history, file)

Analysis

-   As the goal of this test was to find what is the best color and resolutions we got these results:
-   64x64 rgb with skipping grayscale images is the best. It also doesn't have overfit (due to image augmentation and dropout).
-   96x96 rgb with skip and without are similar, close behind 64x64 rgb-skip.
-   128x128 rgb is the worst out ot these 3.

Final

-   We are going to use 64x64 rgb with skipping grayscale images, for our final model if subsequent tests show it is the best.


#### Testing increased kernel size for larger images


In [None]:
cvs_files = [
    "history_1_clearing_rgb_128.csv",
    "history_1_clearing_rgb_skip_128.csv",
    "history_1_clearing_rgb_128_kernel.csv",
    "history_1_clearing_rgb_skip_128_kernel.csv",
]

best_records_df = create_best_records_table(cvs_files)


for file in best_records_df.index:
    history = pd.read_csv(f"../model_history/{file}")
    show_evaluation_df(history, file)

-   Using increased kernel size (5,5) instead of (3,3) did not help much (only a little in rgb_skip).
-   It is still not better than 64x64 rgb_skip.


#### Cleaning comparison


In [None]:
df_images = pd.read_csv("../data/raw/faces/image_info.csv")
print("Getting low quality images -1st clearing")
df_1 = get_low_quality_images(
    df_images,
    min_width=15,
    min_height=15,
    min_aspect_ratio=0.2,
    max_aspect_ratio=4,
    min_laplacian=10,
    min_ssim=0.5,
)
print("Getting low quality images -2nd clearing")
df_2 = get_low_quality_images(
    df_images,
    min_width=20,
    min_height=20,
    min_aspect_ratio=0.2,
    max_aspect_ratio=4,
    min_laplacian=10,
    min_ssim=0.5,
)
print("Getting low quality images -3rd clearing")
df_3 = get_low_quality_images(
    df_images,
    min_width=20,
    min_height=20,
    min_aspect_ratio=0.3,
    max_aspect_ratio=3.33,
    min_laplacian=25,
    min_ssim=0.8,
)

balanced_df = multiply_images(df_images)
balanced_df_1 = multiply_images(df_1)
balanced_df_2 = multiply_images(df_2)
balanced_df_3 = multiply_images(df_3)

df_no = pd.DataFrame(
    {
        "Class": balanced_df["Class"].value_counts().index,
        "Number of images": balanced_df["Class"].value_counts().values,
    }
)
df_1 = pd.DataFrame(
    {
        "Class": balanced_df_1["Class"].value_counts().index,
        "Number of images": balanced_df_1["Class"].value_counts().values,
    }
)
df_2 = pd.DataFrame(
    {
        "Class": balanced_df_2["Class"].value_counts().index,
        "Number of images": balanced_df_2["Class"].value_counts().values,
    }
)
df_3 = pd.DataFrame(
    {
        "Class": balanced_df_3["Class"].value_counts().index,
        "Number of images": balanced_df_3["Class"].value_counts().values,
    }
)

_, axes = plt.subplots(2, 2, figsize=(15, 10))
axes[0, 0].pie(df_no["Number of images"], labels=df_no["Class"], autopct="%1.1f%%")
axes[0, 0].set_title("No clearing")
axes[0, 1].pie(df_1["Number of images"], labels=df_1["Class"], autopct="%1.1f%%")
axes[0, 1].set_title("1st clearing")
axes[1, 0].pie(df_2["Number of images"], labels=df_2["Class"], autopct="%1.1f%%")
axes[1, 0].set_title("2nd clearing")
axes[1, 1].pie(df_3["Number of images"], labels=df_3["Class"], autopct="%1.1f%%")
axes[1, 1].set_title("3rd clearing")
plt.show()


def calculate_balance_index(df):
    class_counts = df["Number of images"]
    total_images = class_counts.sum()
    ideal_count = total_images / len(class_counts)
    balance_index = 100 - (class_counts - ideal_count).abs().sum() / (2 * total_images) * 100
    return balance_index


balance_index_no = calculate_balance_index(df_no)
balance_index_1 = calculate_balance_index(df_1)
balance_index_2 = calculate_balance_index(df_2)
balance_index_3 = calculate_balance_index(df_3)

print(f"Balance Index - No clearing: {balance_index_no:.2f}")
print(f"Balance Index - 1st clearing: {balance_index_1:.2f}")
print(f"Balance Index - 2nd clearing: {balance_index_2:.2f}")
print(f"Balance Index - 3rd clearing: {balance_index_3:.2f}")

-   no_clearing and 1_clearing have best balance of images.
-   2_clearing and 3_clearing are slightly behind.
-   Lower balance can lower model prefrormance.


In [None]:
cvs_files = [
    "history_no_clearing_rgb_skip_64.csv",
    "history_1_clearing_rgb_skip_64.csv",
    "history_2_clearing_rgb_skip_64.csv",
    "history_3_clearing_rgb_skip_64.csv",
]

best_records_df = create_best_records_table(cvs_files)


for file in best_records_df.index:
    history = pd.read_csv(f"../model_history/{file}")
    show_evaluation_df(history, file)

-   1_clearing is the best, probably because it has very bad pictures cleared and also balanced dataset.
-   2.nd was 3_clearing, with ~2.8% difference in accuracy. Maybe more balanced dataset would help.
-   Interestingly, no_clearing has the biggest following with 2_clearing and 3_clearing, difference in train/val accuracy, but this is not overfit, this is caused by bigger dropout than needed to avoid overfit. Maybe lowering dropout would increase accuracy a little, but it will most likely not reach accuracy of 1_clearing. Therefore we will proceed with 1_clearing.


#### Learning rate comparison


In [None]:
learning_rates = [
    0.01,
    0.005,
    0.002,
    0.0015,
    0.00075,
    0.0005,
    0.0001,
]

cvs_files = [
    "history_1_clearing_rgb_skip_64.csv",
]

for learning_rate in learning_rates:
    cvs_files.append(f"history_1_clearing_rgb_skip_64_learn{learning_rate}.csv")

best_records_df = create_best_records_table(cvs_files)
for file in best_records_df.index:
    history = pd.read_csv(f"../model_history/{file}")
    show_evaluation_df(history, file)

-   File without \_learn{number}, is base adam model with learning rate 0.001.
-   Lower learning rate is better.
-   Decreasing learning rate to 0.0001 did increase model performance from 89.851% to 92.113%. Although model is now training longer, it is worth it.


### Final model evaluation


In [14]:
train_ds, val_ds, test_ds = get_dataset(64, "rgb", "final_images", True)
train_ds = train_ds.map(lambda x, y: (data_augmentation(x, training=True), y))
input_shape = get_input_shape(train_ds)

Found 34750 files belonging to 3 classes.


In [15]:
model = Sequential(
    [
        # Preprocessing layers
        InputLayer(shape=input_shape[1:]),
        Rescaling(1.0 / 255),
        # Input and first conv block
        Conv2D(32, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Second conv block
        Conv2D(64, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Third conv block
        Conv2D(128, (3, 3), activation="relu"),
        MaxPooling2D((2, 2)),
        Dropout(0.1),
        # Flatten and dense layers
        Flatten(),
        Dense(256, activation="relu"),
        Dropout(0.1),
        Dense(64, activation="relu"),
        Dropout(0.1),
        Dense(16, activation="relu"),
        Dropout(0.1),
        Dense(3, activation="softmax"),
    ]
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0001),
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

Model training is long, we exported trained model to final_model.keras


In [None]:
history = model.fit(
    train_ds,
    epochs=EPOCHS,
    validation_data=val_ds,
    batch_size=BATCH_SIZE,
    validation_batch_size=BATCH_SIZE,
    callbacks=[early_stopping],
)

model.save("final_model.keras")

Epoch 1/100
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 113ms/step - accuracy: 0.4121 - loss: 1.0711 - val_accuracy: 0.5119 - val_loss: 1.0141
Epoch 2/100
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 107ms/step - accuracy: 0.5404 - loss: 0.9665 - val_accuracy: 0.5571 - val_loss: 0.9587
Epoch 3/100
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 113ms/step - accuracy: 0.5828 - loss: 0.9140 - val_accuracy: 0.5949 - val_loss: 0.9092
Epoch 4/100
[1m380/380[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 109ms/step - accuracy: 0.6009 - loss: 0.8817 - val_accuracy: 0.6224 - val_loss: 0.8705
Epoch 5/100
[1m318/380[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m6s[0m 107ms/step - accuracy: 0.6219 - loss: 0.8601

In [None]:
show_evaluation(model, val_ds, history)

In [None]:
# Test_ds
model = load_model("final_model.keras")
loss, accuracy = model.evaluate(test_ds, batch_size=BATCH_SIZE)
print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy * 100:.2f}%")

## B


-   Zhodnotíte Váš prístup a získaný výsledok


First week

-   Our approach at first was just random, we loaded dataset, looked at it a little, removed some outliers and proceed to model.
-   For models, we at first go very basic model and tried different methods to improve it. Namely changing parameters inside model, like number of layers, number of neurons, dropout, batch size, learning rate, etc.
-   We were testing these models manually, as we count exactly see what changed in model based on graphs.
-   We also tried to use keras tuner, but it took too long even for few parameters, and it did not give us good result as we had to lower number of epochs to get results in reasonable time.
-   Our best model was similar to the final one.

Second week

-   After consultations, we got valuable feedback, especially about data preprocessing.
-   We increased number of parameters we are eliminating outliers, this helped only little as primary parameter is image size(we we already using it).
-   We also balanced dataset, not perfectly, but it has pretty good balance, and this helped massively with model performance.
-   We also tested different image sizes and color modes, and we found out that 64x64 rgb with skipping grayscale images is the best.
-   We also tested models on GPU with deterministic results, this took longer but gave us more accurate results as to which model performs better.
