**Mon CNN**

**Import des librairies**

In [None]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Activation, Dense, Flatten, BatchNormalization, Conv2D, MaxPool2D, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import categorical_crossentropy
from tensorflow.keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
from tensorflow.keras.applications.vgg16 import VGG16

from sklearn.metrics import confusion_matrix

from tensorboard.plugins.hparams import api as hp

import datetime
import itertools
import os
import shutil
import random
import glob

In [None]:
# Load the TensorBoard notebook extension so that the %tensorboard magic used later is available.
%load_ext tensorboard

**Paramètres CUDA pour la modélisation en local**

In [None]:
# Prerequisite: CUDA and cuDNN must be installed for local GPU training.
gpu_config = tf.config.experimental
gpus = gpu_config.list_physical_devices('GPU')

# Enable memory growth on every GPU that was detected.
for gpu in gpus:
    gpu_config.set_memory_growth(gpu, True)

print("Nombre de GPU disponible : ", len(gpus))

**Paramètres divers**

In [None]:
# --- Training schedule ---
N_EPOCHS = 150
BATCH_SIZE = 10

# --- Dataset split, expressed as fractions of N_IMAGE_PER_CLASS ---
VALIDATION_RATIO = 0.2
TEST_RATIO = 0.1
TRAIN_RATIO = 1 - VALIDATION_RATIO - TEST_RATIO  # whatever is left after validation and test

# --- Dataset shape ---
N_BREEDS = 3                       # number of breeds (classes) drawn at random
N_IMAGE_PER_CLASS = 140            # images used per breed across the three splits
IMG_HEIGHT, IMG_WIDTH = 224, 224   # target size the images are resized to

# --- Reproducibility ---
RANDOM_STATE = 42

**On fixe les graines aléatoires pour la répétabilité de l'expérience**

In [None]:
# Seed every random number generator the notebook relies on, not only the stdlib one:
# - `random` drives the breed selection and the train/valid/test sampling,
# - NumPy's global RNG is used by the Keras image generators (shuffling, augmentation),
# - TensorFlow's global seed controls weight initialisation.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
tf.random.set_seed(RANDOM_STATE)

**Préparation des dossiers pour la génération d'images et l'augmentation**

In [None]:
# Move into the images folder; the relative paths ("train/...", "valid/...", "test/...")
# used by the following cells are resolved from here.
os.chdir('data/images')

In [None]:
# Remove the split folders (and their contents) left over from a previous run, if any.
# A plain loop is used because rmtree is called only for its side effect.
for split_dir in ("train", "valid", "test"):
    if os.path.isdir(split_dir):
        shutil.rmtree(split_dir)

In [None]:
# Collect the breed names: one sub-directory per breed.
# - plain files are skipped so that the later os.listdir(breed) calls cannot fail on them,
# - the generated split folders are excluded in case they still exist,
# - the result is sorted because os.listdir returns entries in arbitrary order, which
#   would make the seeded random.sample below pick different breeds from one machine to another.
list_dir_breeds = sorted(
    entry for entry in os.listdir()
    if os.path.isdir(entry) and entry not in ("train", "valid", "test")
)

In [None]:
# Summarise, in a DataFrame, how many images each breed folder contains.
# The breed name is left-aligned and padded to 40 characters, as in the displayed table.
breed_image_counts = [
    [f"{breed:40}", len(os.listdir(breed))]
    for breed in list_dir_breeds
]
df_breds = pd.DataFrame(breed_image_counts, columns=["race", "nombre_images"])
df_breds

In [None]:
# Randomly pick N_BREEDS breeds; list_dir_breeds is overwritten with the selection,
# so re-running this cell alone samples from the already reduced list.
list_dir_breeds = random.sample(list_dir_breeds, N_BREEDS)
list_dir_breeds

Pour éviter d'augmenter les données de validation lorsque la séparation est faite par Keras avec ImageDataGenerator, plusieurs techniques existent ; je vais séparer le jeu de données en amont, puis créer un ImageDataGenerator distinct pour chaque sous-ensemble.

In [None]:
# Create the (empty) train / valid / test folders for each selected breed.
for dir_breeds in list_dir_breeds:
    path_train = f"train/{dir_breeds}"
    path_valid = f"valid/{dir_breeds}"
    path_test = f"test/{dir_breeds}"

    # exist_ok=True keeps the cell safe to re-run; a plain loop is used because
    # makedirs is called only for its side effect.
    for path in (path_train, path_valid, path_test):
        os.makedirs(path, exist_ok=True)

In [None]:
# Fail early when a breed does not have enough images for the requested split.
# Having exactly N_IMAGE_PER_CLASS images is sufficient, hence ">=" rather than ">".
# NOTE(review): df_breds is built before the random breed selection, so this checks
# every breed folder, not only the selected ones — confirm that this is intended.
min_images = df_breds["nombre_images"].min()
assert min_images >= N_IMAGE_PER_CLASS, (
    f"Need at least {N_IMAGE_PER_CLASS} images per breed, "
    f"but the smallest breed folder only has {min_images}"
)

In [None]:
# Number of images per breed in each split (fractions truncated towards zero).
train_size = int(TRAIN_RATIO*N_IMAGE_PER_CLASS)
valid_size = int(VALIDATION_RATIO*N_IMAGE_PER_CLASS)
test_size = int(TEST_RATIO*N_IMAGE_PER_CLASS)

In [None]:
# Train / validation / test split: copy a random, disjoint subset of each breed's
# images into the three split folders.
for dir_breeds in list_dir_breeds:
    path_train = f"train/{dir_breeds}"
    path_valid = f"valid/{dir_breeds}"
    path_test = f"test/{dir_breeds}"

    # Only fill the folders when all three are empty; otherwise stop so that
    # images from different runs are never mixed.
    if os.listdir(path_train) or os.listdir(path_valid) or os.listdir(path_test):
        print("Les dossiers ne sont pas vides")
        break

    # Sorted so that the seeded sampling below is reproducible
    # (os.listdir returns entries in arbitrary order).
    list_path_images = sorted(os.listdir(path=dir_breeds))

    # A single draw without replacement, then sliced: the three splits are disjoint
    # by construction and no per-image list.remove() is needed.
    selected_images = random.sample(list_path_images, train_size + valid_size + test_size)
    splits = (
        (path_train, selected_images[:train_size]),
        (path_valid, selected_images[train_size:train_size + valid_size]),
        (path_test, selected_images[train_size + valid_size:]),
    )
    for split_path, split_images in splits:
        for path_image in split_images:
            shutil.copy(f"{dir_breeds}/{path_image}", split_path)

In [None]:
# Go back up two levels, undoing the earlier os.chdir('data/images').
os.chdir('../../')

In [None]:
# Locations of the freshly generated split folders, relative to the project root.
train_path, valid_path, test_path = (
    f"data/images/{split_name}" for split_name in ("train", "valid", "test")
)

In [None]:
# Build the directory iterators that feed the model.

# Settings shared by the three iterators.
flow_kwargs = dict(target_size=(IMG_HEIGHT, IMG_WIDTH),
                   classes=list_dir_breeds,
                   batch_size=BATCH_SIZE)

# Training images: pixel values scaled by 1/255 plus random augmentation.
print("train batches :")
train_image_data_generator = ImageDataGenerator(
    rescale=1./255,
    rotation_range=40,
    width_shift_range=0.2,
    height_shift_range=0.2,
    shear_range=0.2,
    zoom_range=0.2,
    horizontal_flip=True,
)
train_batches = train_image_data_generator.flow_from_directory(directory=train_path,
                                                               **flow_kwargs)

# Validation images: scaling only, no augmentation.
print("valid batches :")
valid_image_data_generator = ImageDataGenerator(rescale=1./255)
valid_batches = valid_image_data_generator.flow_from_directory(directory=valid_path,
                                                               **flow_kwargs)

# Test images: scaling only, and shuffling disabled so the iteration order is fixed.
print("test batches :")
test_image_data_generator = ImageDataGenerator(rescale=1./255)
test_batches = test_image_data_generator.flow_from_directory(directory=test_path,
                                                             shuffle=False,
                                                             **flow_kwargs)

**Visualisation de l'augmentation avant entrainement**

In [None]:
# Draw the next batch from the (augmented) training iterator.
imgs, labels = next(train_batches)

In [None]:
# Custom helper used to visualise the augmentation
def plot_images(images_arr, labels=None, rescaled=True,print_shape=True):
    """Display a batch of images side by side on a single row.

    Parameters
    ----------
    images_arr : sequence of image arrays
        Images to display. Each one is cast to ``uint8`` before being shown.
        When ``rescaled`` is True the whole batch is multiplied by 255.0 first,
        so it must support that operation (e.g. a NumPy array).
    labels : sequence, optional
        One title per image. When given, it takes precedence over ``print_shape``.
    rescaled : bool, default True
        Multiply the pixel values by 255 before display (to undo a 1/255 scaling).
    print_shape : bool, default True
        When no labels are given, use each image's shape as its title.

    Returns
    -------
    None
        The figure is rendered with ``plt.show()``. Nothing is drawn for an empty batch.
    """
    size = len(images_arr)
    if size == 0:
        # plt.subplots rejects a grid with zero columns; there is nothing to draw anyway.
        return
    if rescaled:
        images_arr = images_arr*255.0
    # squeeze=False always returns a 2-D array of Axes, so a batch holding a single
    # image works too (by default subplots(1, 1) returns a bare Axes with no .flatten()).
    fig, axes = plt.subplots(1, size, figsize=(20,20), squeeze=False)
    axes = axes.flatten()
    for i, (img, ax) in enumerate(zip(images_arr, axes)):
        img = img.astype(np.uint8)
        ax.imshow(img)
        if labels is not None:
            ax.set_title(labels[i])
        elif print_shape is True:
            ax.set_title(np.array(img).shape)
        ax.axis('off')
    plt.tight_layout()
    plt.show()

In [None]:
# Show the augmented training batch; each image is titled with its shape.
plot_images(imgs, print_shape=True)

**Définir les paramètres du gridsearch**

- Nombre de filtres de la première couche de convolution : 32, 64 et 128
- Nombre de filtres de la seconde couche de convolution : 32, 64 et 128
- Nombre d'unités de la couche Dense (fully connected) : 30, 60 et 120
- L'optimiseur (Adam) et son taux d'apprentissage (0.0003) sont fixés ; aucun dropout n'est utilisé dans cette recherche.

In [None]:
# Hyperparameter search space.
# Both convolution layers are searched over the same set of filter counts.
CONV_FILTER_CHOICES = [32, 64, 128]

HP_CONV_FILTER_1 = hp.HParam("conv_filter_1", hp.Discrete(CONV_FILTER_CHOICES))
HP_CONV_FILTER_2 = hp.HParam("conv_filter_2", hp.Discrete(CONV_FILTER_CHOICES))
HP_FC_UNITS = hp.HParam("fc_units", hp.Discrete([30, 60, 120]))

# Tag under which the optimisation metric is written to the summaries.
METRIC_ACCURACY = "accuracy"

**Création et configuration des fichiers dans tensorboard**


In [None]:
# One timestamped log folder per execution of the notebook.
run_timestamp = datetime.datetime.now().strftime('%Y%m%d-%H%M%S')
log_dir = "logs/hparam_tuning_" + run_timestamp

# Declare the searched hyperparameters and the reported metric at the root of the log folder.
searched_hparams = [HP_CONV_FILTER_1, HP_CONV_FILTER_2, HP_FC_UNITS]
reported_metrics = [hp.Metric(METRIC_ACCURACY, display_name="Accuracy")]

with tf.summary.create_file_writer(log_dir).as_default():
    hp.hparams_config(hparams=searched_hparams, metrics=reported_metrics)

**Fonctions pour la modélisation avec gridsearch**

In [None]:
# Build, train and evaluate one CNN for a given set of hyperparameters

def model(hparams, run_dir=None) :
    """Train a two-convolution CNN with the given hyperparameters.

    The network is Conv2D -> MaxPool2D -> Conv2D -> MaxPool2D -> Flatten -> Dense -> Dense
    (softmax over N_BREEDS). It is trained on the module-level ``train_batches`` and
    validated on ``valid_batches`` for at most ``N_EPOCHS`` epochs, with early stopping
    on ``val_loss``.

    Parameters
    ----------
    hparams : dict
        Maps HP_CONV_FILTER_1, HP_CONV_FILTER_2 and HP_FC_UNITS to the values to use.
    run_dir : str, optional
        Directory receiving the TensorBoard and HParams callback logs of this trial.
        Defaults to a sub-directory of the module-level ``log_dir`` named after the
        hyperparameter values, so that trials never write into the same folder.

    Returns
    -------
    float
        The validation accuracy of the last epoch that was run.
    """
    # Drop the Keras global state left by previous trials: many models are created
    # in a loop during the grid search and that state would otherwise keep growing.
    tf.keras.backend.clear_session()

    # Named `cnn` (not `model`) to avoid shadowing this function inside its own body.
    cnn = Sequential([
        Conv2D(filters = hparams[HP_CONV_FILTER_1],
               kernel_size = (3, 3),
               activation = 'relu',
               padding = 'same',
               input_shape = (IMG_HEIGHT,IMG_WIDTH,3)
              ),
        MaxPool2D(pool_size = (2, 2),
                  strides=2
                 ),
        Conv2D(filters = hparams[HP_CONV_FILTER_2],
               kernel_size=(3, 3),
               activation = 'relu',
               padding = 'same'
              ),
        MaxPool2D(pool_size = (2, 2),
                  strides = 2
                 ),
        Flatten(),
        Dense(units = hparams[HP_FC_UNITS],
              activation = 'relu'
             ),
        Dense(units = N_BREEDS,
              activation = 'softmax'
             )
    ])

    # Compile
    cnn.compile(optimizer=tf.optimizers.Adam(learning_rate=0.0003),
                loss='categorical_crossentropy',
                metrics=['accuracy']
               )

    # Tag identifying this hyperparameter combination (used for file and folder names).
    trial_tag = '_'.join(str(x) for x in list(hparams.values()))

    # Each trial logs into its own folder; with a single shared folder the training
    # curves of all trials would be written on top of each other.
    if run_dir is None:
        run_dir = f"{log_dir}/trial_{trial_tag}"

    # Callbacks
    my_callbacks = [
        tf.keras.callbacks.TensorBoard(run_dir),
        hp.KerasCallback(run_dir, hparams),
        tf.keras.callbacks.EarlyStopping(monitor = "val_loss", patience = 12),
        # NOTE(review): the file name says "VGG16" although the network built above is a
        # custom CNN; the name is kept unchanged so existing checkpoints are still found.
        tf.keras.callbacks.ModelCheckpoint(f"model_VGG16_{N_BREEDS}_{trial_tag}.hdf5",
                                           save_best_only=True,
                                           monitor='val_loss',
                                           mode='min')
    ]

    # Train and keep the history
    history = cnn.fit(x = train_batches,
                      steps_per_epoch = len(train_batches),
                      validation_data = valid_batches,
                      validation_steps = len(valid_batches),
                      epochs = N_EPOCHS,
                      verbose = 2,
                      callbacks=my_callbacks
                     )

    # Return the optimisation metric.
    # NOTE(review): this is the last epoch's value, whereas the checkpoint keeps the
    # epoch with the lowest val_loss — confirm which one should be reported.
    return history.history['val_accuracy'][-1]

In [None]:
# Record one trial: the hyperparameters used and the metric obtained

def run(run_dir, hparams) :
    """Train one model and log its hyperparameters and accuracy under ``run_dir``.

    Parameters
    ----------
    run_dir : str
        Directory in which the summary file writer for this trial is created.
    hparams : dict
        Hyperparameter values, recorded with ``hp.hparams`` and passed on to ``model``.
    """
    trial_writer = tf.summary.create_file_writer(run_dir)

    with trial_writer.as_default() :
        hp.hparams(hparams)
        accuracy = model(hparams)

        # Turn the metric into a 0-d tensor, then into a NumPy scalar
        accuracy_tensor = tf.convert_to_tensor(accuracy)
        accuracy = tf.reshape(accuracy_tensor, []).numpy()

        tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)

In [None]:
# Display the shell command that starts TensorBoard on this run's log folder.
f"python -m tensorboard.main --logdir=\"{log_dir}\""

**Modélisation**

In [None]:
# Grid search: train one model for every combination of the hyperparameter space

session_num = 0

# Same iteration order as three nested loops (the last hyperparameter varies fastest).
search_grid = itertools.product(HP_CONV_FILTER_1.domain.values,
                                HP_CONV_FILTER_2.domain.values,
                                HP_FC_UNITS.domain.values)

for conv_filter_1, conv_filter_2, fc_units in search_grid:
    hparams = {
        HP_CONV_FILTER_1: conv_filter_1,
        HP_CONV_FILTER_2: conv_filter_2,
        HP_FC_UNITS: fc_units,
    }
    run_name = f"/run-{session_num}"
    run_dir = log_dir + run_name
    print(f"--- Starting trial:{run_name}")
    print({h.name: hparams[h] for h in hparams})
    run(run_dir, hparams)
    session_num += 1

**Visualisation des résultats**

In [None]:
%tensorboard --logdir log_dir

ou, pour afficher le tableau de bord en cours d'exécution, on lance la commande suivante en remplaçant `<log_dir>` par le chemin du répertoire où les différents journaux d'exécution ont été stockés :

python -m tensorboard.main --logdir=<log_dir>

et pour arrêter TensorBoard (sous Windows) :
taskkill /IM "tensorboard.exe" /F


**Analyse des résultats**

In [None]:
# Load the hyperparameter results table for the 20-breed experiment.
# NOTE(review): presumably exported from the TensorBoard HParams dashboard — confirm the file's origin.
df_20breeds = pd.read_csv("hparams_table_20breeds.csv")

In [None]:
# "complexity" is a rough size proxy: the product of the three tuned hyperparameters.
# It is computed from df_20breeds itself, the only results frame loaded in this notebook.
df_20breeds["complexity"] = df_20breeds["conv_filter_1"]*df_20breeds["conv_filter_2"]*df_20breeds["fc_units"]

In [None]:
# Display the results ordered by increasing complexity (df_20breeds itself is left unsorted).
df_20breeds.sort_values(by="complexity")

In [None]:
# Accuracy of each trial against its complexity.
fig = px.scatter(df_20breeds, x="complexity", y="Accuracy", width=500, height=400)
fig.show()

In [None]:
# Mean accuracy for each value of conv_filter_1.
# Only the Accuracy column is aggregated, so non-numeric columns in the table (if any)
# cannot make .mean() fail; the plotted values are unchanged.
fig = px.line(df_20breeds.groupby('conv_filter_1')[["Accuracy"]].mean(), y="Accuracy", width=500, height=400)
fig.show()

In [None]:
# Mean accuracy for each value of conv_filter_2.
# Only the Accuracy column is aggregated, so non-numeric columns in the table (if any)
# cannot make .mean() fail; the plotted values are unchanged.
fig = px.line(df_20breeds.groupby('conv_filter_2')[["Accuracy"]].mean(), y="Accuracy", width=500, height=400)
fig.show()

In [None]:
# Mean accuracy for each value of fc_units.
# Only the Accuracy column is aggregated, so non-numeric columns in the table (if any)
# cannot make .mean() fail; the plotted values are unchanged.
fig = px.line(df_20breeds.groupby('fc_units')[["Accuracy"]].mean(), y="Accuracy", width=500, height=400)
fig.show()