<a href="https://colab.research.google.com/github/MaximeGloesener/HandsOnAI-Challenge1/blob/master/Challenge1_gdrive.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Hardware Information (GPU)**

In [None]:
# Run nvidia-smi to report the GPU available to this runtime
!/opt/bin/nvidia-smi
# Delete the "sample_data" folder from the working directory
!rm -rf sample_data

In [None]:
!pip install ImageHash

# **2. Importing libraries**

In [None]:
from IPython.display import Image, HTML, display
from matplotlib import pyplot as plt
import numpy as np 
import os
import cv2
import csv
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import image
from keras.models import Model, load_model
from keras import backend as K
from keras.applications.vgg16 import VGG16, preprocess_input #224*224
from keras.applications.xception import Xception, preprocess_input, decode_predictions #299*299
from keras.applications.mobilenet import MobileNet, preprocess_input, decode_predictions #224*224
from keras.preprocessing.image import ImageDataGenerator
from keras.losses import categorical_crossentropy
from keras.layers import Dense, GlobalAveragePooling2D, Activation, Flatten, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
import math
import argparse
import matplotlib
import imghdr
import pickle as pkl
import datetime
from cycler import cycler
from PIL import Image, ImageEnhance
from google.colab import files
from tqdm import tqdm
import imagehash
print("Tensorflow version: "+tf.__version__)
print("Keras version: " + tf.keras.__version__)

# **3. Download of the training datasets "FIRE_DATABASE_X"**

In [None]:
# Working folders: "bases" is the folder later passed to make_dataset and
# "test_data" is where the test archive is extracted.
bases_path_after = "bases"
test = "test_data"

# exist_ok=True makes the cell safe to re-run without a separate existence check
os.makedirs(bases_path_after, exist_ok=True)
os.makedirs(test, exist_ok=True)

In [None]:
# Test data: download the archive, extract it into "test_data", then delete the archive
!rm -rf sample_data
!wget --no-check-certificate http://195.154.53.219/downloads/test.tar
! tar xf test.tar -C 'test_data' --one-top-level
! rm test.tar

Il est important de tester les doublons en utilisant un hash cryptographique, qui compare les images pixel par pixel. Avec un hash robuste, on trouve de faux doublons. Le hash robuste permet de détecter les doublons en cas de redimensionnement ou de légère modification, mais ce n'est pas le cas ici dans les datasets.

In [None]:
def read_image(file_name):
  """
  Load an image from disk and return it in RGB channel order (handy for plotting).

  Args:
    file_name: path of the image file to read.

  Returns:
    The decoded image with its channels ordered red, green, blue.

  Raises:
    OSError: if OpenCV could not read or decode the file.
  """
  img = cv2.imread(file_name, 3)
  # cv2.imread does not raise on a missing or unreadable file, it returns None;
  # fail here with the offending path instead of deep inside the conversion
  if img is None:
    raise OSError(f"cv2.imread could not read image: {file_name}")
  # OpenCV decodes to BGR, convert to RGB
  return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

def plot(images, noms):
  """
  Show several images side by side on a single row, each with its own title.

  Args:
    images: sequence of images that ``imshow`` can display.
    noms: titles, one per image, in the same order as ``images``.
  """
  if len(images) == 0:
    # nothing to draw; a figure with zero columns cannot be created
    return
  # squeeze=False always yields a 2-D array of axes, so a single image
  # can be indexed the same way as several
  _, axarr = plt.subplots(1, len(images), squeeze=False)
  for ax, img, nom in zip(axarr[0], images, noms):
    ax.imshow(img)
    ax.title.set_text(nom)


In [None]:
# Analyse des données
# On sait que dans les datasets, il y a parfois plusieurs fois la même image
# But : analyser chaque dataset et trouver le nombre d'images en doublons
def analyse_dataset(folder_name, affichage = False):
  """
  Count the images that appear more than once inside a single dataset.

  The dataset is read as ``folder_name/<class folder>/<image file>``.
  Every image is fingerprinted with ``imagehash.dhash``; an image whose hash
  was already seen is counted as a duplicate of the first image with that hash.

  Args:
    folder_name: directory of the dataset to analyse.
    affichage: if True, plot each duplicate next to the image it duplicates,
      each one titled with its own file name.

  Returns:
    A tuple ``(duplicates, percentage)``: the number of duplicate images and
    their share of all scanned images in percent (0.0 for an empty dataset).
  """
  first_seen = {}  # hash -> path of the first image that produced it
  total = 0
  doublons = 0

  for class_name in os.listdir(folder_name):
    class_dir = os.path.join(os.getcwd(), folder_name, class_name)
    if not os.path.isdir(class_dir):
      # ignore stray files lying next to the class folders
      continue
    for file_name in os.listdir(class_dir):
      total += 1
      image_path = os.path.join(class_dir, file_name)
      # the context manager closes the file handle once the hash is computed
      with Image.open(image_path) as pil_image:
        image_hash = imagehash.dhash(pil_image)
      if image_hash not in first_seen:
        first_seen[image_hash] = image_path
        continue
      doublons += 1
      if affichage:
        original_path = first_seen[image_hash]
        # images and titles are passed in the same order (original, duplicate)
        plot(
          [read_image(original_path), read_image(image_path)],
          [os.path.basename(original_path), os.path.basename(image_path)],
        )

  if total == 0:
    # empty dataset: avoid dividing by zero
    return 0, 0.0
  return doublons, doublons/total*100
"""
d1, p1 = analyse_dataset("/content/bases/FIRE_DATABASE_1/")
d2, p2 = analyse_dataset("/content/bases/FIRE_DATABASE_2/")
d3, p3 = analyse_dataset("/content/bases/FIRE_DATABASE_3/", affichage = True)
print('DATASET 1 ')
print(f'Il y a {d1} doublons dans le dataset = {p1}% des données')
print('DATASET 2 ')
print(f'Il y a {d2} doublons dans le dataset = {p2}% des données')
print('DATASET 3 ')
print(f'Il y a {d3} doublons dans le dataset = {p3}% des données')
"""

In [None]:
# Créer un seul dataset à partir des 3 en ne prenant en compte que des images uniques (supprimer tous les doublons)
def make_dataset(base_directory):
  """
  Merge every dataset found under ``base_directory`` into one duplicate-free dataset.

  ``base_directory`` is read as ``<dataset>/<class folder>/<image file>``.
  The merged copy is written to ``all_data`` in the current working directory,
  which is deleted and recreated on every call. An image is kept only if its
  ``imagehash.dhash`` fingerprint has not been seen before.

  Args:
    base_directory: folder that contains the source datasets.

  Returns:
    The set of hashes of the images kept in the merged dataset; useful during
    data augmentation to avoid adding images that are already present.
  """
  import shutil  # local import: only needed here

  directory_path = os.path.join(os.getcwd(), 'all_data')
  # start from a clean folder (plain-Python replacement for "!rm -rf all_data",
  # a shell escape that only works inside IPython)
  shutil.rmtree(directory_path, ignore_errors=True)
  for class_name in ("fire", "no_fire", "start_fire"):
    os.makedirs(os.path.join(directory_path, class_name), exist_ok=True)

  images_hash = set()
  for source_name in os.listdir(base_directory):
    source_dir = os.path.join(os.getcwd(), base_directory, source_name)
    if not os.path.isdir(source_dir):
      # ignore stray files next to the dataset folders
      continue
    for class_name in os.listdir(source_dir):
      class_dir = os.path.join(source_dir, class_name)
      if not os.path.isdir(class_dir):
        continue
      target_dir = os.path.join(directory_path, class_name)
      # cv2.imwrite cannot write into a missing folder, so create any
      # class folder that is not one of the three created above
      os.makedirs(target_dir, exist_ok=True)
      for file_name in os.listdir(class_dir):
        path = os.path.join(class_dir, file_name)
        # the context manager closes the file handle once the hash is computed
        with Image.open(path) as pil_image:
          image_hash = imagehash.dhash(pil_image)
        if image_hash in images_hash:
          continue
        images_hash.add(image_hash)
        target = os.path.join(target_dir, file_name)
        if os.path.exists(target):
          # same file name but a different image (coming from another dataset):
          # prefix with the dataset name instead of overwriting the earlier image
          target = os.path.join(target_dir, f"{source_name}_{file_name}")
        cv2.imwrite(target, cv2.imread(path))
  return images_hash
# Build the merged "all_data" folder from "bases" and keep the set of image hashes it returns
hashes = make_dataset("bases")

In [None]:
# Mount Google Drive at /content/gdrive
from google.colab import drive
drive.mount('/content/gdrive')
# Disabled: copy of the merged dataset to Google Drive
#%cp -av "/content/all_data/" "/content/gdrive/MyDrive/Challenge1/"

In [None]:
# Number of entries in each class folder of the dataset stored on Drive
for class_dir in (
    "gdrive/MyDrive/Challenge1/all_data/fire",
    "gdrive/MyDrive/Challenge1/all_data/start_fire",
    "gdrive/MyDrive/Challenge1/all_data/no_fire",
):
    print(len(os.listdir(class_dir)))

In [None]:
"""
# Analyse des images dans un directory
directory = '/content/gdrive/MyDrive/Challenge1/all_data/start_fire'

for index, img in enumerate(os.listdir(directory)):
  img = os.path.join(os.getcwd(), directory, img)
  fig = plt.figure()
  image = read_image(img)
  plt.imshow(image)
  plt.title(img)
"""

# **4. Create the labels file "classes.txt"**

In [None]:
# Write the three class labels to classes.txt, one per line
!printf '%s\n' 'fire' 'no_fire' 'start_fire'> classes.txt

# **5. Training parameters and selection of the pretrained model**

In [None]:
# Fix the random seed (42) through the Keras utility
tf.keras.utils.set_random_seed(42)

In [None]:
# --- Values editable from the notebook form (trailing "#@param" annotations) ---
nbr_batch_size=8 #@param [1,2,4,8,16,32,64,128] {type:"raw"}
input_dim=224 #@param [224,299] {type:"raw"}  
dataset_name='all_data' #@param ["all_data"]
epochs = 30 #@ param {type:"slider", min:5, max:100, step:5}
classifier = "Xception" #@param ["ResNet50","VGG19","Xception","MobileNet","DenseNet169"] {type:"string"}

# --- Fixed settings ---
nb_classes = 3
classes_path = "classes.txt"
csv_path = 'result.csv'
log_path = 'logs'

# --- Derived paths ---
# Dataset folder on the mounted Drive
dataset_path = os.path.join("gdrive/MyDrive/Challenge1", dataset_name)
# One result folder per classifier
result_path = 'results/' + classifier

# Run summary; the validation metrics start at the placeholder value -1
log = dict(
    epochs=epochs,
    batch_size=nbr_batch_size,
    val_loss=-1,
    val_acc=-1,
)

In [None]:
# Display the dataset folder that the training cells will read
print(dataset_path)

# **6. Get the number of classes**

In [None]:
# Get the class names
with open(classes_path, 'r') as f:
    classes = f.readlines()
    classes = list(map(lambda x: x.strip(), classes))
num_classes = len(classes)

In [None]:
# Display the number of labels read from the classes file
print(num_classes)

# **8. Selection and configuration of the training dataset**

In [None]:
# Training subset of the dataset (80% of the images)
train_ds = tf.keras.preprocessing.image_dataset_from_directory(
    dataset_path,                       # Path of the dataset
    validation_split=0.2,               # Data division : validation (20%), train (80%)
    subset="training",                  # Selection of training data
    seed=42,                            # Initialization of random generator (for permutations)
    image_size=(input_dim, input_dim),  # Input size of images, kept equal to the model input size
    batch_size=nbr_batch_size,          # Batch_size
    label_mode="categorical",           # Conversion to One-Hot format
)

# **9. Selection and configuration of the validation dataset**

In [None]:
# Validation subset of the dataset (20% of the images)
val_ds = tf.keras.preprocessing.image_dataset_from_directory(
    dataset_path,                       # Path of the dataset
    validation_split=0.2,               # Data division : validation (20%), train (80%)
    subset="validation",                # Selection of validation data
    seed=42,                            # Initialization of random generator (for permutations)
    image_size=(input_dim, input_dim),  # Input size of images, kept equal to the model input size
    batch_size=nbr_batch_size,          # Batch_size
    label_mode="categorical",           # Conversion to One-Hot format
)

# **10. Download the pretrained model**

In [None]:
# Xception backbone with ImageNet weights and without its original classifier
base_model = Xception(include_top = False, weights ='imagenet',input_shape = (input_dim,input_dim,3))
# NOTE(review): no input preprocessing/rescaling is applied in this notebook before
# the images reach the backbone; check whether the pretrained weights expect
# xception.preprocess_input to be applied first.

# New classification head: two dense layers with dropout, then a softmax over the classes
flat = Flatten()(base_model.output)
hidden = Dense(128, activation='relu')(flat)
hidden = Dropout(0.8)(hidden)
hidden = Dense(64, activation='relu')(hidden)
hidden = Dropout(0.4)(hidden)
predictions = Dense(num_classes, activation='softmax')(hidden)

# Full network: backbone inputs -> class probabilities
model = Model(inputs=base_model.inputs, outputs=predictions)

# **13. Model training**

In [None]:
tf.keras.backend.clear_session()
# Make every layer trainable, backbone included
for layer in model.layers:
    layer.trainable = True

# Candidate optimizers; only opt3 (RMSprop) is used to compile the model
# NOTE(review): the `decay` argument may be rejected by recent Keras optimizers;
# confirm against the installed version
opt = keras.optimizers.SGD(learning_rate=0.0001,decay=1e-6)
# `learning_rate` replaces the deprecated `lr` alias
opt2 = keras.optimizers.Adam(learning_rate=0.0001)
opt3 = keras.optimizers.RMSprop(learning_rate = 0.0001,decay =1e-6)
# Recompile the model so that the trainable flags and the optimizer take effect
model.compile(loss='categorical_crossentropy',optimizer=opt3,metrics=['accuracy'])


# Create the folder intended for the saved model (no error if it already exists)
os.makedirs(result_path, exist_ok=True)


# Stop when val_loss has not improved for 5 epochs and roll back to the best
# weights seen, so the model kept afterwards is not the one from the last epoch
keras_callback = [
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        verbose=2,
        restore_best_weights=True,
    )
]

history=model.fit(
    train_ds,
    steps_per_epoch=len(train_ds),    # len() is already an integer batch count
    epochs=epochs,
    validation_data=val_ds,
    validation_steps=len(val_ds),
    verbose=1,
    callbacks = keras_callback
)

# **14. Save your model**


In [None]:
# Save the trained model to "xception2.h5" in the current working directory
# NOTE(review): the training cell creates `result_path` for the saved model, but this path does not use it — confirm the intended location
model.save('xception2.h5')

# **15. Visualization of training/validation curves**

In [None]:
import matplotlib.pyplot as plt

# Accuracy per epoch: training curve first, validation curve second
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(history.history[curve])
plt.title('model accuracy')
plt.xlabel('epoch')
plt.ylabel('accuracy')
plt.legend(['train', 'valid'], loc='upper left')
plt.show()

In [None]:
# Test set built from the extracted archive
test_ds = tf.keras.preprocessing.image_dataset_from_directory(
    directory='test_data/test',          # path to the dataset
    label_mode='categorical',            # conversion to one-hot format
    batch_size=nbr_batch_size,           # mini-batch size
    image_size=(input_dim, input_dim),   # size of the input images
    seed=42,                             # initialisation of the random generator (permutations)
)

In [None]:
# Evaluate the model on every batch of the test set
score = model.evaluate(test_ds, steps=len(test_ds))
# The loss is a raw value, not a percentage: print it without the "%" sign
print("%s: %.4f" % (model.metrics_names[0], score[0]))
# The accuracy is a fraction in [0, 1]: show it as a percentage
print("%s: %.2f%%" % (model.metrics_names[1], score[1]*100))

# **16. Test the model with a test image**

In [None]:
# URL of an online image, presumably a sample fire picture
# NOTE(review): `filename` is not referenced by the other visible cells (the prediction cell reads "fog.jpg") — confirm it is still needed
filename = "https://www.ecologie.gouv.fr/sites/default/files/styles/standard/public/Feux.png"

In [None]:
%matplotlib inline
classes = train_ds.class_names
image_path =  "fog.jpg"
img = Image.open(image_path).convert('RGB')
x = tf.keras.utils.img_to_array(img,data_format='channels_last')
x = tf.keras.preprocessing.image.smart_resize(x, size=(input_dim,input_dim))
x = np.expand_dims(x, axis=0)
# predict
pred = model.predict(x,batch_size=1)[0]

for (pos,prob) in enumerate(pred):
    class_name = classes[pos]
    if (pos == np.argmax(pred)) :
        img = cv2.imread(image_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        font = cv2.FONT_HERSHEY_COMPLEX 
        textsize = cv2.getTextSize(class_name, font, 1, 2)[0]
        textX = (img.shape[1] - textsize[0]) / 2
        textY = (img.shape[0] + textsize[1]) / 2
        cv2.putText(img, class_name, (int(textX)-10, int(textY)), font, 2, (255,0,0), 6, cv2.LINE_AA)
        plt.imshow(img)
    print("Class Name : %s" % (class_name), "---", "Class Probability: %.2f%%" % (prob*100))
plt.show()