<a href="https://colab.research.google.com/github/MaximeGloesener/HandsOnAI-Challenge1/blob/master/Challenge1TestData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **1. Hardware Informations (GPU)**

In [None]:
!/opt/bin/nvidia-smi
!rm -rf sample_data

In [None]:
!pip install ImageHash

# **2. Import des librairies**

In [None]:
from IPython.display import Image, HTML, display
from matplotlib import pyplot as plt
import numpy as np 
import os
import cv2
import csv
import tensorflow as tf
from tensorflow import keras
from keras.preprocessing import image
from keras.models import Model, load_model
from keras import backend as K
from keras.applications.vgg16 import VGG16, preprocess_input #224*224
from keras.applications.xception import Xception, preprocess_input, decode_predictions #299*299
from keras.applications.mobilenet import MobileNet, preprocess_input, decode_predictions #224*224
from keras.preprocessing.image import ImageDataGenerator
from keras.losses import categorical_crossentropy
from keras.layers import Dense, GlobalAveragePooling2D, Activation, Flatten, Dropout
from keras.callbacks import ModelCheckpoint, EarlyStopping
import math
import argparse
import matplotlib
import imghdr
import pickle as pkl
import datetime
from cycler import cycler
from PIL import Image, ImageEnhance
from google.colab import files
from tqdm import tqdm
import imagehash
print("Tensorflow version: "+tf.__version__)
print("Keras version: " + tf.keras.__version__)

#**3.Téléchargement des jeux de données**

In [None]:
bases_path_after="bases"
test="test_data"
if os.path.exists(bases_path_after) == False:
    os.makedirs(bases_path_after)
if not os.path.exists(test):
  os.makedirs(test)

In [None]:
!rm -rf FIRE_DATABASE_1.tar
!rm -rf sample_data
!wget https://cluster.ig.umons.ac.be/HackIA21/databases/FIRE_DATABASE_1.tar
!tar xf FIRE_DATABASE_1.tar -C 'bases' --one-top-level
!rm FIRE_DATABASE_1.tar

In [None]:
!rm -rf FIRE_DATABASE_2.tar
!rm -rf sample_data
!wget https://cluster.ig.umons.ac.be/HackIA21/databases/FIRE_DATABASE_2.tar
!tar xf FIRE_DATABASE_2.tar -C 'bases' --one-top-level
!rm FIRE_DATABASE_2.tar

In [None]:
!rm -rf FIRE_DATABASE_3.tar
!rm -rf sample_data
!wget https://cluster.ig.umons.ac.be/HackIA21/databases/FIRE_DATABASE_3.tar
!tar xf FIRE_DATABASE_3.tar -C 'bases' --one-top-level
!rm FIRE_DATABASE_3.tar

In [None]:
!rm -rf sample_data
! wget --no-check-certificate https://download.smartappli.eu/small.tar
! tar xf small.tar -C 'bases' --one-top-level
! rm small.tar

In [None]:
!rm -rf sample_data
! wget --no-check-certificate https://download.smartappli.eu/medium.tar
! tar xf medium.tar -C 'bases' --one-top-level
! rm medium.tar

In [None]:
!rm -rf sample_data
! wget --no-check-certificate https://download.smartappli.eu/big.tar
! tar xf big.tar -C 'bases' --one-top-level
! rm big.tar

#**4. Merge tous les jeux de données ensembles**

In [None]:
!cp -a /content/bases/FIRE_DATABASE_1/no_fire//. /content/bases/big/no_fire/
!cp -a /content/bases/FIRE_DATABASE_2/no_fire/. /content/bases/big/no_fire/
!cp -a /content/bases/FIRE_DATABASE_3/no_fire/. /content/bases/big/no_fire/
!cp -a /content/bases/medium/no_fire/. /content/bases/big/no_fire/
!cp -a /content/bases/small/no_fire/. /content/bases/big/no_fire/
!cp -a /content/bases/FIRE_DATABASE_1/start_fire/. /content/bases/big/start_fire/
!cp -a /content/bases/FIRE_DATABASE_2/start_fire/. /content/bases/big/start_fire/
!cp -a /content/bases/FIRE_DATABASE_3/start_fire/. /content/bases/big/start_fire/
!cp -a /content/bases/medium/start_fire/. /content/bases/big/start_fire/
!cp -a /content/bases/small/start_fire/. /content/bases/big/start_fire/
!cp -a /content/bases/FIRE_DATABASE_1/fire/. /content/bases/big/fire/
!cp -a /content/bases/FIRE_DATABASE_2/fire/. /content/bases/big/fire/
!cp -a /content/bases/FIRE_DATABASE_3/fire/. /content/bases/big/fire/
!cp -a /content/bases/medium/fire/. /content/bases/big/fire/
!cp -a /content/bases/small/fire/. /content/bases/big/fire/

In [None]:
# Nombre d'images après avoir tout merge
len(os.listdir("/content/bases/big/fire"))+len(os.listdir("/content/bases/big/no_fire"))+len(os.listdir("/content/bases/big/start_fire"))

In [None]:
# Données de test
!rm -rf sample_data
!wget --no-check-certificate https://download.smartappli.eu/test.tar
! tar xf test.tar -C 'test_data' --one-top-level
! rm test.tar

Important de tester les doublons en utilisant un hash cryptographique qui comparer les images pixels par pixels. Avec un hash robuste, on trouve des faux doublons. Le hash robuste permet de détecter les doublons si resize/légère modifiction mais ce n'est pas le cas ici dans les datasets. 

# **5. Analyse du dataset**

In [None]:
def read_image(file_name):
  """
  Fonction qui prend en entrée une path d'image et qui return RGB (utile pour plot)
  """
  img = cv2.imread(file_name, 3)
  b,g,r = cv2.split(img)
  rgb_image = cv2.merge([r,g,b])
  return rgb_image

def plot(images, noms):
  f, axarr = plt.subplots(1,len(images))
  for i in range(len(images)):
    axarr[i].imshow(images[i])
    axarr[i].title.set_text(noms[i])


In [None]:
# Analyse des données
# On sait que dans les datasets, il y a parfois plusieurs fois la même image
# But : analyser chaque dataset et trouver le nombre d'images en doublons
def analyse_dataset(folder_name, affichage = False):
  """
  Fonction qui prend en entrée le directory d'un dataset et qui va chercher les images qui sont présentes plusieurs fois pour ce même dataset
  Affichage = True si on veut plot les images qui sont en doubles et leur nom
  Return: - le nombre de doublons dans un dataset
          - le pourcentage de doublons
  """
  img_hashes = dict()
  total = 0
  doublons = 0

  for dir in os.listdir(folder_name):
    for image in os.listdir(os.path.join(folder_name, dir)):
      total += 1
      image = os.path.join(os.getcwd(), folder_name, dir, image)
      hash = imagehash.dhash(Image.open(image))
      if hash in img_hashes:
        doublons += 1
        #print(f'{image} doublons de {img_hashes[hash]}')
        if affichage:
          i = read_image(image) 
          x = read_image(img_hashes[hash])
          plot([x,i],[image.split("/")[-1], img_hashes[hash].split("/")[-1]])
      else:
        img_hashes[hash] = image

  return doublons, doublons/total*100


d2, p2 = analyse_dataset("/content/bases/big/")


print(f'Il y a {d2} doublons dans le dataset = {p2}% des données')


# **6. Suppression des doublons**

In [None]:
classes = ['fire/', 'start_fire/', 'no_fire/']

hash_dict = {}
directory_path = "/content/bases/big/" 
for classe in classes:

  path = os.path.join(directory_path, classe)

  for image in (os.listdir(path)):

    path_image = path + image

    hash = imagehash.dhash(Image.open(path_image))

    if hash not in hash_dict:
      hash_dict[hash] = path_image

    else:
      #print(f'{hash_dict[hash]} is equal to {path_image}')
      os.remove(path_image)
      #print(f'{path_image} removed')

In [None]:
# Taille après suppression des doublons
len(os.listdir("/content/bases/big/fire"))+len(os.listdir("/content/bases/big/no_fire"))+len(os.listdir("/content/bases/big/start_fire"))

# **7. Import des données sur le drive**

In [28]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
!cp -av '/content/bases/big/' '/content/gdrive/MyDrive/Challenge1/'

In [31]:
!cp -av '/content/test_data/' '/content/gdrive/MyDrive/Challenge1/test_data/'

'/content/test_data/' -> '/content/gdrive/MyDrive/Challenge1/test_data/test_data'
