In [25]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import glob
import pathlib

import os
from PIL import Image as PImage

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Activation, Input, Dropout, Flatten, Conv2D, MaxPooling2D, LeakyReLU
from tensorflow.keras.preprocessing import image_dataset_from_directory

from sklearn.metrics import confusion_matrix, classification_report
import itertools



In [8]:
data_dir = pathlib.Path('C:/Users/Planade/Documents/Datascientest/Projet Blood Cells/Mendeley_Data_Barcelona/PBC_dataset_normal_DIB')
#data_dir est un chemin vers un dossier d'images déjà nettoyé des fichiers corrompu par le code de nettoyage du notebook ML
#mais les cellules de codes ci-dessous permettent de faire le nettoyage si l'on repart du fichier de base de kaggle

In [9]:
image_count = len(list(data_dir.glob('*/*.jpg')))
print(image_count)

list_dir=list(data_dir.glob('*/*.jpg'))

17092


In [10]:
#fonction de nettoyage des doublons :
def nettoyage(liste_images):
    liste_im=[]
    for n in list_dir:
        filename_one, extension = os.path.splitext(n)
        liste_im.append(filename_one.split('\\')[-1])
    
    detect_doubles=[liste_im[i].split('(') for i in range(len(liste_im))]
    liste_propre=[liste_im[i] for i in range(len(detect_doubles)) if len(detect_doubles[i])<2]
    path_propre=[liste_images[i] for i in range(len(detect_doubles)) if len(detect_doubles[i])<2]
    liste_suppr=[liste_images[i] for i in range(len(detect_doubles)) if len(detect_doubles[i])>1]

    return liste_propre,path_propre,liste_suppr

In [11]:
#préparation du nettoyage
nouv_liste,list_dir,list_suppr=nettoyage(list_dir)
print(len(nouv_liste),len(list_dir),len(list_suppr))

17092 17092 0


In [12]:
#Les doubles
for file in list_suppr:
    os.remove(file)

#vérification
image_count = len(list(data_dir.glob('*/*.jpg')))
print(image_count)

#les fichiers corrompus :
enlev=[]
for p in list_dir :
    try :
        PImage.open(p)
    except :   
        enlev.append(p)
        
print(len(enlev))

#vérification :
for l in enlev:
    os.remove(l)
  

17092
0


In [13]:
#nouvelle liste des chemins sur dossiers images nettoyés :
list_dir=list(data_dir.glob('*/*.jpg'))
image_count = len(list_dir)
print(image_count)

17092


In [14]:
#préparation des données pour entraînement du modèle :

batch_size=50
img_height=360
img_width=360

In [15]:
train_data,val_data = tf.keras.preprocessing.image_dataset_from_directory(
                                            data_dir, validation_split=0.2, subset='both', seed=123, color_mode='rgb',
                                            image_size=(img_height, img_width), batch_size=batch_size)

Found 17092 files belonging to 8 classes.
Using 13674 files for training.
Using 3418 files for validation.


In [16]:
#vérification :
for image_batch, labels_batch in train_data:
  print(image_batch.shape)
  print(labels_batch.shape)
  print(labels_batch)
  break


(50, 360, 360, 3)
(50,)
tf.Tensor(
[6 7 2 7 7 1 1 7 2 3 6 3 3 0 7 7 6 3 5 3 3 6 5 3 5 7 4 6 5 5 5 6 6 1 7 1 3
 3 2 0 6 4 2 5 7 1 3 4 4 1], shape=(50,), dtype=int32)


In [18]:
class_names = train_data.class_names
num_classes=len(class_names)
print(class_names, '\n',num_classes,'classes.')

['basophil', 'eosinophil', 'erythroblast', 'ig', 'lymphocyte', 'monocyte', 'neutrophil', 'platelet'] 
 8 classes.


In [None]:
# Optimisation du processus :

AUTOTUNE = tf.data.experimental.AUTOTUNE
train_data = train_data.cache().shuffle(1000).prefetch(buffer_size = AUTOTUNE)
validation_data = validation_data.cache().prefetch(buffer_size=AUTOTUNE)

In [19]:
#construction du modèle :

model = Sequential([
    
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    layers.Conv2D(32, kernel_size=(5, 5), padding='same', activation='LeakyReLU'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    
    layers.Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='tanh'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Flatten(),
    
    layers.Dense(16, activation='relu'),
    layers.Dense(num_classes, activation='softmax')
])

model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_1 (Rescaling)     (None, 360, 360, 3)       0         
                                                                 
 conv2d_4 (Conv2D)           (None, 360, 360, 32)      2432      
                                                                 
 max_pooling2d_4 (MaxPooling  (None, 180, 180, 32)     0         
 2D)                                                             
                                                                 
 dropout_4 (Dropout)         (None, 180, 180, 32)      0         
                                                                 
 conv2d_5 (Conv2D)           (None, 180, 180, 16)      4624      
                                                                 
 dropout_5 (Dropout)         (None, 180, 180, 16)      0         
                                                        

In [15]:
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [None]:
history = model.fit(train_data, 
                    validation_data=val_data, 
                    epochs=10)

In [19]:
images = pathlib.Path('TestA')
test_data = image_dataset_from_directory(images,
                                        subset=None,
                                         seed=123,
                                         image_size=(360,360))

Found 4350 files belonging to 8 classes.


In [20]:
#test_data_norm = test_data.map(normalizer) # NON, le modèle contient déjà une ligne de normalisation

In [21]:
model.evaluate(test_data)



[2.269357919692993, 0.0009195402381010354]

In [22]:
y_pred = model.predict(test_data_norm)
y_pred_class = y_pred.argmax(axis=1)

In [None]:
y_true = np.concatenate([y for x,y in test_data_norm])

In [None]:
print(classification_report(y_true, y_pred_class))

In [None]:
#Affichage de la matrice de confusion

cnf_matrix = confusion_matrix(y_true, y_pred_class)

plt.figure(figsize=(10,10))

plt.imshow(cnf_matrix, interpolation='nearest',cmap='Blues')
plt.title("Confusion matrix test_data",fontsize=20)
plt.colorbar()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation =90,fontsize=15)
plt.yticks(tick_marks, class_names,fontsize=15)

for i, j in itertools.product(range(cnf_matrix.shape[0]), range(cnf_matrix.shape[1])):
    plt.text(j, i, cnf_matrix[i, j],
             horizontalalignment = "center",
             color = "white" if cnf_matrix[i, j] > ( cnf_matrix.max() / 2) else "black",
            fontsize=15)

plt.ylabel('True labels',fontsize=20)
plt.xlabel('Predicts labels',fontsize=20)
plt.show()

In [None]:
#Modèle 2 fois plus de couches de convolution : 

In [21]:
model2 = Sequential([
    
    layers.experimental.preprocessing.Rescaling(1./255, input_shape=(img_height, img_width, 3)),
    layers.Conv2D(32, kernel_size=(5, 5), padding='same', activation='LeakyReLU'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    
    layers.Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='tanh'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(32, kernel_size=(3, 3), padding='same', activation='LeakyReLU'),
    layers.MaxPooling2D(),
    layers.Dropout(0.2),
    
    layers.Conv2D(16, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='tanh'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Conv2D(8, kernel_size=(3, 3), padding='same', activation='relu'),
    layers.Dropout(0.4),
    layers.MaxPooling2D(),
    
    layers.Flatten(),
    
    layers.Dense(16, activation='relu'),
    layers.Dense(num_class_names, activation='softmax')
])

model2.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 rescaling_3 (Rescaling)     (None, 360, 360, 3)       0         
                                                                 
 conv2d_16 (Conv2D)          (None, 360, 360, 32)      2432      
                                                                 
 max_pooling2d_16 (MaxPoolin  (None, 180, 180, 32)     0         
 g2D)                                                            
                                                                 
 dropout_16 (Dropout)        (None, 180, 180, 32)      0         
                                                                 
 conv2d_17 (Conv2D)          (None, 180, 180, 16)      4624      
                                                                 
 dropout_17 (Dropout)        (None, 180, 180, 16)      0         
                                                      

In [23]:
model2.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

In [24]:
history = model2.fit(train_data, 
                    validation_data=val_data, 
                    epochs=10)

Epoch 1/10
 17/274 [>.............................] - ETA: 56:48 - loss: 2.1093 - accuracy: 0.1106

KeyboardInterrupt: 

In [None]:
model2.evaluate(test_data)

In [None]:
y_pred2 = model2.predict(test_data)
y_pred_class2 = y_pred.argmax(axis=1)

In [None]:
y_true2 = np.concatenate([y for x,y in test_data])

print(classification_report(y_true2, y_pred_class2))

In [None]:
cnf_matrix2 = confusion_matrix(y_true2, y_pred_class2)

plt.figure(figsize=(10,10))

plt.imshow(cnf_matrix2, interpolation='nearest',cmap='Blues')
plt.title("Confusion matrix",fontsize=20)
plt.colorbar()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names, rotation =90,fontsize=15)
plt.yticks(tick_marks, class_names,fontsize=15)

for i, j in itertools.product(range(cnf_matrix2.shape[0]), range(cnf_matrix2.shape[1])):
    plt.text(j, i, cnf_matrix2[i, j],
             horizontalalignment = "center",
             color = "white" if cnf_matrix2[i, j] > ( cnf_matrix2.max() / 2) else "black",
            fontsize=15)

plt.ylabel('True labels',fontsize=20)
plt.xlabel('Predicts labels',fontsize=20)
plt.show()