In [2]:
#!/usr/bin/env python
__author__ = "Sreenivas Bhattiprolu"
__license__ = "Feel free to copy, I appreciate if you acknowledge Python for Microscopists"


# https://youtu.be/vF21cC-8G1U
# https://youtu.be/Joh3LOaG8Q0

"""

Dataset from: https://lhncbc.nlm.nih.gov/publication/pub9932

Binary problem:
Question is: Is the image uninfected? If yes, probability is close to 1.
If no, the probablility is close to 0.

This is because we added label 1 to uninfected images. 
In summary, probability result close to 1 reflects uninfected image
and close to 0 reflects parasitized image

"""

'\n\nDataset from: https://lhncbc.nlm.nih.gov/publication/pub9932\n\nBinary problem:\nQuestion is: Is the image uninfected? If yes, probability is close to 1.\nIf no, the probablility is close to 0.\n\nThis is because we added label 1 to uninfected images. \nIn summary, probability result close to 1 reflects uninfected image\nand close to 0 reflects parasitized image\n\n'

#  Charger les librairies

## Keras

In [3]:
# Keras est une librairie pour construire et entraîner des réseaux de neurones
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, BatchNormalization
from keras.layers import Activation, Dropout, Flatten, Dense
from keras.layers import SpatialDropout2D
from keras.applications.mobilenet_v2 import MobileNetV2
from keras.layers import Activation, Dropout, Flatten, Dense, GlobalAveragePooling2D
from keras.losses import BinaryCrossentropy
from keras.optimizers import adam_v2
#from keras import backend as K
#from keras.utils import to_categorical

## OS CV2 

In [4]:
import os # Pour lire les images

import cv2
import matplotlib.pyplot as plt  # used by the plotting cells (imshow, ROC curve)
from PIL import Image

## Sk-learn 

In [5]:
# 1. Diviser le dataset (train / test)
# 2. Évaluer le modèle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve
from sklearn.metrics import confusion_matrix
from sklearn.metrics import auc
import warnings
warnings.simplefilter("ignore")   #### Supprimer l'avertissement

# Charger les données

In [6]:
# Root folder holding one sub-folder per class ('dirty' and 'clean').
# Built with os.path.join so the path is valid on every OS (the previous
# literal used a Windows-only backslash); the trailing '' keeps a trailing
# separator so plain string concatenation with this value still works.
#image_directory = 'cell_images/'
image_directory = os.path.join('images', 'train', '')

SIZE = 150        # not used by the loading code below; left defined for other cells
BATCH_SIZE = 16   # mini-batch size
EPOCHS = 150      # number of training epochs

IMAGE_SHAPE = (224, 224)            # (width, height) every image is resized to
JPEG_EXTENSIONS = ('jpeg', 'jpg')   # accepted file extensions, compared in lower case


def _load_class_images(class_dir, class_label):
    """Load every JPEG found directly inside ``class_dir``.

    Parameters
    ----------
    class_dir : str
        Folder to scan (not recursive).
    class_label : int
        Label attached to every image loaded from this folder.

    Returns
    -------
    tuple(list, list)
        The images as float32 RGB arrays of shape (224, 224, 3) scaled to
        [0, 1], and a list of the same length filled with ``class_label``.
    """
    images, labels = [], []
    # Sorted so the sample order (and therefore the seeded train/test split)
    # does not depend on the order in which the OS lists the directory.
    for image_name in sorted(os.listdir(class_dir)):
        extension = image_name.rsplit('.', 1)[-1].lower()
        if extension not in JPEG_EXTENSIONS:
            continue

        image_path = os.path.join(class_dir, image_name)
        bgr = cv2.imread(image_path)
        if bgr is None:
            # cv2.imread signals a corrupt/unreadable file by returning None
            # instead of raising; skip it rather than crash further down.
            print("Skipping unreadable image:", image_path)
            continue

        # OpenCV decodes to BGR; convert so the channel order really is RGB
        # before the array is interpreted by PIL, matplotlib and the model.
        rgb = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
        resized = Image.fromarray(rgb).resize(IMAGE_SHAPE)
        # float32 halves the memory of the default float64.
        images.append(np.asarray(resized, dtype=np.float32) / 255.0)
        labels.append(class_label)
    return images, labels


# Plain Python lists; they are converted to numpy arrays before the split.
# Label convention: 0 = dirty, 1 = clean.
dataset = []
label = []

for folder_name, class_label in (('dirty', 0), ('clean', 1)):
    class_images, class_labels = _load_class_images(
        os.path.join(image_directory, folder_name), class_label
    )
    dataset.extend(class_images)
    label.extend(class_labels)

# Diviser le trainset et testset

In [7]:
# Turn the Python lists into arrays so they can be split and fed to Keras.
dataset = np.array(dataset)
label = np.array(label)

# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    label,
    test_size=0.20,
    random_state=0,
)

# Report the array shapes and how many samples carry label 1 in each subset.
for subset_name, features, targets in (
    ("train", X_train, y_train),
    ("test", X_test, y_test),
):
    print(f"X_{subset_name} {features.shape}")
    print(f"y_{subset_name} {targets.shape}")
    print(f"label_1 {np.sum(targets)}")


X_train (1769, 224, 224, 3)
y_train (1769,)
label_1 975
X_test (443, 224, 224, 3)
y_test (443,)
label_1 229


# Normalisation (sans soustraction de la moyenne)

In [8]:
#Without scaling (normalize) the training may not converge. 
#Normalization is a rescaling of the data from the original range 
#so that all values are within the range of 0 and 1.
from keras.utils.np_utils import normalize
#X_train = normalize(X_train, axis=1)
#X_test = normalize(X_test, axis=1)

#Do not do one-hot encoding as it generates a shape of (num, 2)
#But the network expects an input of (num, 1) for the last layer for binary classification
#y_train = to_categorical(y_train)
#y_test = to_categorical(y_test)

# Exécutions du modèle

## Former le modèle

In [9]:
# Transfer learning: a MobileNetV2 backbone (Keras loads ImageNet weights by
# default) followed by a small binary-classification head.

# (height, width, channels); must match the size the images were resized to.
INPUT_SHAPE = (224, 224, 3)

# include_top=False drops MobileNetV2's own classification layers and keeps
# only the convolutional feature extractor.
mobile_v2 = MobileNetV2(include_top=False, input_shape=INPUT_SHAPE)

# Number of leading backbone layers whose weights stay frozen; every layer
# from this index onwards is fine-tuned. Tunable hyper-parameter: the original
# author noted that values in the 80-120 range gave the same result here.
layers_to_freeze = 80

for layer in mobile_v2.layers[:layers_to_freeze]:
    layer.trainable = False

# Head: global average pooling + one sigmoid unit.
# Do not use softmax for binary classification: softmax suits mutually
# exclusive multi-class problems where the outputs sum to 1 and the highest
# wins. A single sigmoid unit outputs one probability, which is what a binary
# (label 0 vs label 1) problem needs.
model = Sequential([
    mobile_v2,
    GlobalAveragePooling2D(),
    Dense(1, activation='sigmoid'),
])

# The last layer already applies a sigmoid, so the loss must be told it
# receives probabilities (from_logits=False). With from_logits=True the
# sigmoid would effectively be applied twice, squashing the gradients and
# distorting the loss value.
# The learning rate stays low because pretrained backbone layers are being
# fine-tuned.
model.compile(
    loss=BinaryCrossentropy(from_logits=False),
    optimizer=adam_v2.Adam(1e-5),
    metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is called once
# and not wrapped in print().
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mobilenetv2_1.00_224 (Funct  (None, 7, 7, 1280)       2257984   
 ional)                                                          
                                                                 
 global_average_pooling2d (G  (None, 1280)             0         
 lobalAveragePooling2D)                                          
                                                                 
 dense (Dense)               (None, 1)                 1281      
                                                                 
Total params: 2,259,265
Trainable params: 2,040,065
Non-trainable params: 219,200
_________________________________________________________________
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 mobilenetv2_1.00_22

In [10]:
# Total number of layers in the MobileNetV2 backbone; layers_to_freeze is an
# index into this list, so it should stay below this value.
len(mobile_v2.layers)

154

In [None]:
# Train the model. The test split doubles as validation data, so the
# per-epoch validation metrics are computed on X_test / y_test.
# Note: the epoch count is given literally here; the EPOCHS constant defined
# with the other settings is not used by this call.
history = model.fit(
    x=X_train,
    y=y_train,
    validation_data=(X_test, y_test),
    epochs=300,
    batch_size=BATCH_SIZE,
    shuffle=False,   # keep the sample order identical for every epoch
    verbose=1,
)

# Persist the trained model (architecture + weights) in HDF5 format.
model.save('waste_model_10epochs.h5')

## Tester le modèle 

In [None]:
# Sanity-check the trained model on one held-out image.
# Labels were assigned at loading time as 0 = dirty and 1 = clean, so a
# prediction close to 1 means the model considers the image clean and a
# prediction close to 0 means it considers the image dirty.

n = 2  # index in X_test of the image to inspect
img = X_test[n]
plt.imshow(img)

# The model expects a batch: add a leading axis so the input has the layout
# (num images, x, y, c).
input_img = img[np.newaxis, ...]
print("The prediction for this image is: ", model.predict(input_img))
print("The actual label for this image is: ", y_test[n])

# Instead of checking image by image, the model can be evaluated on the
# whole test set for accuracy.


In [None]:

# Reload the trained model from disk so it can be evaluated without
# training again.
from keras.models import load_model

# Must be the same file name the training cell passes to model.save();
# 'waste_model.h5' is not written anywhere in the visible notebook.
model = load_model('waste_model_10epochs.h5')

# evaluate() returns the loss followed by the compiled metrics; only the
# accuracy is reported here.
# (Original author's note: about 82.5% accuracy after 300 epochs.)
_, acc = model.evaluate(X_test, y_test)
print("Accuracy = ", (acc * 100.0), "%")

# Overall accuracy does not show how each class (dirty vs clean) is doing;
# the confusion matrix answers that.

# Modèle d'évaluation

## Confusion matrix

In [None]:

# Confusion matrix
# Compare true labels with predicted labels, counting correct and wrong
# predictions per class. The sigmoid output is a probability, so a threshold
# is needed to turn it into a 0/1 label.

# Decision threshold (0.908 was another value tried by the author).
mythreshold = 0.44

# 1 where the predicted probability reaches the threshold, 0 otherwise.
y_pred = np.greater_equal(model.predict(X_test), mythreshold).astype(int)

cm = confusion_matrix(y_test, y_pred)
print(cm)

# Try several thresholds and inspect the matrix: the goal is a good balance
# between true positives, true negatives, false positives and false
# negatives. The ROC curve helps pick the threshold.

## ROC

In [None]:
"""
Receiver Operating Characteristic (ROC) Curve is a plot that helps us 
visualize the performance of a binary classifier when the threshold is varied. 
"""
# ROC

# Flatten the model's predictions into a 1-D vector of scores.
y_preds = model.predict(X_test).reshape(-1)

# False/true positive rates for every candidate threshold.
fpr, tpr, thresholds = roc_curve(y_test, y_preds)

plt.figure(1)
plt.plot([0, 1], [0, 1], 'y--')   # diagonal reference line
plt.plot(fpr, tpr, marker='.')    # the classifier's ROC curve
plt.gca().set(
    xlabel='False positive rate',
    ylabel='True positive rate',
    title='ROC curve',
)
plt.show()

"""
#One way to find the best threshold once we calculate the true positive 
and false positive rates is ...
The optimal cut off point would be where “true positive rate” is high 
and the “false positive rate” is low. 
Based on this logic let us find the threshold where tpr-(1-fpr) is zero (or close to 0)
"""



## AUC

In [None]:
import pandas as pd

# One row per ROC point: 'tf' holds tpr - (1 - fpr), which is zero where the
# true positive rate equals the true negative rate.
i = np.arange(len(tpr))
roc = pd.DataFrame(
    {'tf': tpr - (1 - fpr), 'thresholds': thresholds},
    index=i,
)

# Keep the single row whose 'tf' value is closest to 0.
ideal_roc_thresh = roc.iloc[roc['tf'].abs().argsort()[:1]]
print("Ideal threshold is: ", ideal_roc_thresh['thresholds'])

# This threshold can now be used in the confusion matrix to visualize the
# balance between tp, fp, tn and fn.


# AUC
# The area under the ROC curve summarizes how well the classifier performs:
# it is the chance that the model can distinguish between the positive and
# the negative class.

auc_value = auc(fpr, tpr)
print("Area under curve, AUC = ", auc_value)