# Build dataset

#### Create the path 

In [18]:
# import the necessary packages
import os
# initialize the path to the *original* input directory of images
ORIG_INPUT_DATASET = "malaria/cell_images"
# initialize the base path to the *new* directory that will contain
# our images after computing the training and testing split
BASE_PATH = "malaria"
# derive the training, validation, and testing directories
TRAIN_PATH = os.path.sep.join([BASE_PATH, "training"])
VAL_PATH = os.path.sep.join([BASE_PATH, "validation"])
TEST_PATH = os.path.sep.join([BASE_PATH, "testing"])
# define the amount of data that will be used training
TRAIN_SPLIT = 0.8
# the amount of validation data will be a percentage of the
# *training* data
VAL_SPLIT = 0.1

In [19]:
from imutils import paths
import random
import shutil
import os

imagePaths = list(paths.list_images(ORIG_INPUT_DATASET))
random.seed(42)
random.shuffle(imagePaths)

i = int(len(imagePaths) * TRAIN_SPLIT)
trainPaths = imagePaths[:i]
testPaths = imagePaths[i:]
# we'll be using part of the training data for validation
i = int(len(trainPaths) * VAL_SPLIT)
valPaths = trainPaths[:i]
trainPaths = trainPaths[i:]

#### Create training, testing, and validition dataset

In [20]:
# define the datasets that we'll be building
datasets = [
    ("training", trainPaths, TRAIN_PATH),
    ("validation", valPaths, VAL_PATH),
    ("testing", testPaths, TEST_PATH)
]

In [21]:
# loop over the datasets

for (dType, imagePaths, baseOutput) in datasets:
    # show which data split we are creating
    print("[INFO] building '{}' split".format(dType))
    
    # if the output base output directory does not exist, create it
    if not os.path.exists(baseOutput):
        print("[INFO] 'creating {}' directory".format(baseOutput))
        os.makedirs(baseOutput)

    # loop over the input image paths
    for inputPath in imagePaths:
        # extract the filename of the input image along with its
        # corresponding class label
        filename = inputPath.split(os.path.sep)[-1]
        label = inputPath.split(os.path.sep)[-2]
        
        # build the path to the label directory
        labelPath = os.path.sep.join([baseOutput, label])
        
        # if the label output directory does not exist, create it
        if not os.path.exists(labelPath):
            print("[INFO] 'creating {}' directory".format(labelPath))
            os.makedirs(labelPath)
        
        # construct the path to the destination image and then copy
        # the image itself
        p = os.path.sep.join([labelPath, filename])
        shutil.copy2(inputPath, p)

[INFO] building 'training' split
[INFO] 'creating malaria/training' directory
[INFO] 'creating malaria/training/Parasitized' directory
[INFO] 'creating malaria/training/Uninfected' directory
[INFO] building 'validation' split
[INFO] 'creating malaria/validation' directory
[INFO] 'creating malaria/validation/Parasitized' directory
[INFO] 'creating malaria/validation/Uninfected' directory
[INFO] building 'testing' split
[INFO] 'creating malaria/testing' directory
[INFO] 'creating malaria/testing/Parasitized' directory
[INFO] 'creating malaria/testing/Uninfected' directory


#### Define a model: VGG16

In [25]:
from keras.applications import VGG16
conv_base = VGG16(weights = "imagenet",include_top = False)
conv_base.summary()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5
Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)  

In [31]:
# set the matplotlib backend so figures can be saved in the background
import matplotlib
matplotlib.use("Agg")
# import the necessary packages
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.optimizers import SGD

import matplotlib.pyplot as plt
import numpy as np
import argparse

totalTrain = len(list(paths.list_images(TRAIN_PATH)))
totalVal = len(list(paths.list_images(VAL_PATH)))
totalTest = len(list(paths.list_images(TEST_PATH)))

In [33]:
# initialize the training training data augmentation object
trainAug = ImageDataGenerator(
    rescale=1 / 255.0,
    rotation_range=20,
    zoom_range=0.05,
    width_shift_range=0.05,
    height_shift_range=0.05,
    shear_range=0.05,
    horizontal_flip=True,
    fill_mode="nearest")

# initialize the validation (and testing) data augmentation object
valAug = ImageDataGenerator(rescale=1 / 255.0)

#### Define train, test, validation generator

In [69]:
# initialize the training generator
trainGen = trainAug.flow_from_directory(
    TRAIN_PATH,
    class_mode="binary",
    target_size=(150, 150),
    color_mode="rgb",
    batch_size=200)

# initialize the validation generator
valGen = valAug.flow_from_directory(
    VAL_PATH,
    class_mode="binary",
    target_size=(150, 150),
    color_mode="rgb",
    batch_size=200)

# initialize the testing generator
testGen = valAug.flow_from_directory(
    TEST_PATH,
    class_mode="binary",
    target_size=(150, 150),
    color_mode="rgb",
    batch_size=200)

Found 19842 images belonging to 2 classes.
Found 2204 images belonging to 2 classes.
Found 5512 images belonging to 2 classes.


#### Extract features

In [63]:
# define the datasets that we will build
generator = [
    ('training', trainGen),
    ('validation', valGen),
    ('testing', testGen)
]

batch_size = 200

def extract_features(generator, sample_count):
    features = np.zeros(shape=(sample_count, 4, 4, 512))
    labels = np.zeros(shape=(sample_count))
    i = 0

    for inputs_batch, labels_batch in generator:
        #print(labels_batch)
        features_batch = conv_base.predict(inputs_batch)
        features[i * batch_size: (i+1) * batch_size] = features_batch
        labels[i * batch_size: (i+1) * batch_size] = labels_batch
        i += 1
        if ((i * batch_size % 1000) == 0 ):
            print("processed size =", i * batch_size)
        if i * batch_size >= sample_count:
            break
            
    return features, labels

In [64]:
valFeatures, valLabels = extract_features(valGen, 2204)

processed size = 1000
processed size = 2000


In [66]:
trainFeatures, trainLabels = extract_features(trainGen, 19842)

processed size = 1000
processed size = 2000
processed size = 3000
processed size = 4000
processed size = 5000
processed size = 6000
processed size = 7000
processed size = 8000
processed size = 9000
processed size = 10000
processed size = 11000
processed size = 12000
processed size = 13000
processed size = 14000
processed size = 15000
processed size = 16000
processed size = 17000
processed size = 18000
processed size = 19000
processed size = 20000


In [70]:
testFeatures, testLabels = extract_features(testGen, 5512)

processed size = 1000
processed size = 2000
processed size = 3000
processed size = 4000
processed size = 5000


#### Save features

In [71]:
import numpy as np
np.save('trainFeatures', trainFeatures)
np.save('trainLabels', trainLabels)

np.save('testFeatures', testFeatures)
np.save('testLabels', testLabels)

np.save('valFeatures', valFeatures)
np.save('valLabels', valLabels)

In [2]:
import numpy as np
trainFeatures = np.load('trainFeatures.npy')
trainLabels = np.load('trainLabels.npy')

testFeatures = np.load('testFeatures.npy')
testLabels = np.load('testLabels.npy')

valFeatures = np.load('valFeatures.npy')
valLabels = np.load('valLabels.npy')

In [3]:
trainFeatures.shape

(19842, 4, 4, 512)

In [7]:
train_features = np.reshape(trainFeatures, (len(trainFeatures), 4 * 4 * 512))

In [8]:
train_features.shape

(19842, 8192)

In [4]:
testFeatures.shape

(5512, 4, 4, 512)

In [12]:
test_features = np.reshape(testFeatures, (len(testFeatures), 4 * 4 * 512))
test_features.shape

(5512, 8192)

In [5]:
valFeatures.shape

(2204, 4, 4, 512)

In [13]:
val_features = np.reshape(valFeatures, (len(valFeatures), 4 * 4 * 512))
val_features.shape

(2204, 8192)

In [25]:
# import the necessary packages
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from sklearn.metrics import classification_report
import numpy as np
import pickle
import os

In [26]:
CLASSES = ["Parasitized", "Uninfected"]
model = Sequential()
model.add(Dense(256, input_shape=(4 * 4 * 512,), activation="relu"))
model.add(Dense(16, activation="relu"))
model.add(Dense(1, activation="sigmoid"))

opt = Adam(learning_rate=2e-3)
model.compile(loss="binary_crossentropy", optimizer=opt,metrics=["accuracy"])

In [27]:
BATCH_SIZE=200
print("[INFO] training simple network...")
H = model.fit(
    train_features.reshape(19842, 8192), trainLabels, batch_size=200, 
    steps_per_epoch=19842 // 200,
    validation_data= (val_features.reshape(2204,8192), valLabels),
    validation_steps=2204 // 200,
    epochs=25)

[INFO] training simple network...
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [31]:
print("[INFO] evaluating network...")
predIdxs = model.predict(x=test_features.reshape(5512,8192),
    steps=(5512 // 200) + 1)
#predIdxs = np.argmax(predIdxs, axis=1)
print(classification_report(testLabels, (predIdxs>0.5).astype(int)))

accuracy = model.evaluate(x=test_features, y=testLabels, batch_size=200)
print('test acc: ', accuracy[1])

[INFO] evaluating network...
              precision    recall  f1-score   support

         0.0       0.95      0.92      0.94      2726
         1.0       0.92      0.96      0.94      2786

    accuracy                           0.94      5512
   macro avg       0.94      0.94      0.94      5512
weighted avg       0.94      0.94      0.94      5512

test acc:  0.9381349682807922


Thus, the final accuracy score is 0.9381