In [None]:
# import the necessary packages
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications import imagenet_utils
from tensorflow.keras.preprocessing.image import img_to_array
from tensorflow.keras.preprocessing.image import load_img
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
import progressbar
import h5py
import random
import os

In [None]:
# Install gdown (Google Drive downloader). `%pip` installs into the
# environment of the running kernel, which `!pip` does not guarantee.
%pip install gdown



In [None]:
# Download the dataset archive (saved as mask.zip). The URL is quoted so
# the shell never treats the `?` in it as a glob character.
!gdown "https://drive.google.com/uc?id=1DRuyR__pu2lw3d1QeY22v5SvXwV4U_xn"

Downloading...
From: https://drive.google.com/uc?id=1DRuyR__pu2lw3d1QeY22v5SvXwV4U_xn
To: /content/mask.zip
233MB [00:02, 80.4MB/s]


In [None]:
# Extract the archive. `-o` overwrites existing files without prompting,
# so re-running this cell does not stall on an interactive question.
!unzip -o mask.zip

In [None]:
# import the necessary packages
import h5py
import os

class HDF5DatasetWriter:
  """
  Buffered writer that stores data rows and integer class labels in an
  HDF5 file.

  Rows are accumulated in memory and written to disk once `bufSize` rows
  are buffered. The writer can be used as a context manager so that the
  remaining buffer is flushed and the file is closed when the block exits:

      with HDF5DatasetWriter(dims, path) as writer:
        writer.add(rows, labels)
  """

  def __init__(self, dims, outputPath, dataKey="images",bufSize=1000):
    """
    The constructor to HDF5DatasetWriter accepts four parameters, two of which are optional.

    Args:
    dims: controls the dimension or shape of the data we will be storing in the dataset.
    if we were storing the (flattened) raw pixel intensities of the 28x28 = 784 MNIST dataset,
    then dims=(70000, 784).
    outputPath: path to where our output HDF5 file will be stored on disk.
    dataKey: The optional dataKey is the name of the dataset that will store
    the data our algorithm will learn from.
    bufSize: controls the size of our in-memory buffer, which we default to 1,000 feature
    vectors/images. Once we reach bufSize, we'll flush the buffer to the HDF5 dataset.

    Raises:
    ValueError: if `outputPath` already exists.
    """

    # check to see if the output path exists, and if so, raise
    # an exception
    if os.path.exists(outputPath):
      raise ValueError("The supplied `outputPath` already "
        "exists and cannot be overwritten. Manually delete "
        "the file before continuing.", outputPath)

    # open the HDF5 database for writing and create two datasets:
    # one to store the images/features and another to store the
    # class labels
    self.db = h5py.File(outputPath, "w")
    try:
      # if hard-disk space is limited, a compression filter
      # (e.g. compression='gzip') can be passed to create_dataset
      # at the price of extra computation
      self.data = self.db.create_dataset(dataKey, dims, dtype="float")
      self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")
    except Exception:
      # do not leak the open file handle when dataset creation fails
      self.db.close()
      raise

    # store the buffer size, then initialize the buffer itself
    # along with the index into the datasets
    self.bufSize = bufSize
    self.buffer = {"data": [], "labels": []}
    self.idx = 0

  def __enter__(self):
    # context-manager entry: the writer itself is the managed resource
    return self

  def __exit__(self, excType, excValue, traceback):
    # always flush and close; never suppress an in-flight exception
    self.close()
    return False

  def add(self, rows, labels):
    """Buffer `rows` and their `labels`; flush once `bufSize` rows are buffered."""
    # add the rows and labels to the buffer
    self.buffer["data"].extend(rows)
    self.buffer["labels"].extend(labels)

    # check to see if the buffer needs to be flushed to disk
    if len(self.buffer["data"]) >= self.bufSize:
      self.flush()

  def flush(self):
    """Write the buffered rows/labels at the current index, then reset the buffer."""
    # nothing buffered: avoid issuing an empty write
    if len(self.buffer["data"]) == 0:
      return

    # write the buffers to disk then reset the buffer
    i = self.idx + len(self.buffer["data"])
    self.data[self.idx:i] = self.buffer["data"]
    self.labels[self.idx:i] = self.buffer["labels"]
    self.idx = i
    self.buffer = {"data": [], "labels": []}

  def storeClassLabels(self, classLabels):
    """Store the class label names in a `label_names` dataset."""
    # create a dataset to store the actual class label names,
    # then store the class labels
    dt = h5py.special_dtype(vlen=str) # `vlen=unicode` for Py2.7
    labelSet = self.db.create_dataset("label_names",(len(classLabels),), dtype=dt)
    labelSet[:] = classLabels

  def close(self):
    """Flush any remaining buffered entries and close the HDF5 file."""
    try:
      # check to see if there are any other entries in the buffer
      # that need to be flushed to disk
      if len(self.buffer["data"]) > 0:
        self.flush()
    finally:
      # close the file even when the final flush fails
      self.db.close()

In [None]:
def feature_extraction(dataset,output,buffer_size,bs):
    '''
    Extract VGG16 convolutional features for every image found under
    `dataset` and store them, with their encoded class labels, in an
    HDF5 file.

    dataset: input folder with images dataset; the class of each image is
        taken from the name of its parent directory
    output: path of the HDF5 file to create (HDF5DatasetWriter raises
        ValueError if it already exists)
    buffer_size: controls the size of our in-memory buffer
    bs: batch size
    '''

    # grab the list of images that we'll be describing then randomly
    # shuffle them to allow for easy training and testing splits via
    # array slicing during training time
    print("[INFO] loading images...")
    imagePaths = list(paths.list_images(dataset))
    random.shuffle(imagePaths)

    # extract the class labels from the image paths then encode the
    # labels
    labels = [p.split(os.path.sep)[-2] for p in imagePaths]
    le = LabelEncoder()
    labels = le.fit_transform(labels)

    # load the VGG16 network
    print("[INFO] loading network...")
    model = VGG16(weights="imagenet", include_top=False)

    # initialize the HDF5 dataset writer; a separate name is used so the
    # `dataset` argument is not rebound to a different kind of object
    writer = HDF5DatasetWriter((len(imagePaths), 512 * 7 * 7),
                               output,
                               dataKey="features",
                               bufSize=buffer_size)

    # everything that happens while the file is open runs inside
    # try/finally so the HDF5 file is closed even if extraction fails
    try:
        # store the class label names in the dataset
        writer.storeClassLabels(le.classes_)

        # initialize the progress bar
        widgets = ["Extracting Features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
        pbar = progressbar.ProgressBar(maxval=len(imagePaths),widgets=widgets).start()

        # loop over the images in batches
        for i in np.arange(0, len(imagePaths), bs):
            # extract the batch of images and labels, then initialize the
            # list of actual images that will be passed through the network
            # for feature extraction
            batchPaths = imagePaths[i:i + bs]
            batchLabels = labels[i:i + bs]
            batchImages = []

            # loop over the images in the current batch
            for imagePath in batchPaths:
                # load the input image using the Keras helper utility
                # while ensuring the image is resized to 224x224 pixels
                image = load_img(imagePath, target_size=(224, 224))
                image = img_to_array(image)

                # preprocess the image by (1) expanding the dimensions and
                # (2) subtracting the mean RGB pixel intensity from the
                # ImageNet dataset
                image = np.expand_dims(image, axis=0)
                image = imagenet_utils.preprocess_input(image)

                # add the image to the batch
                batchImages.append(image)

            # pass the images through the network and use the outputs as
            # our actual features
            batchImages = np.vstack(batchImages)
            features = model.predict(batchImages, batch_size=bs)

            # reshape the features so that each image is represented by
            # a flattened feature vector of the `MaxPooling2D` outputs
            features = features.reshape((features.shape[0], 512 * 7 * 7))

            # add the features and labels to our HDF5 dataset
            writer.add(features, batchLabels)
            pbar.update(i)
    finally:
        # flush whatever is still buffered and close the dataset
        writer.close()

    pbar.finish()

In [None]:
# import the necessary packages
from sklearn.neural_network  import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
import pickle
import h5py

def train_and_evaluate(features_set):
    """
    Train an MLP classifier on the extracted features and report its
    performance on a held-out split.

    Args:
        features_set: path to the HDF5 file holding the "features",
            "labels" and "label_names" datasets.

    The first 75% of the rows are used for training and the remaining
    25% for evaluation, which relies on the rows having been shuffled
    *prior* to being written to disk. The fitted model is pickled to
    `<first "/"-separated component of features_set>.cpickle`
    (e.g. "hdf5/features.hdf5" -> "hdf5.cpickle").
    """
    # the context manager closes the database even if training or
    # evaluation raises
    with h5py.File(features_set, mode='r') as db:
        print("Database keys {0:}".format(list(db.keys())))

        # determine the index of the training and testing split
        split = int(db["labels"].shape[0] * 0.75)

        # NOTE: no grid search is performed here; the classifier is fitted
        # once with fixed hyperparameters (the log message is kept as is)
        print("[INFO] tuning hyperparameters...")
        model = MLPClassifier(solver="adam",
                              alpha=1e-5,
                              hidden_layer_sizes=(250, 100, ))

        model.fit(db["features"][:split], db["labels"][:split])

        # evaluate the model
        print("[INFO] evaluating...")
        preds = model.predict(db["features"][split:])

        # depending on the h5py version, variable-length strings are read
        # back as `bytes` or as `str`; accept both
        labelNames = [name.decode("utf-8") if isinstance(name, bytes) else str(name)
                      for name in db["label_names"]]

        print(classification_report(db["labels"][split:],
                                    preds,
                                    target_names=labelNames)
        )

    # serialize the model to disk
    print("[INFO] saving model...")
    with open(features_set.split("/")[0] + ".cpickle", "wb") as f:
        f.write(pickle.dumps(model))

In [None]:
# Inspect the network used for feature extraction: VGG16 with ImageNet
# weights, loaded with include_top=False (the same arguments that
# feature_extraction passes).
model = VGG16(weights="imagenet", include_top=False)
model.summary()

Model: "vgg16"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, None, 3)]   0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, None, None, 64)    1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, None, None, 64)    36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, None, None, 64)    0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, None, None, 128)   73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, None, None, 128)   147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, None, None, 128)   0     

In [None]:
# --- settings passed to feature_extraction / train_and_evaluate ---

# number of images sent through the network per batch
bs = 32

# number of feature vectors buffered in memory before a write to disk
buffer_size = 1000

# HDF5 file that will receive the extracted features
output = "hdf5/features.hdf5"

# root folder of the image dataset
dataset = "Dataset"

In [None]:
# Create the output folder; `-p` makes the cell safe to re-run when the
# folder already exists.
!mkdir -p hdf5

In [None]:
# Run the extraction. HDF5DatasetWriter raises ValueError when `output`
# already exists, so delete hdf5/features.hdf5 before re-running this cell.
feature_extraction(dataset,output,buffer_size,bs)

[INFO] loading images...
[INFO] loading network...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


Extracting Features: 100% |####################################| Time:  0:02:28


In [None]:
# Re-open the features file read-only; `db` is reused by the inspection
# cells below.
db = h5py.File(output,mode='r')
list(db.keys())

['features', 'label_names', 'labels']

In [None]:
# Shape of the stored feature matrix: (number of images, 512 * 7 * 7).
db["features"].shape

(8982, 25088)

In [None]:
# One integer-encoded label per stored feature row.
db["labels"].shape

In [None]:
# Class names in label-encoding order. Depending on the h5py version the
# stored strings are read back as `bytes` or as `str`; handle both.
[name.decode('utf-8') if isinstance(name, bytes) else str(name) for name in db["label_names"]]

In [None]:
# Fit the MLP on the first 75% of the rows, print the classification
# report for the remaining 25%, and pickle the model to hdf5.cpickle.
train_and_evaluate(output)

Database keys ['features', 'label_names', 'labels']
[INFO] tuning hyperparameters...
[INFO] evaluating...
                       precision    recall  f1-score   support

mask_weared_incorrect       1.00      1.00      1.00       745
            with_mask       0.99      0.99      0.99       750
         without_mask       0.99      0.99      0.99       751

             accuracy                           0.99      2246
            macro avg       0.99      0.99      0.99      2246
         weighted avg       0.99      0.99      0.99      2246

[INFO] saving model...


In [None]:
# Reload the classifier written by train_and_evaluate. The context manager
# closes the file handle once loading is done.
# NOTE: only unpickle files you trust; pickle can execute arbitrary code.
with open('hdf5.cpickle', 'rb') as f:
    model = pickle.load(f)

In [None]:
# Sanity check on the first ten stored rows. These rows belong to the 75%
# training split, so agreement here is not a generalization measure.
model.predict(db["features"][:10])

array([2, 2, 1, 1, 1, 1, 0, 2, 0, 0])

In [None]:
# Ground-truth encoded labels of the same ten rows, for comparison.
db["labels"][:10]

array([2, 2, 1, 1, 1, 1, 0, 2, 0, 0])