In [1]:
!unzip flowers17.zip

Archive:  flowers17.zip
   creating: flowers17/
   creating: flowers17/bluebell/
  inflating: flowers17/bluebell/image_0241.jpg  
  inflating: flowers17/bluebell/image_0242.jpg  
  inflating: flowers17/bluebell/image_0243.jpg  
  inflating: flowers17/bluebell/image_0244.jpg  
  inflating: flowers17/bluebell/image_0245.jpg  
  inflating: flowers17/bluebell/image_0246.jpg  
  inflating: flowers17/bluebell/image_0247.jpg  
  inflating: flowers17/bluebell/image_0248.jpg  
  inflating: flowers17/bluebell/image_0249.jpg  
  inflating: flowers17/bluebell/image_0250.jpg  
  inflating: flowers17/bluebell/image_0251.jpg  
  inflating: flowers17/bluebell/image_0252.jpg  
  inflating: flowers17/bluebell/image_0253.jpg  
  inflating: flowers17/bluebell/image_0254.jpg  
  inflating: flowers17/bluebell/image_0255.jpg  
  inflating: flowers17/bluebell/image_0256.jpg  
  inflating: flowers17/bluebell/image_0257.jpg  
  inflating: flowers17/bluebell/image_0258.jpg  
  inflating: flowers17/bluebell/image

In [2]:
!pip install progressbar2



In [3]:
from keras.applications import VGG16
from keras.applications import imagenet_utils
from keras.preprocessing.image import img_to_array
from keras.preprocessing.image import load_img
from sklearn.preprocessing import LabelEncoder
from imutils import paths
import numpy as np
import progressbar
import argparse
import random
import os

In [4]:
import h5py
import os


class HDF5DatasetWriter:
    """Buffered writer that stores rows and their integer labels in an HDF5 file.

    Rows passed to :meth:`add` are accumulated in memory and written to disk
    in chunks once at least ``buf_size`` rows are pending. Call :meth:`close`
    (or use the instance as a context manager) to write any remaining rows
    and close the file.
    """

    def __init__(self, dims, output_path, data_key="images", buf_size=1000):
        """Create the HDF5 file and its datasets.

        Args:
            dims: Shape of the data dataset; ``dims[0]`` is also used as the
                length of the ``"labels"`` dataset.
            output_path: Path of the HDF5 file to create.
            data_key: Name of the dataset that holds the rows.
            buf_size: Number of buffered rows that triggers a flush.

        Raises:
            ValueError: If ``output_path`` already exists.
        """
        if os.path.exists(output_path):
            raise ValueError("The supplied 'output_path' already exists and cannot be overwritten. "
                             "Manually delete the file before continuing.", output_path)

        # Open HDF5 database for writing and create two datasets:
        # one to store the images/features and another to store the
        # class labels
        self.db = h5py.File(output_path, "w")
        try:
            self.data = self.db.create_dataset(data_key, dims, dtype="float")
            self.labels = self.db.create_dataset("labels", (dims[0],), dtype="int")
        except Exception:
            # Do not leave the file handle open when dataset creation fails.
            self.db.close()
            raise

        # store buffer size then initialize the buffer itself
        # along with the index into the datasets
        self.buf_size = buf_size
        self.buffer = {"data": [], "labels": []}
        self.idx = 0

    def __enter__(self):
        """Return the writer itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Flush pending rows and close the file; exceptions are not suppressed."""
        self.close()
        return False

    def add(self, rows, labels):
        """Buffer ``rows`` and ``labels``; flush once ``buf_size`` rows are pending.

        Args:
            rows: Iterable of rows to append to the data dataset.
            labels: Iterable of labels, one per row, in the same order.
        """
        self.buffer["data"].extend(rows)
        self.buffer["labels"].extend(labels)

        if len(self.buffer["data"]) >= self.buf_size:
            self.flush()

    def flush(self):
        """Write all buffered rows and labels to disk and reset the buffer."""
        pending = len(self.buffer["data"])
        if pending == 0:
            # Nothing to write; assigning an empty list to a zero-length
            # slice of a multi-dimensional dataset is not a valid write.
            return

        end = self.idx + pending
        self.data[self.idx: end] = self.buffer["data"]
        self.labels[self.idx: end] = self.buffer["labels"]
        self.idx = end
        self.buffer = {"data": [], "labels": []}

    def store_class_labels(self, class_labels):
        """Store the class names in a variable-length string dataset ``"label_names"``.

        Args:
            class_labels: Sequence of class-name strings.
        """
        dt = h5py.special_dtype(vlen=str)
        label_set = self.db.create_dataset("label_names", (len(class_labels),), dtype=dt)
        label_set[:] = class_labels

    def close(self):
        """Flush any rows still in the buffer, then close the HDF5 file."""
        if len(self.buffer["data"]) > 0:
            self.flush()

        self.db.close()


In [5]:
# Run configuration: input image directory, output HDF5 path, number of
# images per prediction batch, and the writer's in-memory buffer size.
args = {
    "dataset": "/content/flowers17",
    "output": "/content/flowers17_features.hdf5",
    "batch_size": 32,
    "buffer_size": 1000,
}

# Short alias for the batch size.
bs = args["batch_size"]

In [6]:
print("[INFO] loading images...")

# Sort before shuffling so the starting order does not depend on the order in
# which the directory listing happens to be returned, then shuffle with a
# dedicated, seeded generator so the row order of the output file is the same
# on every run (and the global `random` state is left untouched).
image_paths = sorted(paths.list_images(args["dataset"]))
if not image_paths:
    # Fail early with a clear message instead of writing an empty database.
    raise FileNotFoundError("No images found under {!r}".format(args["dataset"]))
random.Random(42).shuffle(image_paths)

# The class name is the name of the directory that contains each image;
# encode the class names as integers.
labels = [p.split(os.path.sep)[-2] for p in image_paths]
le = LabelEncoder()
labels = le.fit_transform(labels)

[INFO] loading images...


In [7]:
print("[INFO] loading network...")

# VGG16 with ImageNet weights and without its top layers.
model = VGG16(include_top=False, weights="imagenet")

# One row per image, each 512 * 7 * 7 values wide (the flattened feature
# size the extraction loop writes), stored under the "features" key.
dataset = HDF5DatasetWriter(
    dims=(len(image_paths), 512 * 7 * 7),
    output_path=args["output"],
    data_key="features",
    buf_size=args["buffer_size"],
)

# Keep the class names alongside the features.
dataset.store_class_labels(le.classes_)

[INFO] loading network...
Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/vgg16/vgg16_weights_tf_dim_ordering_tf_kernels_notop.h5


In [8]:
widgets = ["Extracting Features: ", progressbar.Percentage(), " ", progressbar.Bar(), " ", progressbar.ETA()]
pbar = progressbar.ProgressBar(max_value=len(image_paths), widgets=widgets).start()

# try/finally guarantees the HDF5 file is flushed and closed (and the
# progress bar finished) even if a batch fails part-way through; otherwise
# the file would stay open in the kernel.
try:
    for i in np.arange(0, len(image_paths), bs):
        batch_paths = image_paths[i:i + bs]
        batch_labels = labels[i:i + bs]
        batch_images = []

        for image_path in batch_paths:
            # Load the image resized to 224x224 and convert it to an array.
            image = load_img(image_path, target_size=(224, 224))
            image = img_to_array(image)

            # Add a leading batch axis, then apply imagenet_utils preprocessing.
            image = np.expand_dims(image, axis=0)
            image = imagenet_utils.preprocess_input(image)

            batch_images.append(image)

        # Stack the single-image arrays into one batch and run the network.
        batch_images = np.vstack(batch_images)
        features = model.predict(batch_images, batch_size=bs)

        # flatten features
        features = features.reshape((features.shape[0], 512 * 7 * 7))

        dataset.add(features, batch_labels)
        pbar.update(i)
finally:
    dataset.close()
    pbar.finish()

Extracting Features:   0% |                                    | ETA:  --:--:--



Extracting Features:   0% |                                    | ETA:  --:--:--



Extracting Features:   2% |                                    | ETA:   0:07:57



Extracting Features:   4% |#                                   | ETA:   0:03:59



Extracting Features:   7% |##                                  | ETA:   0:02:38



Extracting Features:   9% |###                                 | ETA:   0:01:58



Extracting Features:  11% |####                                | ETA:   0:01:34



Extracting Features:  14% |#####                               | ETA:   0:01:18



Extracting Features:  16% |#####                               | ETA:   0:01:06



Extracting Features:  18% |######                              | ETA:   0:00:58



Extracting Features:  21% |#######                             | ETA:   0:00:51



Extracting Features:  23% |########                            | ETA:   0:00:45



Extracting Features:  25% |#########                           | ETA:   0:00:40



Extracting Features:  28% |##########                          | ETA:   0:00:36



Extracting Features:  30% |###########                         | ETA:   0:00:33



Extracting Features:  32% |###########                         | ETA:   0:00:30



Extracting Features:  35% |############                        | ETA:   0:00:28



Extracting Features:  37% |#############                       | ETA:   0:00:25



Extracting Features:  40% |##############                      | ETA:   0:00:23



Extracting Features:  42% |###############                     | ETA:   0:00:21



Extracting Features:  44% |################                    | ETA:   0:00:20



Extracting Features:  47% |################                    | ETA:   0:00:18



Extracting Features:  49% |#################                   | ETA:   0:00:17



Extracting Features:  51% |##################                  | ETA:   0:00:16



Extracting Features:  54% |###################                 | ETA:   0:00:14



Extracting Features:  56% |####################                | ETA:   0:00:13



Extracting Features:  58% |#####################               | ETA:   0:00:12



Extracting Features:  61% |######################              | ETA:   0:00:11



Extracting Features:  63% |######################              | ETA:   0:00:10



Extracting Features:  65% |#######################             | ETA:   0:00:09



Extracting Features:  68% |########################            | ETA:   0:00:08



Extracting Features:  70% |#########################           | ETA:   0:00:08



Extracting Features:  72% |##########################          | ETA:   0:00:08



Extracting Features:  75% |###########################         | ETA:   0:00:07



Extracting Features:  77% |###########################         | ETA:   0:00:06



Extracting Features:  80% |############################        | ETA:   0:00:05



Extracting Features:  82% |#############################       | ETA:   0:00:05



Extracting Features:  84% |##############################      | ETA:   0:00:04



Extracting Features:  87% |###############################     | ETA:   0:00:03



Extracting Features:  89% |################################    | ETA:   0:00:02



Extracting Features:  91% |#################################   | ETA:   0:00:02



Extracting Features:  94% |#################################   | ETA:   0:00:01



Extracting Features:  96% |##################################  | ETA:   0:00:00



Extracting Features: 100% |####################################| Time:  0:00:27


In [12]:
# Open the generated file read-only (an explicit mode avoids relying on the
# library default) and close it as soon as the inspection is done. The path
# comes from the shared configuration instead of being repeated here.
with h5py.File(args["output"], "r") as db:
    print(list(db.keys()))
    print(db['features'].shape)
    print(db['labels'].shape)
    print(db['label_names'].shape)

['features', 'label_names', 'labels']
(1360, 25088)
(1360,)
(17,)
