# Introduction

'''This script goes along the blog post
"Building powerful image classification models using very little data"
from blog.keras.io.
"https://blog.keras.io/building-powerful-image-classification-models-using-very-little-data.html"

It uses data that can be downloaded at:
https://www.kaggle.com/c/dogs-vs-cats/data
In our setup, we:
- created a data/ folder
- created train/ and validation/ subfolders inside data/
- created cats/ and dogs/ subfolders inside train/ and validation/
- put the cat pictures index 0-999 in data/train/cats
- put the cat pictures index 1000-1400 in data/validation/cats
- put the dogs pictures index 12500-13499 in data/train/dogs
- put the dog pictures index 13500-13900 in data/validation/dogs
So that we have 1000 training examples for each class, and 400 validation examples for each class.
In summary, this is our directory structure:
```
data/
    train/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
    validation/
        dogs/
            dog001.jpg
            dog002.jpg
            ...
        cats/
            cat001.jpg
            cat002.jpg
            ...
```
'''


# Code

### Constants, paths, configuration

In [7]:
# dimensions of our images.
img_width, img_height = 150, 150
target_size = (img_width, img_height)

## folder in which there are datasets and where bottleneck files will be saved
DATASET_FOLDER = 'dogs_and_cats/data_small'

## remember to edit these values when changing datasets
nb_train_samples = 2000
nb_validation_samples = 800
epochs = 50
batch_size = 16


top_model_weights_path = '{}/bottleneck_fc_model.h5'.format(DATASET_FOLDER)
train_data_dir = '{}/train'.format(DATASET_FOLDER)
validation_data_dir = '{}/validation'.format(DATASET_FOLDER)

features_train_path = '{}/bottleneck_features_train.npy'.format(DATASET_FOLDER)
features_validation_path = '{}/bottleneck_features_validation.npy'.format(DATASET_FOLDER)


# Checks if nb_train_samples and nb_validation_samples
# are divisible by batch_size

if nb_train_samples % batch_size != 0:
    raise ArithmeticError(
        "nb_train_samples should be divisibe by batch_size.\n"
        "nb_train_samples is "+str(nb_train_samples)+", "
        "but batch_size is "+str(batch_size)+"."
    )
    
if nb_validation_samples % batch_size != 0:
    raise ArithmeticError(
        "nb_validation_samples should be divisibe by batch_size.\n"
        "nb_validation_samples is "+str(nb_validation_samples)+", "
        "but batch_size is "+str(batch_size)+"."
    )

### Proper code

In [2]:
import numpy as np
from keras.preprocessing.image import ImageDataGenerator
from keras.models import Sequential
from keras.layers import Dropout, Flatten, Dense
from keras import applications

def save_bottlebeck_features():
    datagen = ImageDataGenerator(rescale = 1. / 255)

    # build the VGG16 network
    model = applications.VGG16(include_top = False, weights = 'imagenet')
    
    def create_bottleneck(data_dir, samples_count, features_path):
        generator = datagen.flow_from_directory(
            data_dir,
            target_size = target_size,
            batch_size = batch_size,
            class_mode = None,
            shuffle = False)
        bottleneck_features = model.predict_generator(generator, samples_count // batch_size)
        np.save(features_path, bottleneck_features)

    create_bottleneck(train_data_dir, nb_train_samples, features_train_path)
    
    create_bottleneck(validation_data_dir, nb_validation_samples, features_validation_path)

def my_model(input_shape):
    model = Sequential()
    model.add(Flatten(input_shape = input_shape))
    model.add(Dense(256, activation = 'relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation = 'sigmoid'))
    return model

def train_top_model():
    
    train_data = np.load(features_train_path)
    train_labels = np.array(
        [0] * (nb_train_samples // 2) + [1] * (nb_train_samples // 2))

    validation_data = np.load(features_validation_path)
    validation_labels = np.array(
        [0] * (nb_validation_samples // 2) + [1] * (nb_validation_samples // 2))

    model = my_model(input_shape = train_data.shape[1:])
    model.compile(optimizer = 'rmsprop', loss = 'binary_crossentropy', metrics = ['accuracy'])
    model.fit(train_data, train_labels, epochs = epochs, batch_size = batch_size, 
              validation_data = (validation_data, validation_labels))
    model.save_weights(top_model_weights_path)


Using TensorFlow backend.


In [None]:
save_bottlebeck_features()
train_top_model()

In [3]:
from urllib.request import urlopen
import io
from keras.preprocessing.image import ImageDataGenerator, array_to_img, img_to_array, load_img

# Loads image
# path_or_url - path to image on disk or url to image on the internet
# target_size - dimentions of the returned image
# return PIL image
def get_image(path_or_url, target_size):
    img_path = None
    
    if path_or_url.startswith("http://") or path_or_url.startswith("https://"):
        img_path = io.BytesIO(urlopen(path_or_url).read())
    else:
        img_path = path_or_url

    return load_img(img_path, target_size = target_size)

In [8]:

example_dog_url = "https://i.ytimg.com/vi/SfLV8hD7zX4/maxresdefault.jpg"
example_cat_url = "http://www.petmd.com/sites/default/files/what-does-it-mean-when-cat-wags-tail.jpg"


img = get_image(example_dog_url, target_size)

images = [img] # one image added by default

# Can also add multiple from test folder
for n in range(1, 16):
    img = get_image("dogs_and_cats/split_data/test/{}.jpg".format(n), target_size)
    x = img_to_array(img)
    images.append(x)

# convert list of ndarrays to ndarray 
# ----- for instance images is N ndarrays of shape (A,B,C)
# ----- result of np.stack will be ndarray of shape (N, A, B, C)
images = np.stack(images, axis = 0)

# find features using VGG16
vgg16_model = applications.VGG16(weights='imagenet', include_top=False)
features = vgg16_model.predict(images)

# get our freakin' epic model
model = my_model(features.shape[1:])

# load saved weights
model.load_weights(top_model_weights_path)

# predict
model.predict(features, batch_size = batch_size)

array([[ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 0.],
       [ 1.],
       [ 0.],
       [ 0.],
       [ 0.]], dtype=float32)