# P

In [1]:
# Famous Pre-trained Image Models
# VGG16 (visual geometry group, by Oxford)
# VGG19
# InceptionV3 (by Google)

In [2]:
# Googles Goggles is the beginning of visual search technology.
# With this image recognition app, users can take a photo of a physical object, and Google will try to find information about what is pictured.

# Take a photo of a landmark and Google Goggles can give you its history.
# Snap a pic of a foreign menu, and it can be translated. 
# the app can recognise and generate informaation on books, CDs, virtually anything that is 2D.

# business value:
# another avenue to generate search data
# recommend users to advertisers and retailers

![](img/vgg16_croped.png)

University of Oxford Visual Geometry Group has developed VGG-16 trained weights [Here](https://github.com/fchollet/deep-learning-models/releases)

Download the [tensorflow h5 file](https://github.com/fchollet/deep-learning-models/releases/download/v0.1/vgg16_weights_tf_dim_ordering_tf_kernels.h5). 

Note this file is a little over half a gigabyte, so it will take a while to download.


In [31]:
import os

weight_file = 'vgg16_weights_tf_dim_ordering_tf_kernels.h5'

if not os.path.exists(weight_file):
    raise FileNotFoundError("No file {weight_file} found. Check path again".format(weight_file))

In [56]:
# weight_file = 'vgg19_weights_tf_dim_ordering_tf_kernels.h5'

In [32]:
# Download labels for VGG16
!curl https://raw.githubusercontent.com/torch/tutorials/master/7_imagenet_classification/synset_words.txt -o synset_words.txt

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed

  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:01 --:--:--     0
  0     0    0     0    0     0      0      0 --:--:--  0:00:03 --:--:--     0
100 31675  100 31675    0     0   8892      0  0:00:03  0:00:03 --:--:--  8892


## A Convolutional Neural Network Architecture

### VGG (Loading a Pretrained Network using keras utilities)

this network has been pretrained on a large dataset (imagenet) as the basis for an image classifier. It has taken a huge amount of gpu time/power and data to train this model.

Here are [more examples of keras transfer learning](https://keras.io/applications/) with modern pretrained CNNs. 

In [33]:
from keras import backend as K
K.common.set_image_dim_ordering('th')

from keras.models import Sequential
from keras.layers.core import Flatten, Dense, Dropout, Activation
from keras.layers.convolutional import Conv2D, MaxPooling2D, ZeroPadding2D
from keras.optimizers import SGD
import numpy as np
import pandas as pd
import PIL

In [34]:
def VGG_16(weights_path=None):
    model = Sequential()
    model.add(ZeroPadding2D((1,1),input_shape=(3,224,224)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(64, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(128, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(256, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(ZeroPadding2D((1,1)))
    model.add(Conv2D(512, (3, 3), activation='relu'))
    model.add(MaxPooling2D((2,2), strides=(2,2)))

    model.add(Flatten())
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(4096, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1000, activation='softmax'))

    if weights_path:
        model.load_weights(weights_path)

    return model

In [35]:
synset = pd.read_csv('synset_words.txt', skipinitialspace=True, names = ['synset', 'words'])

# model = VGG16(weights='imagenet')
model = VGG_16(weight_file)   # note that we don't actually train/adjust the weights at all here
sgd = SGD(lr=0.1, decay=1e-6, momentum=0.9, nesterov=True)
model.compile(optimizer=sgd, loss='categorical_crossentropy')

In [36]:
def convert_image_to_bgr_numpy_array(image_path, size=(224,224)):
    """The network has been trained using opencv and BGR images 
    (i.e. channels order blue, green, red rather than red, green, blue).
    The description of why is https://stackoverflow.com/questions/14556545/why-opencv-using-bgr-colour-space-instead-of-rgb
    
    We can use a simpler image library as long as we manually convert
    the data to the expected format.
    """
    image = PIL.Image.open(image_path).resize(size)
    img_data = np.array(image.getdata(), np.float32).reshape(*size, -1)
    # swap R and B channels
    img_data = np.flip(img_data, axis=2)
    return img_data

def prepare_image(image_path):
    im = convert_image_to_bgr_numpy_array(image_path)

    # these subtractions are just mean centering the images 
    # based on known means for different color channels
    im[:,:,0] -= 103.939
    im[:,:,1] -= 116.779
    im[:,:,2] -= 123.68

    im = im.transpose((2,0,1)) # adjust from (224, 224, 3) to (3, 224, 224) for keras
    im = np.expand_dims(im, axis=0) # adjust to (1, 3, 224, 224) for generating keras prediction
    return im

In [55]:
from keras.applications.vgg16 import VGG16
from keras.applications.imagenet_utils import decode_predictions

img = prepare_image('img/dog_2.jpg')

model = InceptionV3(weights='imagenet')
out = model.predict(img)
y_pred = np.argmax(out)

print('Predicted:', decode_predictions(out))

NameError: name 'InceptionV3' is not defined

In [54]:
img = prepare_image('img/test.jpg')

model = VGG19(weights='imagenet')
out = model.predict(img)
y_pred = np.argmax(out)

print('Predicted:', decode_predictions(out))

NameError: name 'VGG19' is not defined

In [57]:
img = prepare_image('img/sloth.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07930864', 'cup', 0.6994144), ('n03063599', 'coffee_mug', 0.18904433), ('n04131690', 'saltshaker', 0.020779125), ('n03063689', 'coffeepot', 0.011247833), ('n04423845', 'thimble', 0.0071213855)]]


In [58]:
img = prepare_image('img/beagle.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02089973', 'English_foxhound', 0.87461126), ('n02089867', 'Walker_hound', 0.121277235), ('n02088364', 'beagle', 0.0036947893), ('n02088238', 'basset', 0.00016243898), ('n02088466', 'bloodhound', 0.00014211661)]]


In [41]:
img = prepare_image('img/Labrador.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02109047', 'Great_Dane', 0.55036515), ('n02099712', 'Labrador_retriever', 0.27059543), ('n02087394', 'Rhodesian_ridgeback', 0.050941028), ('n02090379', 'redbone', 0.019947253), ('n02089973', 'English_foxhound', 0.019448861)]]


In [42]:
img = prepare_image('img/Poodle.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02113799', 'standard_poodle', 0.5172874), ('n02113712', 'miniature_poodle', 0.39728504), ('n02113624', 'toy_poodle', 0.083053865), ('n02088094', 'Afghan_hound', 0.00066289917), ('n02105505', 'komondor', 0.00046583617)]]


In [43]:
img = prepare_image('img/Dog_3.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02085936', 'Maltese_dog', 0.27990836), ('n02098286', 'West_Highland_white_terrier', 0.14876175), ('n02113624', 'toy_poodle', 0.12125379), ('n02113712', 'miniature_poodle', 0.10880081), ('n02113978', 'Mexican_hairless', 0.108365685)]]


In [44]:
img = prepare_image('img/Chihuahua.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02108915', 'French_bulldog', 0.752249), ('n02096585', 'Boston_bull', 0.15032904), ('n02085620', 'Chihuahua', 0.059806228), ('n02087046', 'toy_terrier', 0.012236713), ('n02112706', 'Brabancon_griffon', 0.002886072)]]


In [45]:
img = prepare_image('img/Bulldog.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n02109525', 'Saint_Bernard', 0.3003843), ('n02108915', 'French_bulldog', 0.27818522), ('n02093428', 'American_Staffordshire_terrier', 0.058999773), ('n02108089', 'boxer', 0.048719466), ('n02110958', 'pug', 0.03343548)]]


In [46]:
img = prepare_image('img/test1.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n03991062', 'pot', 0.7327521), ('n12620546', 'hip', 0.04760818), ('n12768682', 'buckeye', 0.029295594), ('n03457902', 'greenhouse', 0.024531063), ('n03930313', 'picket_fence', 0.017273879)]]


In [47]:
img = prepare_image('img/test2.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07614500', 'ice_cream', 0.28780892), ('n07579787', 'plate', 0.16033651), ('n04476259', 'tray', 0.042177286), ('n07836838', 'chocolate_sauce', 0.031268515), ('n12144580', 'corn', 0.025088983)]]


In [48]:
img = prepare_image('img/mangosteen.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07745940', 'strawberry', 0.34342515), ('n07742313', 'Granny_Smith', 0.14368135), ('n12768682', 'buckeye', 0.077075586), ('n07753592', 'banana', 0.064830475), ('n07753113', 'fig', 0.056029476)]]


In [49]:
img = prepare_image('img/strawberry.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07745940', 'strawberry', 0.99977857), ('n04332243', 'strainer', 3.446263e-05), ('n07747607', 'orange', 2.406561e-05), ('n07753592', 'banana', 2.256647e-05), ('n07768694', 'pomegranate', 2.1113809e-05)]]


In [50]:
img = prepare_image('img/icecream.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07614500', 'ice_cream', 0.20130746), ('n03476684', 'hair_slide', 0.057905596), ('n07579787', 'plate', 0.057216298), ('n07745940', 'strawberry', 0.054555055), ('n07714571', 'head_cabbage', 0.050523743)]]


In [59]:
img = prepare_image('img/icecream2.jpg')
out = model.predict(img)
print('Predicted:', decode_predictions(out))

Predicted: [[('n07614500', 'ice_cream', 0.87625885), ('n07613480', 'trifle', 0.11704696), ('n07836838', 'chocolate_sauce', 0.004256764), ('n07745940', 'strawberry', 0.0008514527), ('n07579787', 'plate', 0.0003462588)]]


### Transfer Learning

it turns out that the lower level featured learned by VGG16 on imagenet are still applicable to other problems with natural images. If we can preserve the lower-level features, we can just train a new model on those features. (In fact, in the case of 'softmax', we can think of this as just training a new multinomial logistic regression, on those convolution features)

Lets just snip off last layer.

A Caveat

if we just add a new layer with default weights, it is going to be very wrong the first iteration. Since it is so wrong, the gradient will be huge, and because we are using back propagation those errors will be sent down stream into the lower level features. This can quickly destroy the rest of the network.

In order to retrain this model we must protect the lower-level features, until our new layers have reached more stability. We can do this by freezing those layers

Then we'll add our new layer.

In [26]:
from keras.models import Model

# note we exclude the final dense layers and add one back below, we would retrain it ourselves
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(3,224,224)) 
 
# Freeze convolutional layers
for layer in base_model.layers:
    layer.trainable = False    
    
x = base_model.output
x = Flatten()(x) # flatten from convolution tensor output 
predictions = Dense(2, activation='softmax')(x) # should match # of classes predicted

# this is the model we will train
model = Model(inputs=base_model.input, outputs=predictions)

In [27]:
base_model.output

<tf.Tensor 'block5_pool/transpose_1:0' shape=(?, 512, 7, 7) dtype=float32>

In [28]:
x

<tf.Tensor 'flatten_2/Reshape:0' shape=(?, ?) dtype=float32>

In [29]:
model.compile(optimizer=SGD(lr=0.0001, momentum=0.9),
            loss='categorical_crossentropy', metrics=['accuracy'])

Then you would just train like normal

```python
# i.e. if we had training images and our own labels, we could run
model.fit(X_train,y_train)
```

How much data do you need?

More!

Actually with this bottleneck approach, you don't need as much. 200-1000 representitive images of each class will give good results. Because
* Google has already done most of the hard work
* We can use image augmentation to increase our number of training samples

New Architectures are being published every day. So much to read!

* [Curated List of Deep Learning papers](https://github.com/ChristosChristofidis/awesome-deep-learning)
* [Good reddit post for keeping up with the latest research](https://www.reddit.com/r/MachineLearning/comments/6d7nb1/d_machine_learning_wayr_what_are_you_reading_week/)
