# Reverse Image Search Engine
- Reverse image search is one of the applications in the field of computer vision that I find very interesting.
- This engine explores how one can use embeddings — a contextual representation of an image — to find similar images.
- It also explores different strategies and algorithms to speed this up at scale, from thousands to several million images, making them searchable in microseconds.

## Contents
1.   **feature_extraction**: extract feature vectors from images using a CNN
2.   similarity-search: index features and search for the most similar features using nearest-neighbor algorithms, and visualize the results with plots
3.   reduce-feature-length-with-pca: experiment with PCA and figure out the optimum length of the features to use

## 1. feature_extraction

In [None]:
# Mount Google Drive at /content/gdrive; later cells read the dataset from,
# and write their pickles to, paths under this mount point.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
import numpy as np
from numpy.linalg import norm
import pickle
from tqdm import tqdm, tqdm_notebook
import os
import random
import time
import math
import tensorflow
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.applications.vgg16 import VGG16
from tensorflow.keras.applications.vgg19 import VGG19
from tensorflow.keras.applications.mobilenet import MobileNet
from tensorflow.keras.applications.inception_v3 import InceptionV3
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Flatten, Dense, Dropout, GlobalAveragePooling2D

Step 1: Import the dataset

In [None]:
!mkdir -p ../../datasets
!pip install gdown
!gdown https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp --output ../../datasets/caltech101.tar.gz
!tar -xvzf ../../datasets/caltech101.tar.gz --directory ../../datasets
!mv ../../datasets/101_ObjectCategories ../../datasets/caltech101
!rm -rf ../../datasets/caltech101/BACKGROUND_Google

  .format(url='https://drive.google.com/uc?id={}'.format(file_id))
Downloading...
From: https://drive.google.com/file/d/137RyRjvTBkBiIfeYBNZBtViDHQ6_Ewsp
To: /datasets/caltech101.tar.gz
63.5kB [00:00, 23.3MB/s]

gzip: stdin: not in gzip format
tar: Child returned status 1
tar: Error is not recoverable: exiting now
mv: cannot stat '../../datasets/101_ObjectCategories': No such file or directory


Step 2: Create the model

In [None]:
def model_picker(name):
    """Build an ImageNet-pretrained CNN to be used as a feature extractor.

    Every architecture is created without its classification head
    (``include_top=False``), with a 224x224x3 input and ``pooling='max'``.

    Args:
        name: One of 'vgg16', 'vgg19', 'mobilenet', 'inception', 'resnet'
            or 'xception'.

    Returns:
        The Keras model for the requested architecture.

    Raises:
        ValueError: If ``name`` is not one of the supported architectures.
    """
    # Arguments shared by every architecture below.
    common_args = dict(weights='imagenet',
                       include_top=False,
                       input_shape=(224, 224, 3),
                       pooling='max')

    if name == 'vgg16':
        return VGG16(**common_args)
    if name == 'vgg19':
        return VGG19(**common_args)
    if name == 'mobilenet':
        return MobileNet(depth_multiplier=1, alpha=1, **common_args)
    if name == 'inception':
        return InceptionV3(**common_args)
    if name == 'resnet':
        return ResNet50(**common_args)
    if name == 'xception':
        # Xception is not part of the file-level imports, so import it here;
        # otherwise this branch fails with a NameError.
        from tensorflow.keras.applications.xception import Xception
        return Xception(**common_args)

    # Fail loudly instead of falling through to an unbound `model` variable.
    raise ValueError(
        "Specified model not available: {!r}. Choose one of 'vgg16', 'vgg19', "
        "'mobilenet', 'inception', 'resnet', 'xception'.".format(name))

In [None]:
# Architecture key passed to model_picker; it is also reused later in the
# file name of the pickled feature list.
model_architecture = 'resnet'
model = model_picker(model_architecture)

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


Step 3: Create the feature-extraction function (uses the model from Step 2)

In [None]:
def extract_features(img_path, model):
    """Return the L2-normalised, flattened feature vector of one image.

    Args:
        img_path: Path of the image file to load.
        model: Model whose ``predict`` output is used as the feature.

    Returns:
        A 1-D array: the model output for the image, flattened and divided
        by its Euclidean norm.
    """
    height, width = 224, 224
    pil_image = image.load_img(img_path, target_size=(height, width))
    # The model predicts on batches, so add a leading batch axis of size 1.
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    raw_output = model.predict(preprocess_input(batch))
    vector = raw_output.flatten()
    return vector / norm(vector)

Test A: Test feature extraction with cat.jpg

In [None]:
# Detect whether the notebook runs on Google Colab by probing for its package.
try:
    import google.colab  # noqa: F401  (imported only to test availability)
except ImportError:
    # Catch only the "package missing" case; a bare `except:` would also
    # swallow unrelated errors such as KeyboardInterrupt.
    IS_COLAB_ENV = False
else:
    IS_COLAB_ENV = True

In [None]:
# Relative path of the sample image used when not running on Colab.
IMG_PATH = '../../sample-images/cat.jpg'
if IS_COLAB_ENV:
  # On Colab, download the sample image into the working directory and
  # point IMG_PATH at the downloaded copy.
  !curl https://raw.githubusercontent.com/PracticalDL/Practical-Deep-Learning-Book/master/sample-images/cat.jpg --output cat.jpg
  IMG_PATH = 'cat.jpg'

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  661k  100  661k    0     0  1407k      0 --:--:-- --:--:-- --:--:-- 1404k


In [None]:
# Use IMG_PATH rather than a hard-coded 'cat.jpg': outside Colab the image is
# not downloaded into the working directory, so the literal path would fail.
features = extract_features(IMG_PATH, model)
print("Total length of features for one image: ", len(features))

Total length of features for one image:  2048


Step 4: Extract features from the REAL dataset and dump them with pickle

*   File names -> the image file paths, kept as text in a list [,,]
*   Feature list -> the features already extracted, kept in a list [,,] 



In [None]:
# File-name suffixes that are treated as images.
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Recursively collect the image files found under ``root_dir``.

    A file counts as an image when its name ends with one of the suffixes in
    ``extensions``.

    Args:
        root_dir: Directory to walk.

    Returns:
        List of file paths (``root_dir`` joined with the relative location),
        in ``os.walk`` order. Empty when nothing matches or the directory
        does not exist.
    """
    # str.endswith accepts a tuple of suffixes. Matching the suffix (instead
    # of a substring test) stops names such as 'photo.jpg.bak' from being
    # picked up as images.
    suffixes = tuple(extensions)
    file_list = []
    for root, _directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith(suffixes):
                continue
            filepath = os.path.join(root, filename)
            if os.path.exists(filepath):
                file_list.append(filepath)
            else:
                # Listed by os.walk but not reachable (e.g. a broken link):
                # report the path and leave it out.
                print(filepath)
    return file_list

In [None]:
# path to the your datasets
root_dir = '/content/gdrive/MyDrive/image_search/101_ObjectCategories'
filenames = sorted(get_file_list(root_dir))
print(len(filenames))

8677


In [None]:
# tqdm.tqdm_notebook is deprecated (it prints a warning asking for
# tqdm.notebook.tqdm instead), so import the replacement for this cell.
from tqdm.notebook import tqdm as notebook_tqdm

# One extract_features call per image, in the same order as `filenames`.
standard_feature_list = [
    extract_features(filename, model) for filename in notebook_tqdm(filenames)
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


  0%|          | 0/8677 [00:00<?, ?it/s]

Now let's try the same with the Keras Image Generator functions.

In [None]:
batch_size = 128
datagen = ImageDataGenerator(preprocessing_function=preprocess_input)

# batch_size must be handed to the generator as well. Without it the generator
# uses its own default batch size (32 in Keras), while the step count below is
# computed for 128-image batches, so only a fraction of the images is read.
generator = datagen.flow_from_directory(root_dir,
                                        target_size=(224, 224),
                                        batch_size=batch_size,
                                        class_mode=None,
                                        shuffle=False)

num_images = len(generator.filenames)
# Despite its name, this is the number of batches (predict steps) needed to
# cover every image once.
num_epochs = int(math.ceil(num_images / batch_size))

start_time = time.time()
# Model.predict accepts the generator directly; predict_generator is deprecated.
feature_list = model.predict(generator, steps=num_epochs)
end_time = time.time()

Found 8677 images belonging to 101 classes.


  


In [None]:
root_dir  # display the dataset directory currently in use

'/content/gdrive/MyDrive/image_search/101_ObjectCategories'

In [None]:
len(feature_list[0])  # length of one feature vector (first row of feature_list)

101

In [None]:
# Flatten every image's features into one row, then L2-normalise all rows in
# a single vectorised step instead of a Python loop.
feature_list = feature_list.reshape(len(feature_list), -1)
row_norms = norm(feature_list, axis=1, keepdims=True)
# An all-zero row would otherwise be divided by zero and turn into NaNs;
# dividing by 1 leaves such a row unchanged.
row_norms[row_norms == 0] = 1
feature_list = feature_list / row_norms

print("Num images   = ", len(generator.classes))
print("Shape of feature_list = ", feature_list.shape)
print("Time taken in sec = ", end_time - start_time)

Num images   =  8677
Shape of feature_list =  (2176, 101)
Time taken in sec =  309.36880373954773


In [None]:
feature_list  # display the normalised feature matrix

array([[9.7593457e-01, 2.1806362e-01, 4.2282813e-09, ..., 5.4905178e-12,
        5.9562160e-09, 1.9910151e-07],
       [9.9999505e-01, 3.1596171e-03, 3.1561712e-11, ..., 1.5960328e-14,
        6.0038856e-11, 4.9669979e-09],
       [8.9359927e-01, 4.4886565e-01, 1.1590574e-08, ..., 1.5354574e-12,
        1.2531770e-09, 2.6928122e-09],
       ...,
       [7.5723401e-09, 3.3564134e-09, 1.0619596e-06, ..., 8.4453734e-07,
        6.0345948e-05, 2.9422119e-06],
       [3.2655710e-12, 2.6009330e-13, 1.6824890e-12, ..., 4.2382201e-10,
        2.3043820e-06, 3.5522481e-09],
       [7.3336793e-07, 1.0855513e-06, 3.2148044e-05, ..., 1.7060293e-06,
        7.7939819e-04, 2.6305843e-05]], dtype=float32)

In [None]:
# Prefix every file name reported by the generator with root_dir, so the list
# follows the generator's order.
filenames = [f"{root_dir}/{name}" for name in generator.filenames]

In [None]:
# Persist class ids, file paths and features. Each file is opened in a `with`
# block so its handle is flushed and closed as soon as the dump finishes.
with open('/content/gdrive/MyDrive/Reverse_Image_Search/class_ids-caltech101.pickle', 'wb') as class_ids_file:
    pickle.dump(generator.classes, class_ids_file)
with open('/content/gdrive/MyDrive/Reverse_Image_Search/filenames-caltech101.pickle', 'wb') as filenames_file:
    pickle.dump(filenames, filenames_file)
with open('/content/gdrive/MyDrive/Reverse_Image_Search/features-caltech101-' + model_architecture + '.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)

Step 5: Train the fine-tuned model and save it

In [None]:
# Number of training images and number of classes used for fine-tuning.
TRAIN_SAMPLES, NUM_CLASSES = 8677, 101
# Image size the training generator resizes to and the model takes as input.
IMG_WIDTH = IMG_HEIGHT = 224

In [None]:
# Training-time augmentation: random rotation, shifts and zoom, applied
# together with the same preprocess_input function used for feature extraction.
train_datagen = ImageDataGenerator(
    rotation_range=20,
    zoom_range=0.2,
    width_shift_range=0.2,
    height_shift_range=0.2,
    preprocessing_function=preprocess_input,
)

In [None]:
# Shuffled, labelled batches for training. batch_size is passed explicitly so
# the generator produces batches of the size that the training step count
# (computed from TRAIN_SAMPLES / batch_size) assumes; without it the generator
# falls back to its own default batch size.
train_generator = train_datagen.flow_from_directory(root_dir,
                                                    target_size=(IMG_WIDTH,
                                                                 IMG_HEIGHT),
                                                    batch_size=batch_size,
                                                    shuffle=True,
                                                    seed=12345,
                                                    class_mode='categorical')

Found 8677 images belonging to 101 classes.


In [None]:
def model_maker():
    """Build a classifier made of a frozen ResNet50 base and a small head.

    The base is created without its top layers and every one of its layers is
    marked non-trainable. The head is global average pooling, a 64-unit ReLU
    layer, dropout (0.5) and a softmax layer with NUM_CLASSES outputs.

    Returns:
        An uncompiled Keras ``Model`` taking (IMG_WIDTH, IMG_HEIGHT, 3) inputs.
    """
    base_model = ResNet50(weights='imagenet',
                          include_top=False,
                          input_shape=(IMG_WIDTH, IMG_HEIGHT, 3))
    # Freeze the pretrained base so that only the new head is trained.
    for layer in base_model.layers:
        layer.trainable = False

    # Named `inputs` rather than `input` to avoid shadowing the builtin.
    inputs = Input(shape=(IMG_WIDTH, IMG_HEIGHT, 3))
    x = base_model(inputs)
    x = GlobalAveragePooling2D()(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    predictions = Dense(NUM_CLASSES, activation='softmax')(x)
    return Model(inputs=inputs, outputs=predictions)

In [None]:
model_finetuned = model_maker()
model_finetuned.compile(loss='categorical_crossentropy',
                        optimizer=tensorflow.keras.optimizers.Adam(0.001),
                        metrics=['acc'])
# Model.fit accepts the generator directly; fit_generator is deprecated.
# len(train_generator) is the number of batches in one pass over the data, so
# each epoch covers the whole dataset whatever batch size the generator uses.
model_finetuned.fit(
    train_generator,
    steps_per_epoch=len(train_generator),
    epochs=10)

  


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f42700c3050>

In [None]:
# Persist the trained classifier to Google Drive as an .h5 file.
model_finetuned.save('/content/gdrive/MyDrive/Reverse_Image_Search/data/model-finetuned.h5')

  layer_config = serialize_layer_fn(layer)


In [None]:
start_time = time.time()
# Model.predict accepts the generator directly; predict_generator is deprecated.
# The outputs are the model's final softmax vectors (NUM_CLASSES values each).
feature_list_finetuned = model_finetuned.predict(generator, steps=num_epochs)
end_time = time.time()

# Flatten to one row per image and L2-normalise all rows at once.
feature_list_finetuned = feature_list_finetuned.reshape(
    len(feature_list_finetuned), -1)
row_norms = norm(feature_list_finetuned, axis=1, keepdims=True)
# Guard against dividing an all-zero row by zero, which would yield NaNs.
row_norms[row_norms == 0] = 1
feature_list_finetuned = feature_list_finetuned / row_norms

# NOTE: this rebinds feature_list, replacing the features computed earlier
# with the base model; the pickle cell that follows saves this variable.
feature_list = feature_list_finetuned

print("Num images   = ", len(feature_list_finetuned))
print("Shape of feature_list = ", feature_list.shape)
print("Time taken in sec = ", end_time - start_time)

  This is separate from the ipykernel package so we can avoid doing imports until


Num images   =  2176
Shape of feature_list =  (2176, 101)
Time taken in sec =  309.36880373954773


In [None]:
# Save the fine-tuned features; the `with` block closes the file handle once
# the dump is complete.
with open('/content/gdrive/MyDrive/Reverse_Image_Search/features-caltech101-resnet-finetuned.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)

# feature_extraction - Summary
- Step 1: Import the dataset
- Step 2: Create the model architecture
- Step 3: Create the feature-extraction function
- Step 4: Extract features from the REAL dataset and dump them with pickle
- Step 5: Train the fine-tuned model and save it

NEXT: Reverse Image Search #2 >>> **similarity-search**