### INSTALLING FAISS LIBRARY

In [None]:
pip install faiss-cpu

Collecting faiss-cpu
[?25l  Downloading https://files.pythonhosted.org/packages/1d/84/9de38703486d9f00b1a63590887a318d08c52f10f768968bd7626aee75da/faiss_cpu-1.6.3-cp36-cp36m-manylinux2010_x86_64.whl (7.2MB)
[K     |████████████████████████████████| 7.2MB 3.2MB/s 
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.6.3


In [None]:
import os
import pickle
import time

import faiss
import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from numpy.linalg import norm
from tensorflow.keras.applications.resnet50 import ResNet50, preprocess_input
from tensorflow.keras.preprocessing import image
from tqdm import tqdm, tqdm_notebook
from tqdm.notebook import tqdm as notebook_tqdm

### FEATURE EXTRACTION

In [None]:
# ResNet50 with ImageNet weights and without its classification head, built
# for 180x180 RGB inputs; its output is used as the image feature map.
model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(180, 180, 3),
)
def extract_features(img_path, model, target_size=(180, 180)):
    """Return an L2-normalised, flattened feature vector for one image.

    Args:
        img_path: Path of the image file to load.
        model: Model whose ``predict`` output is used as the feature map.
        target_size: ``(height, width)`` the image is resized to on load.
            Defaults to ``(180, 180)``, the size that used to be hard-coded
            in this function.

    Returns:
        A 1-D array: the flattened prediction divided by its L2 norm. When
        the norm is zero the un-normalised (all-zero) vector is returned,
        instead of the NaNs a division by zero would produce.
    """
    img = image.load_img(img_path, target_size=target_size)
    img_array = image.img_to_array(img)
    # The model predicts on batches, so add a leading batch axis of size 1.
    batch = np.expand_dims(img_array, axis=0)
    features = model.predict(preprocess_input(batch))
    flattened_features = features.flatten()
    length = norm(flattened_features)
    if length == 0:
        # Nothing to scale; avoid filling the vector with NaN.
        return flattened_features
    return flattened_features / length

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/resnet/resnet50_weights_tf_dim_ordering_tf_kernels_notop.h5


In [None]:
extensions = ['.jpg', '.JPG', '.jpeg', '.JPEG', '.png', '.PNG']


def get_file_list(root_dir):
    """Recursively collect the image files found under ``root_dir``.

    A file is kept when its name *ends with* one of ``extensions``, compared
    case-insensitively. The previous substring test (``ext in filename``)
    also accepted names such as ``photo.jpg.txt``.

    Args:
        root_dir: Directory to walk.

    Returns:
        List of paths (``root`` joined with the file name) in the order
        ``os.walk`` yields them. Empty if nothing matches or the directory
        does not exist.
    """
    # str.endswith accepts a tuple of candidate suffixes.
    suffixes = tuple({ext.lower() for ext in extensions})
    file_list = []
    for root, _directories, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.lower().endswith(suffixes):
                file_list.append(os.path.join(root, filename))
    return file_list

In [None]:
# Collect every image path below the current working directory and put the
# list in sorted order.
root_dir = './'
filenames = get_file_list(root_dir)
filenames.sort()

In [None]:
# One feature vector per image, in the same order as `filenames`.
# `notebook_tqdm` is `tqdm.notebook.tqdm`, the replacement that the
# deprecation warning for `tqdm_notebook` asks for.
feature_list = [
    extract_features(path, model)
    for path in notebook_tqdm(filenames, desc='Extracting features')
]

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=432.0), HTML(value='')))




In [None]:
# Preview the first few vectors only; displaying the whole collection floods the notebook.
feature_list[:3]

[array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00276322], dtype=float32),
 array([0.      , 0.      , 0.      , ..., 0.      , 0.      , 0.000458],
       dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00105173], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00674423], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01125889], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01191601], dtype=float32),
 array([0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.01042833], dtype=float32),
 array([0., 0., 0., ..., 0., 0., 0.], dtype=float32),
 array([0., 0., 0., ..., 0.

In [None]:
# Stack the per-image vectors into one 2-D matrix. FAISS expects float32,
# C-contiguous input, so enforce both here rather than relying on what the
# model happened to return.
feature_list = np.ascontiguousarray(feature_list, dtype=np.float32)

In [None]:
# Check the container type of `feature_list` after the array conversion.
type(feature_list)

numpy.ndarray

In [None]:
# Shape of the feature matrix; the recorded run shows (432, 73728).
feature_list.shape

(432, 73728)

### FAISS IMPLEMENTATION

In [None]:
db_vectors = feature_list  # image feature vectors that form the searchable database
# Read both sizes off the data instead of hard-coding them, so they always
# match what was actually extracted.
n, dimension = db_vectors.shape  # number of vectors, dimensions of each vector
assert n == len(filenames), "feature matrix and filename list are out of sync"
np.random.seed(1)  # kept from the original cell; nothing in this cell draws random numbers

In [None]:
# Shape of the database matrix that will be indexed.
db_vectors.shape

(432, 73728)

In [None]:
# Build an inverted-file (IVF) index over the image vectors.
# `nlist` is the number of clusters the vectors are partitioned into.
nlist = 1
# Flat L2 index handed to the IVF index as its quantiser.
quantiser = faiss.IndexFlatL2(dimension)
index = faiss.IndexIVFFlat(quantiser, dimension, nlist, faiss.METRIC_L2)

In [None]:
# Train the index, then add the database vectors, printing the index state
# before and after each step.
print(index.is_trained)   # False in the recorded run: not trained yet
index.train(db_vectors)  # train on the database vectors
print(index.ntotal)   # 0 in the recorded run: nothing has been added yet
index.add(db_vectors)   # add the vectors and update the index
print(index.is_trained)  # True in the recorded run, after train()
print(index.ntotal)   # number of stored vectors (432 in the recorded run)

False
0
True
432


In [None]:
nprobe = 1  # number of IVF clusters visited per query
k = 10  # number of nearest neighbours returned per query
np.random.seed(0)  # kept from the original cell; nothing in this cell draws random numbers
# Every database image is also used as a query.
query_vectors = feature_list
n_query = query_vectors.shape[0]  # derived from the data instead of hard-coded
# Apply the setting to the index; previously `nprobe` was defined but never
# used, so the search silently ran with the index's own default.
index.nprobe = nprobe
distances, indices = index.search(query_vectors, k)

In [None]:
# Distances returned by the search: one row per query, one column per neighbour.
distances

array([[0.        , 1.011936  , 1.0698938 , ..., 1.1142869 , 1.1175389 ,
        1.1187916 ],
       [0.        , 0.        , 0.        , ..., 1.2896417 , 1.2913423 ,
        1.2913423 ],
       [0.        , 0.6672239 , 0.7107788 , ..., 1.0717432 , 1.0772917 ,
        1.0856639 ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.34191236, 0.34191236,
        0.34191236],
       [0.        , 0.        , 0.        , ..., 0.36901706, 0.36901706,
        0.36901706],
       [0.        , 1.0548626 , 1.1217254 , ..., 1.1887734 , 1.2136776 ,
        1.2143064 ]], dtype=float32)

In [None]:
# Number of rows in `distances` (432 in the recorded run).
len(distances)

432

In [None]:
# Neighbour positions returned by the search, aligned with `distances`.
indices

array([[  0, 396, 130, ..., 116, 418, 249],
       [219,   1, 335, ..., 389, 394, 339],
       [  2, 218, 217, ..., 421, 131, 287],
       ...,
       [234, 429, 386, ..., 279, 308, 205],
       [349, 235, 202, ..., 188, 309, 281],
       [431,  40, 120, ...,  48, 157, 153]])

In [None]:
# Save the index to disk ...
faiss.write_index(index, "vector.index")
# ... and load it back from the same file.
diskindex = faiss.read_index("vector.index")

### EXTRACTING OUTPUT INTO CSV

In [None]:
# Persist the feature matrix and the matching path list. The `with` blocks
# guarantee the files are flushed and closed; the bare `open(...)` calls
# used before left the handles open.
with open('features-caltech101-resnet.pickle', 'wb') as features_file:
    pickle.dump(feature_list, features_file)
with open('filenames-caltech101.pickle', 'wb') as filenames_file:
    pickle.dump(filenames, filenames_file)

In [None]:
# Reload the path list and feature matrix written by the previous cell.
# NOTE: pickle.load can execute arbitrary code; only load files this
# notebook produced itself.
with open('filenames-caltech101.pickle', 'rb') as filenames_file:
    filenames = pickle.load(filenames_file)
with open('features-caltech101-resnet.pickle', 'rb') as features_file:
    feature_list = pickle.load(features_file)

In [None]:
# `imagenames` is not assigned in any visible cell, so on a fresh kernel this
# cell raised NameError. Bind it to the reloaded path list, then preview it.
imagenames = filenames
imagenames[:10]

['./1046233_0.jpg',
 './1076297_0.jpg',
 './1082183_0.jpg',
 './1088984_0.jpg',
 './1111645_0.jpg',
 './1129805_0.jpg',
 './1170894_0.jpg',
 './1170894_1.jpg',
 './1211240_0.jpg',
 './1211240_1.jpg',
 './1238122_0.jpg',
 './1238122_1.jpg',
 './130374_0.jpg',
 './130374_1.jpg',
 './130374_2.jpg',
 './130374_3.jpg',
 './1306638_0.jpg',
 './1306638_1.jpg',
 './1323502_0.jpg',
 './1323502_1.jpg',
 './1323502_2.jpg',
 './1323502_3.jpg',
 './1360443_0.jpg',
 './1360443_1.jpg',
 './1360443_2.jpg',
 './1360443_3.jpg',
 './1370619_0.jpg',
 './1396752_0.jpg',
 './1396752_1.jpg',
 './1416105_0.jpg',
 './1417301_0.jpg',
 './1417301_1.jpg',
 './1418639_0.jpg',
 './1418639_1.jpg',
 './1418639_2.jpg',
 './1420572_0.jpg',
 './1422091_0.jpg',
 './1456671_0.jpg',
 './1473075_0.jpg',
 './1473075_1.jpg',
 './1478575_0.jpg',
 './1527720_0.jpg',
 './1527720_1.jpg',
 './1527720_2.jpg',
 './1576123_0.jpg',
 './1576123_1.jpg',
 './1576123_2.jpg',
 './1579564_0.jpg',
 './1613848_0.jpg',
 './161551_0.jpg',
 './1

In [None]:
# Strip the directory part of every path. This reads `filenames` directly
# (the previous source, `imagenames`, is not defined in any visible cell) and
# uses os.path.basename, because split('/')[1] returns a directory name as
# soon as an image sits in a sub-folder.
imagename = [os.path.basename(path) for path in filenames]

In [None]:
# Preview the first few bare file names instead of printing the whole list.
imagename[:10]

['1046233_0.jpg',
 '1076297_0.jpg',
 '1082183_0.jpg',
 '1088984_0.jpg',
 '1111645_0.jpg',
 '1129805_0.jpg',
 '1170894_0.jpg',
 '1170894_1.jpg',
 '1211240_0.jpg',
 '1211240_1.jpg',
 '1238122_0.jpg',
 '1238122_1.jpg',
 '130374_0.jpg',
 '130374_1.jpg',
 '130374_2.jpg',
 '130374_3.jpg',
 '1306638_0.jpg',
 '1306638_1.jpg',
 '1323502_0.jpg',
 '1323502_1.jpg',
 '1323502_2.jpg',
 '1323502_3.jpg',
 '1360443_0.jpg',
 '1360443_1.jpg',
 '1360443_2.jpg',
 '1360443_3.jpg',
 '1370619_0.jpg',
 '1396752_0.jpg',
 '1396752_1.jpg',
 '1416105_0.jpg',
 '1417301_0.jpg',
 '1417301_1.jpg',
 '1418639_0.jpg',
 '1418639_1.jpg',
 '1418639_2.jpg',
 '1420572_0.jpg',
 '1422091_0.jpg',
 '1456671_0.jpg',
 '1473075_0.jpg',
 '1473075_1.jpg',
 '1478575_0.jpg',
 '1527720_0.jpg',
 '1527720_1.jpg',
 '1527720_2.jpg',
 '1576123_0.jpg',
 '1576123_1.jpg',
 '1576123_2.jpg',
 '1579564_0.jpg',
 '1613848_0.jpg',
 '161551_0.jpg',
 '1625168_0.jpg',
 '1625168_1.jpg',
 '1679426_0.jpg',
 '1679426_1.jpg',
 '1692871_0.jpg',
 '1693803_0.jpg

In [None]:
# Single-column table of image file names; the row label is the image's
# position in `imagename`.
df = pd.DataFrame({'images': imagename})

In [None]:
# Display the table of image file names.
df

Unnamed: 0,images
0,1046233_0.jpg
1,1076297_0.jpg
2,1082183_0.jpg
3,1088984_0.jpg
4,1111645_0.jpg
...,...
427,917099_0.jpg
428,948745_0.jpg
429,948745_1.jpg
430,948745_2.jpg


In [None]:
# Wrap the neighbour-index matrix in a DataFrame: one row per query image,
# one column per returned neighbour.
df_indices = pd.DataFrame(data=indices)
df_indices

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,396,130,162,216,161,50,116,418,249
1,219,1,335,264,146,255,223,389,394,339
2,2,218,217,37,130,418,254,421,131,287
3,3,315,14,13,15,334,158,333,291,231
4,4,160,214,418,363,423,260,216,417,116
...,...,...,...,...,...,...,...,...,...,...
427,427,417,418,363,288,61,108,396,206,160
428,428,385,233,200,347,362,310,280,204,187
429,234,429,386,201,348,186,360,279,308,205
430,349,235,202,430,387,203,361,188,309,281


In [None]:
# Mapping from row label to image file name, taken from the 'images' column.
df_dict = df['images'].to_dict()

In [None]:
# Preview the first few entries of the mapping instead of printing all of it.
dict(list(df_dict.items())[:10])

{0: '1046233_0.jpg',
 1: '1076297_0.jpg',
 2: '1082183_0.jpg',
 3: '1088984_0.jpg',
 4: '1111645_0.jpg',
 5: '1129805_0.jpg',
 6: '1170894_0.jpg',
 7: '1170894_1.jpg',
 8: '1211240_0.jpg',
 9: '1211240_1.jpg',
 10: '1238122_0.jpg',
 11: '1238122_1.jpg',
 12: '130374_0.jpg',
 13: '130374_1.jpg',
 14: '130374_2.jpg',
 15: '130374_3.jpg',
 16: '1306638_0.jpg',
 17: '1306638_1.jpg',
 18: '1323502_0.jpg',
 19: '1323502_1.jpg',
 20: '1323502_2.jpg',
 21: '1323502_3.jpg',
 22: '1360443_0.jpg',
 23: '1360443_1.jpg',
 24: '1360443_2.jpg',
 25: '1360443_3.jpg',
 26: '1370619_0.jpg',
 27: '1396752_0.jpg',
 28: '1396752_1.jpg',
 29: '1416105_0.jpg',
 30: '1417301_0.jpg',
 31: '1417301_1.jpg',
 32: '1418639_0.jpg',
 33: '1418639_1.jpg',
 34: '1418639_2.jpg',
 35: '1420572_0.jpg',
 36: '1422091_0.jpg',
 37: '1456671_0.jpg',
 38: '1473075_0.jpg',
 39: '1473075_1.jpg',
 40: '1478575_0.jpg',
 41: '1527720_0.jpg',
 42: '1527720_1.jpg',
 43: '1527720_2.jpg',
 44: '1576123_0.jpg',
 45: '1576123_1.jpg',
 4

In [None]:
# Show the neighbour table again, still holding integer positions at this point.
df_indices

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0,396,130,162,216,161,50,116,418,249
1,219,1,335,264,146,255,223,389,394,339
2,2,218,217,37,130,418,254,421,131,287
3,3,315,14,13,15,334,158,333,291,231
4,4,160,214,418,363,423,260,216,417,116
...,...,...,...,...,...,...,...,...,...,...
427,427,417,418,363,288,61,108,396,206,160
428,428,385,233,200,347,362,310,280,204,187
429,234,429,386,201,348,186,360,279,308,205
430,349,235,202,430,387,203,361,188,309,281


In [None]:
# Swap every integer position in the neighbour table for the file name that
# `df_dict` maps it to; values without a key in `df_dict` are left as they are.
df_indices = df_indices.replace(to_replace=df_dict)
df_indices

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1046233_0.jpg,6505789_0.jpg,2755487_0.jpg,3169236_1.jpg,3945315_0.jpg,3169236_0.jpg,1625168_0.jpg,2601997_0.jpg,698940_1.jpg,4391117_0.jpg
1,3996265_0.jpg,1076297_0.jpg,5669633_0.jpg,4768716_0.jpg,2853133_0.jpg,4418633_0.jpg,4024812_0.jpg,6359436_1.jpg,6460827_0.jpg,5755533_0.jpg
2,1082183_0.jpg,395560_0.jpg,3950161_0.jpg,1456671_0.jpg,2755487_0.jpg,698940_1.jpg,4412540_0.jpg,739325_0.jpg,2771913_0.jpg,5101613_0.jpg
3,1088984_0.jpg,5456115_2.jpg,130374_2.jpg,130374_1.jpg,130374_3.jpg,5651469_1.jpg,3080655_0.jpg,5651469_0.jpg,5219922_0.jpg,4146518_1.jpg
4,1111645_0.jpg,3154838_0.jpg,3929938_0.jpg,698940_1.jpg,6007737_0.jpg,766234_0.jpg,4530827_0.jpg,3945315_0.jpg,698940_0.jpg,2601997_0.jpg
...,...,...,...,...,...,...,...,...,...,...
427,917099_0.jpg,698940_0.jpg,698940_1.jpg,6007737_0.jpg,510802_0.jpg,1864536_0.jpg,2503614_0.jpg,6505789_0.jpg,3785080_0.jpg,3154838_0.jpg
428,948745_0.jpg,6357292_0.jpg,417077_0.jpg,3766409_0.jpg,5805839_0.jpg,598876_2.jpg,5430819_2.jpg,5057350_1.jpg,3779080_1.jpg,3518216_1.jpg
429,417077_1.jpg,948745_1.jpg,6357292_1.jpg,3766409_1.jpg,5805839_1.jpg,3518216_0.jpg,598876_0.jpg,5057350_0.jpg,5430819_0.jpg,3779080_2.jpg
430,5805839_2.jpg,417077_2.jpg,3766409_2.jpg,948745_2.jpg,6357292_2.jpg,3779080_0.jpg,598876_1.jpg,3518216_2.jpg,5430819_1.jpg,5057350_2.jpg


In [None]:
# Always write the result table to disk first.
df_indices.to_csv('FAISS.csv')

# The browser download helper only exists on Google Colab. Importing it
# unconditionally made this cell fail elsewhere before the CSV was written.
try:
    from google.colab import files
except ImportError:
    files = None

if files is not None:
    files.download('FAISS.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>