In [28]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from numpy.linalg import norm
import pickle
from tqdm import tqdm_notebook
import os
import time

import tensorflow as tf
import random

from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt
import matplotlib.image as mpimg

from tensorflow.keras.preprocessing import image
from tensorflow.keras.applications.resnet50 import ResNet50,preprocess_input

from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

from PIL import Image
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from matplotlib.cbook import get_sample_data

In [3]:
def fearure_extraction(img,model):
    """Return a unit-length, flattened feature vector for one image file.

    Parameters
    ----------
    img : path accepted by ``image.load_img``
        Image to load; it is resized to 224x224 on load.
    model : object with a ``predict`` method
        Receives a single-image batch that has gone through
        ``preprocess_input``.

    Returns
    -------
    numpy.ndarray
        The flattened prediction divided by its own norm. There is no guard
        for a zero norm.
    """
    pil_image = image.load_img(img, target_size=(224, 224))
    # The model expects a batch, so add a leading axis of size 1.
    batch = np.expand_dims(image.img_to_array(pil_image), axis=0)
    activations = model.predict(preprocess_input(batch)).flatten()
    return activations / norm(activations)
    

In [4]:
# Upper bound on how many image files get_files() collects from the dataset folder.
sample_images = 5_000

In [5]:
# ResNet50 with ImageNet weights and no top (classification) layers; its output
# is what fearure_extraction() flattens into a feature vector.
model = ResNet50(
    weights='imagenet',
    include_top=False,
    input_shape=(224, 224, 3),
)

In [6]:
# Smoke test: extract features for a single image and print the vector length.
img_path = '../input/flickr-image-dataset/flickr30k_images/flickr30k_images/1000344755.jpg'
features = fearure_extraction(img_path,model)
print(len(features))

In [7]:
# Print every directory os.walk visits under the image folder.
# NOTE: the loop variable `root` stays bound at module level after the loop and
# is displayed by the following cell, so it must not be renamed.
for root,directories,filenames in os.walk('../input/flickr-image-dataset/flickr30k_images/flickr30k_images'):
    print(root)

In [8]:
# Display the last directory visited by the os.walk loop (its leftover loop variable).
root

In [9]:
def get_files(root_dir, max_files=None):
    """Return paths of files located directly inside ``root_dir``.

    Only the top-level directory is read; sub-directories are not descended
    into. Files are returned in the order ``os.walk`` reports them.

    Parameters
    ----------
    root_dir : str
        Directory to list.
    max_files : int, optional
        Maximum number of paths to return. Defaults to the notebook-level
        ``sample_images`` setting, which is only looked up when this argument
        is omitted.

    Returns
    -------
    list of str
        ``os.path.join(root, filename)`` for up to ``max_files`` files, or an
        empty list when ``root_dir`` yields nothing (e.g. it does not exist).
    """
    if max_files is None:
        max_files = sample_images
    for root, _directories, filenames in os.walk(root_dir):
        # The first item yielded by os.walk is root_dir itself; returning here
        # keeps the listing non-recursive.
        return [os.path.join(root, filename) for filename in filenames[:max_files]]
    return []
        

In [10]:
# Collect the image paths to index; get_files caps the count at sample_images.
root_dir = '../input/flickr-image-dataset/flickr30k_images/flickr30k_images/'
file_names = get_files(root_dir)

In [11]:
# Number of image paths collected.
len(file_names)

In [12]:
# One feature vector per image, in the same order as file_names, so a position
# in feature_list can be used directly as an index into file_names.
feature_list = [
    fearure_extraction(path, model)
    for path in tqdm_notebook(file_names, total=len(file_names))
]

In [13]:
# Brute-force Euclidean 5-nearest-neighbour index over all extracted features.
neighbors = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')
neighbors.fit(feature_list)

# Query the index with the first image's features, then display that image.
distances, indices = neighbors.kneighbors([feature_list[0]])

plt.imshow(mpimg.imread(file_names[0]))

In [14]:
# Neighbour indices for the single query; they are used below to index file_names.
indices[0]

In [15]:
# Show the image at the first returned neighbour index.
plt.imshow(mpimg.imread(file_names[indices[0][0]]))

The nearest neighbour is almost always the query image itself, so let's look at the second-closest image.

In [16]:
# Show the image at the second returned neighbour index.
plt.imshow(mpimg.imread(file_names[indices[0][1]]))

In [17]:
# Neighbour indices for the query again, for reference.
indices[0]

In [18]:
def plot_similar_images(image_paths,distances):
    """Show a query image next to its similar images in a single row.

    Parameters
    ----------
    image_paths : sequence of length 2
        ``image_paths[0]`` is the path of the query image and
        ``image_paths[1]`` is a sequence of paths of the similar images.
    distances : sequence of float
        ``distances[i]`` is printed in the title of the i-th similar image
        (counting from 1); ``distances[0]`` is never read.
    """
    query_path, similar_paths = image_paths[0], image_paths[1]
    # `picture` rather than `image` so the keras `image` module name used
    # elsewhere in the notebook is not shadowed inside this function.
    pictures = [mpimg.imread(query_path)]
    pictures.extend(mpimg.imread(path) for path in similar_paths)

    plt.figure(figsize=(20,10))
    # Keep the five-slot row for the usual 1 + 4 layout, but widen it when more
    # images are passed; a fixed 5 would make plt.subplot reject index 6.
    columns = max(5, len(pictures))
    for i, picture in enumerate(pictures):
        ax = plt.subplot(1, columns, i + 1)
        if i == 0:
            ax.set_title('Original Image')
        else:
            ax.set_title(f'Similar Image : {i} Distance : {distances[i]:.2f}')
        plt.imshow(picture)

In [20]:
# Plot the neighbours of six randomly chosen images.
for i in range(6):
    # random.randint(0, sample_images) includes its upper bound and ignores how
    # many files were actually found, so it could index past the end of
    # feature_list. randrange over the real length cannot.
    random_image_index = random.randrange(len(feature_list))
    distances, indices = neighbors.kneighbors([feature_list[random_image_index]])
    # Position 0 of the result is skipped; the notebook treats it as the query itself.
    similar_paths = [file_names[j] for j in indices[0][1:5]]
    image_paths = (file_names[random_image_index], similar_paths)
    plot_similar_images(image_paths,distances[0])
    

t-SNE to visualize clusters

In [35]:

num_feature_dimensions = 100
# Number of leading images embedded with t-SNE.
num_tsne_samples = 300
# Seed for both PCA and t-SNE so a re-run reproduces the same embedding.
random_seed = 42

selected_filenames = file_names[:num_tsne_samples]
tsne_input = feature_list[:num_tsne_samples]

# Reduce the features to num_feature_dimensions components before t-SNE.
pca = PCA(n_components=num_feature_dimensions, random_state=random_seed)
pca.fit(tsne_input)

feature_list_compressed = pca.transform(tsne_input)
selected_features = feature_list_compressed

tsne_results = TSNE(
    n_components=2,
    verbose=1,
    metric='euclidean',
    random_state=random_seed,
).fit_transform(selected_features)

In [36]:
def plot_images_in_2d(x, y, image_paths, axis=None, zoom=1):
    """Draw a thumbnail of each image at its (x, y) data coordinate.

    Parameters
    ----------
    x, y : scalar or array-like
        Coordinates, paired element-wise with ``image_paths``.
    image_paths : iterable of str
        Image files to open; each is shrunk in place to fit within 100x100
        pixels before being drawn.
    axis : matplotlib axes, optional
        Axes to draw on. Defaults to the current axes (``plt.gca()``).
    zoom : float, optional
        Scale factor passed to ``OffsetImage``.
    """
    if axis is None:
        # The original default of None crashed on axis.add_artist.
        axis = plt.gca()
    x, y = np.atleast_1d(x, y)
    for x0, y0, image_path in zip(x, y, image_paths):
        # The context manager closes the file once the thumbnail is wrapped.
        with Image.open(image_path) as image:
            # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
            image.thumbnail((100, 100), Image.LANCZOS)
            img = OffsetImage(image, zoom=zoom)
        anno_box = AnnotationBbox(img, (x0, y0),
                                  xycoords='data',
                                  frameon=False)
        axis.add_artist(anno_box)
    # Feed the points to the data limits so autoscale has something to fit.
    axis.update_datalim(np.column_stack([x, y]))
    axis.autoscale()

In [37]:
def show_tsne(x, y, selected_filenames):
    """Plot image thumbnails at the given 2-D coordinates on a 22x22 inch figure.

    ``x`` and ``y`` are passed straight to ``plot_images_in_2d`` together with
    ``selected_filenames``; thumbnails are drawn at zoom 0.3.
    """
    figure, axes = plt.subplots()
    figure.set_size_inches(22, 22, forward=True)
    plot_images_in_2d(x, y, selected_filenames, axis=axes, zoom=0.3)
    plt.show()

In [38]:
# Draw the thumbnails at their raw t-SNE coordinates (column 0 = x, column 1 = y).
show_tsne(tsne_results[:, 0], tsne_results[:, 1], selected_filenames)

In [39]:
def _min_max_scale(values):
    """Linearly rescale ``values`` to [0, 1]; a constant input maps to all zeros."""
    values = np.asarray(values)
    low = values.min()
    span = values.max() - low
    if span == 0:
        # Every point shares this coordinate. Dividing would give 0/0 = NaN,
        # which int() below cannot convert.
        return np.zeros(values.shape)
    return (values - low) / span


def tsne_to_grid_plotter_manual(x, y, selected_filenames, grid_size=2000, cell_size=100):
    """Plot thumbnails snapped to a square grid, at most one per grid cell.

    Both coordinate arrays are min-max scaled, stretched to
    ``grid_size - cell_size`` and rounded down to a multiple of ``cell_size``.
    When several images land on the same cell only the first one (in input
    order) is drawn.

    Parameters
    ----------
    x, y : array-like
        2-D embedding coordinates, one entry per file name.
    selected_filenames : sequence of str
        Image paths paired element-wise with ``x`` and ``y``.
    grid_size : int, optional
        Extent of the grid along each axis (was the hard-coded ``S = 2000``).
    cell_size : int, optional
        Edge length of one grid cell (was the hard-coded ``s = 100``).
    """
    x = _min_max_scale(x)
    y = _min_max_scale(y)

    x_values = []
    y_values = []
    filename_plot = []
    occupied_cells = set()
    for x_scaled, y_scaled, image_path in zip(x, y, selected_filenames):
        a = np.ceil(x_scaled * (grid_size - cell_size))
        b = np.ceil(y_scaled * (grid_size - cell_size))
        # Snap down to the nearest multiple of cell_size.
        cell = (int(a - np.mod(a, cell_size)), int(b - np.mod(b, cell_size)))
        if cell in occupied_cells:
            continue
        occupied_cells.add(cell)
        x_values.append(cell[0])
        y_values.append(cell[1])
        filename_plot.append(image_path)

    fig, axis = plt.subplots()
    fig.set_size_inches(22, 22, forward=True)
    plot_images_in_2d(x_values, y_values, filename_plot, zoom=.58, axis=axis)
    plt.show()

In [40]:
# Same embedding, snapped to a grid so that each cell shows at most one thumbnail.
tsne_to_grid_plotter_manual(tsne_results[:, 0], tsne_results[:, 1],
                            selected_filenames)