In [38]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os, time, gc
import requests, shutil
from sklearn.neighbors import NearestNeighbors
import keras
from keras.preprocessing.image import load_img, img_to_array
from keras.applications.vgg16 import VGG16

%matplotlib inline

# Data Information

In [2]:
train_df = pd.read_csv('./data/triplet/train.csv')
val_df = pd.read_csv('./data/triplet/validation.csv')
test_df = pd.read_csv('./data/triplet/test.csv')

print('Train:\t\t', train_df.shape)
print('Validation:\t', val_df.shape)
print('Test:\t\t', test_df.shape)

print('\nTrain Landmarks:\t', len(train_df['landmark_id'].unique()))
print('Validation Landmarks:\t', len(val_df['landmark_id'].unique()))
print('Test Landmarks:\t\t', len(test_df['landmark_id'].unique()))

Train:		 (113783, 4)
Validation:	 (22255, 4)
Test:		 (22391, 4)

Train Landmarks:	 14943
Validation Landmarks:	 7674
Test Landmarks:		 14436


In [3]:
train_df.head()

Unnamed: 0,image_id,id,url,landmark_id
0,465272,a2ccf8ed2e969f6a,https://lh4.googleusercontent.com/-TPHkS5gzvm4...,0
1,64516,e205ca7c8dd7c027,https://lh3.googleusercontent.com/-V3RjsZtGpxE...,0
2,928409,4e8ab93c1620e8a3,http://mw2.google.com/mw-panoramio/photos/medi...,0
3,88809,896bf928214d1ca4,http://lh5.ggpht.com/-Cy0l41uUaGA/R--yB8vy41I/...,0
4,1001133,375d2a153bdca926,http://lh6.ggpht.com/-UqzFpnqE9bU/S_0u1RovfdI/...,0


# Extract Features using VGG16

In [4]:
# Load pre-trained VGG16
img_size = (224, 224, 3)
vgg16 = VGG16(include_top=False, weights='imagenet', input_shape=img_size, pooling='avg')
vgg16.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 224, 224, 3)       0         
_________________________________________________________________
block1_conv1 (Conv2D)        (None, 224, 224, 64)      1792      
_________________________________________________________________
block1_conv2 (Conv2D)        (None, 224, 224, 64)      36928     
_________________________________________________________________
block1_pool (MaxPooling2D)   (None, 112, 112, 64)      0         
_________________________________________________________________
block2_conv1 (Conv2D)        (None, 112, 112, 128)     73856     
_________________________________________________________________
block2_conv2 (Conv2D)        (None, 112, 112, 128)     147584    
_________________________________________________________________
block2_pool (MaxPooling2D)   (None, 56, 56, 128)       0         
__________

In [5]:
# Define train_imgs and test_imgs
train_imgs = np.zeros(shape=(len(train_df), 512), dtype=np.float32)
val_imgs = np.zeros(shape=(len(val_df), 512), dtype=np.float32)
test_imgs = np.zeros(shape=(len(test_df), 512), dtype=np.float32)

In [6]:
# Process training images
img_ids = train_df['image_id'].values
steps = 20000
for i in range(0, len(train_df), steps):
    tmp_imgs = []
    print('\nProcess: {:10d}'.format(i))
    
    start = i
    end = min(len(train_df), i + steps)
    for idx in range(start, end):
        if idx % 250 == 0:
            print('=', end='')
            
        img_id = img_ids[idx]
        path = './data/triplet/train/' + str(img_id) + '.jpg'
        img = load_img(path, target_size=img_size[:2])
        img = img_to_array(img)
        tmp_imgs.append(img)
        
    tmp_imgs = np.array(tmp_imgs, dtype=np.float32) / 255.0
    tmp_prediction = vgg16.predict(tmp_imgs)
    train_imgs[start: end, ] = tmp_prediction
    _ = gc.collect()


Process:          0
Process:      20000
Process:      40000
Process:      60000
Process:      80000
Process:     100000

In [7]:
# Process validation images
img_ids = val_df['image_id'].values
steps = 4000
for i in range(0, len(val_df), steps):
    tmp_imgs = []
    print('\nProcess: {:10d}'.format(i))
    
    start = i
    end = min(len(val_df), i + steps)
    for idx in range(start, end):
        if idx % 50 == 0:
            print('=', end='')
            
        img_id = img_ids[idx]
        path = './data/triplet/validation/' + str(img_id) + '.jpg'
        img = load_img(path, target_size=img_size[:2])
        img = img_to_array(img)
        tmp_imgs.append(img)
        
    tmp_imgs = np.array(tmp_imgs, dtype=np.float32) / 255.0
    tmp_prediction = vgg16.predict(tmp_imgs)
    val_imgs[start: end, ] = tmp_prediction
    _ = gc.collect()


Process:          0
Process:       4000
Process:       8000
Process:      12000
Process:      16000
Process:      20000

In [9]:
# Process test images
img_ids = test_df['image_id'].values
steps = 4000
for i in range(0, len(test_df), steps):
    tmp_imgs = []
    print('\nProcess: {:10d}'.format(i))
    
    start = i
    end = min(len(test_df), i + steps)
    for idx in range(start, end):
        if idx % 50 == 0:
            print('=', end='')
            
        img_id = img_ids[idx]
        path = './data/triplet/test/' + str(img_id) + '.jpg'
        img = load_img(path, target_size=img_size[:2])
        img = img_to_array(img)
        tmp_imgs.append(img)
        
    tmp_imgs = np.array(tmp_imgs, dtype=np.float32) / 255.0
    tmp_prediction = vgg16.predict(tmp_imgs)
    test_imgs[start: end, ] = tmp_prediction
    _ = gc.collect()


Process:          0
Process:       4000
Process:       8000
Process:      12000
Process:      16000
Process:      20000

In [10]:
print('Train:\t\t', train_imgs.shape)
print('Validation:\t', val_imgs.shape)
print('Test:\t\t', test_imgs.shape)

Train:		 (113783, 512)
Validation:	 (22255, 512)
Test:		 (22391, 512)


In [12]:
# Save to disk
np.save('./data/triplet/train_vgg16_features.npy', train_imgs)
np.save('./data/triplet/validation_vgg16_features.npy', val_imgs)
np.save('./data/triplet/test_vgg16_features.npy', test_imgs)

# Load Features and Labels

In [26]:
train_feature = np.load('./data/triplet/train_vgg16_features.npy')
val_feature = np.load('./data/triplet/validation_vgg16_features.npy')
test_feature = np.load('./data/triplet/test_vgg16_features.npy')

train_df = pd.read_csv('./data/triplet/train.csv')
val_df = pd.read_csv('./data/triplet/validation.csv')
test_df = pd.read_csv('./data/triplet/test.csv')

print('Train:\t\t', train_feature.shape, train_df.shape)
print('Validation:\t', val_feature.shape, val_df.shape)
print('Test:\t\t', test_feature.shape, test_df.shape)

Train:		 (113783, 512) (113783, 4)
Validation:	 (22255, 512) (22255, 4)
Test:		 (22391, 512) (22391, 4)


In [27]:
# Helper function
def accuracy(true_label, prediction, top=1):
    """ function to calculate the prediction accuracy """
    prediction = prediction[:, :top]
    count = 0
    for i in range(len(true_label)):
        if true_label[i] in prediction[i]:
            count += 1
            
    return count / len(true_label)

# 1. Random Guess

In [28]:
seed = 42
random_guess = np.random.choice(train_df['landmark_id'].values, 
                                size=(len(test_df), 50), replace=True)
np.save('./result/random_guess_test_neighbor_prediction.npy')

In [29]:
print('Top  1 accuracy:\t', accuracy(test_df['landmark_id'].values, random_guess, top=1))
print('Top  5 accuracy:\t', accuracy(test_df['landmark_id'].values, random_guess, top=5))
print('Top 10 accuracy:\t', accuracy(test_df['landmark_id'].values, random_guess, top=10))
print('Top 20 accuracy:\t', accuracy(test_df['landmark_id'].values, random_guess, top=20))

Top  1 accuracy:	 0.00013398240364432138
Top  5 accuracy:	 0.0003572864097181903
Top 10 accuracy:	 0.0006252512170068331
Top 20 accuracy:	 0.0013844848376579875


# 2. Implement KNN Model

In [32]:
# Normalize features
train_norm = np.linalg.norm(train_feature, axis=1, keepdims=True)
val_norm = np.linalg.norm(val_feature, axis=1, keepdims=True)
test_norm = np.linalg.norm(test_feature, axis=1, keepdims=True)

train_norm_feature = train_feature / train_norm
val_norm_feature = val_feature / val_norm
test_norm_feature = test_feature / test_norm

In [54]:
# Merge train and validation features
train_val_norm_feature = np.concatenate((train_norm_feature, val_norm_feature), axis=0)
train_val_df = pd.concat((train_df, val_df), axis=0)
train_val_df = train_val_df.reset_index(drop=True)

In [39]:
# Implement KNN model
knn = NearestNeighbors(n_neighbors=50, algorithm='auto', leaf_size=30, 
                       metric='minkowski', p=2, n_jobs=-1)
knn.fit(train_val_norm_feature)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=50, p=2, radius=1.0)

In [40]:
# Search the first 50 neighbors
distance, neighbor_index = knn.kneighbors(test_norm_feature, return_distance=True)

# Save the results
np.save('./result/knn_vgg16_distance.npy', distance)
np.save('./result/knn_vgg16_neighbor.npy', neighbor_index)

### Search Neighbors

In [42]:
knn_distance = np.load('./result/knn_vgg16_distance.npy')
knn_neighbor = np.load('./result/knn_vgg16_neighbor.npy')

# Get the first 50 neighbors
predictions = []
for neighbors in knn_neighbor:
    predictions.append(train_val_df.loc[neighbors]['landmark_id'].values)

predictions = np.array(predictions)
np.save('./result/knn_vgg16_test_neighbor_prediction.npy', predictions)

### Compute Accuracy

In [43]:
print('Top  1 accuracy:\t', accuracy(test_df['landmark_id'].values, predictions, top=1))
print('Top  5 accuracy:\t', accuracy(test_df['landmark_id'].values, predictions, top=5))
print('Top 10 accuracy:\t', accuracy(test_df['landmark_id'].values, predictions, top=10))
print('Top 20 accuracy:\t', accuracy(test_df['landmark_id'].values, predictions, top=20))

IndexError: too many indices for array