In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os, time, gc
import requests, shutil
from sklearn.neighbors import NearestNeighbors
import keras
from keras.preprocessing.image import load_img, img_to_array
from keras.models import load_model

%matplotlib inline

Using TensorFlow backend.


# Data Information

In [2]:
# Read the metadata table of each split (one row per image).
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/triplet/train.csv',
                     './data/triplet/validation.csv',
                     './data/triplet/test.csv')
)

# Table size of every split
for shape_label, split_df in (('Train:\t\t', train_df),
                              ('Validation:\t', val_df),
                              ('Test:\t\t', test_df)):
    print(shape_label, split_df.shape)

# Number of distinct landmark ids present in every split
for count_label, split_df in (('\nTrain Landmarks:\t', train_df),
                              ('Validation Landmarks:\t', val_df),
                              ('Test Landmarks:\t\t', test_df)):
    print(count_label, len(split_df['landmark_id'].unique()))

Train:		 (113783, 4)
Validation:	 (22255, 4)
Test:		 (22391, 4)

Train Landmarks:	 14943
Validation Landmarks:	 7674
Test Landmarks:		 14436


In [3]:
# Preview the first rows of the training metadata to check its columns
train_df.head()

Unnamed: 0,image_id,id,url,landmark_id
0,465272,a2ccf8ed2e969f6a,https://lh4.googleusercontent.com/-TPHkS5gzvm4...,0
1,64516,e205ca7c8dd7c027,https://lh3.googleusercontent.com/-V3RjsZtGpxE...,0
2,928409,4e8ab93c1620e8a3,http://mw2.google.com/mw-panoramio/photos/medi...,0
3,88809,896bf928214d1ca4,http://lh5.ggpht.com/-Cy0l41uUaGA/R--yB8vy41I/...,0
4,1001133,375d2a153bdca926,http://lh6.ggpht.com/-UqzFpnqE9bU/S_0u1RovfdI/...,0


# Load Features and Labels

In [4]:
# Load, split by split, the metadata table and its pre-computed feature matrix.
# The saved features are already normalized, so no re-scaling is done here.
train_df = pd.read_csv('./data/triplet/train.csv')
train_feature = np.load('./data/triplet/train_triplet_vgg16(3)_features.npy')

val_df = pd.read_csv('./data/triplet/validation.csv')
val_feature = np.load('./data/triplet/validation_triplet_vgg16(3)_features.npy')

test_df = pd.read_csv('./data/triplet/test.csv')
test_feature = np.load('./data/triplet/test_triplet_vgg16(3)_features.npy')

# Show feature-matrix and table shapes side by side for each split
for split_label, split_feature, split_df in (
        ('Train:\t\t', train_feature, train_df),
        ('Validation:\t', val_feature, val_df),
        ('Test:\t\t', test_feature, test_df)):
    print(split_label, split_feature.shape, split_df.shape)

Train:		 (113783, 512) (113783, 4)
Validation:	 (22255, 512) (22255, 4)
Test:		 (22391, 512) (22391, 4)


In [5]:
# Helper function
def accuracy(true_label, prediction, top=1):
    """Compute the top-k accuracy of ranked label predictions.

    A sample counts as correct when its true label appears among the first
    ``top`` entries of its prediction row.

    Parameters
    ----------
    true_label : array-like of shape (n_samples,)
        Ground-truth label of every sample.
    prediction : array-like of shape (n_samples, n_candidates)
        Candidate labels per sample, best candidate first.
    top : int, default 1
        Number of leading candidates per row that are considered.

    Returns
    -------
    float
        Fraction of samples whose true label is among the first ``top``
        candidates; ``0.0`` when there are no samples.

    Raises
    ------
    ValueError
        If ``true_label`` and ``prediction`` hold a different number of samples.
    """
    labels = np.asarray(true_label)
    ranked = np.asarray(prediction)
    n_samples = len(labels)

    if n_samples != len(ranked):
        raise ValueError(
            'true_label has %d samples but prediction has %d rows'
            % (n_samples, len(ranked)))
    if n_samples == 0:
        # Nothing to score; avoid dividing by zero
        return 0.0

    # Compare every row's leading candidates against that row's label at once
    hits = (ranked[:, :top] == labels.reshape(n_samples, 1)).any(axis=1)
    return int(np.count_nonzero(hits)) / n_samples

# Implement KNN Model

In [6]:
# Stack train and validation so the neighbor index covers both splits;
# the merged table gets a fresh 0..n-1 index matching the stacked feature rows.
train_val_feature = np.concatenate([train_feature, val_feature])
train_val_df = pd.concat([train_df, val_df], axis=0, ignore_index=True)

In [7]:
# Build the nearest-neighbor index over the merged train+validation features.
# Minkowski distance with p=2 is the Euclidean distance.
knn = NearestNeighbors(
    n_neighbors=50,
    algorithm='auto',
    leaf_size=30,
    metric='minkowski',
    p=2,
    n_jobs=-1,
)
knn.fit(train_val_feature)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=50, p=2, radius=1.0)

In [8]:
# Retrieve, for every test feature, the 50 nearest train/validation rows
# (the neighbor count comes from the n_neighbors the model was built with).
distance, neighbor_index = knn.kneighbors(test_feature, return_distance=True)

# Save the results; create the output folder first so the search above is
# not lost to a missing-directory error.
os.makedirs('./result', exist_ok=True)
np.save('./result/knn_triplet_vgg16(3)_distance.npy', distance)
np.save('./result/knn_triplet_vgg16(3)_neighbor.npy', neighbor_index)

### Search Neighbors

In [9]:
# Reload the cached search results
knn_distance = np.load('./result/knn_triplet_vgg16(3)_distance.npy')
knn_neighbor = np.load('./result/knn_triplet_vgg16(3)_neighbor.npy')

# Turn the neighbor row positions into landmark ids. The neighbor indices are
# positions in the matrix the model was fitted on, whose rows follow the order
# of train_val_df, so a single positional lookup on the label column yields
# the (n_test, 50) table of predicted landmark ids, nearest neighbor first.
train_val_labels = train_val_df['landmark_id'].values
predictions = train_val_labels[knn_neighbor]
np.save('./result/knn_triplet_vgg16(3)_test_prediction.npy', predictions)

### Compute Accuracy

In [10]:
# Report the top-k accuracy on the test split for a few cut-offs
test_labels = test_df['landmark_id'].values
for k in (1, 5, 10, 20):
    print('Top %2d accuracy:\t' % k, accuracy(test_labels, predictions, top=k))

Top  1 accuracy:	 0.377294448662409
Top  5 accuracy:	 0.5274440623464784
Top 10 accuracy:	 0.5901924880532357
Top 20 accuracy:	 0.6495020320664553


In [11]:
# Top-k accuracy on the test split for every k from 1 to 50
test_labels = test_df['landmark_id'].values
knn_acc = [accuracy(test_labels, predictions, top=k) for k in range(1, 51)]

# Persist the accuracy curve
np.save('./result/knn_triplet_vgg16(3)_accuracy.npy', knn_acc)