# Note

* Use the features extracted with a pre-trained VGG16 model
* Fit the KNN model directly on the whole training set
* Find the 20 nearest neighbors of every test image

In [1]:
import warnings
# Suppress every warning for the rest of the session. This also hides
# numerical RuntimeWarnings and library deprecation notices.
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from scipy.stats import mode
from sklearn.neighbors import NearestNeighbors

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

# Extract Features with VGG16

# Load Features and Labels

In [2]:
# Metadata tables; the cells below read their `landmark_id` column.
train_df, test_df = (
    pd.read_csv('./data/knn/train.csv'),
    pd.read_csv('./data/knn/test.csv'),
)

# Pre-computed feature matrices stored as .npy files.
# NOTE(review): rows are assumed to be in the same order as the CSV rows -- confirm.
train_feature, test_feature = (
    np.load('./data/knn/train_feature.npy'),
    np.load('./data/knn/test_feature.npy'),
)

In [3]:
# Show the feature-matrix shape next to the metadata-table shape per split.
for tag, features, frame in (
    ('Train:\t', train_feature, train_df),
    ('Test:\t', test_feature, test_df),
):
    print(tag, features.shape, frame.shape)

Train:	 (115210, 2048) (115210, 3)
Test:	 (22158, 2048) (22158, 3)


In [4]:
# Preview the first five rows of the training metadata.
train_df.head(n=5)

Unnamed: 0,id,url,landmark_id
0,e205ca7c8dd7c027,https://lh3.googleusercontent.com/-V3RjsZtGpxE...,0
1,a46e23a50971b2ae,http://lh3.ggpht.com/-Sh6ivfJthkY/TlfeUyfkShI/...,0
2,1886e9f023806d4a,https://lh3.googleusercontent.com/-iTVvmeGB5gY...,0
3,2d2f210598b07e7f,https://lh3.googleusercontent.com/-L1nwXCRBwdU...,0
4,9c72c5aa9dc30c61,http://lh3.ggpht.com/-qDuF6QmmLh0/SgnLqstPCVI/...,0


# 0. Random Guess

In [6]:
# Helper function
def accuracy(true_label, prediction, top=1):
    """Return the top-`top` accuracy of ranked label predictions.

    A sample counts as correct when its true label appears anywhere among
    the first `top` candidates of its prediction row.

    Parameters
    ----------
    true_label : array-like of shape (n_samples,)
        Ground-truth label of every sample.
    prediction : array-like of shape (n_samples, n_candidates) or (n_samples,)
        Candidate labels per sample, best candidate first. A 1-D input is
        treated as one candidate per sample.
    top : int, default 1
        Number of leading candidates to consider.

    Returns
    -------
    float
        Fraction of samples whose true label is among the first `top`
        candidates.

    Raises
    ------
    ValueError
        If `true_label` and `prediction` cover a different number of samples.
    ZeroDivisionError
        If `true_label` is empty.
    """
    true_label = np.asarray(true_label)
    prediction = np.asarray(prediction)

    # Accept a flat vector of single predictions (e.g. a 1-D mode result)
    # by viewing it as one candidate column.
    if prediction.ndim == 1:
        prediction = prediction.reshape(-1, 1)

    n_samples = len(true_label)
    if prediction.shape[0] != n_samples:
        raise ValueError(
            'true_label has {} samples but prediction has {} rows'.format(
                n_samples, prediction.shape[0]))

    prediction = prediction[:, :top]

    # Compare every candidate with its row's true label in one vectorised
    # step instead of looping over samples in Python.
    hits = (prediction == true_label[:, np.newaxis]).any(axis=1)

    return int(np.count_nonzero(hits)) / n_samples

### Random guessing

In [7]:
seed = 42
# Draw from a generator seeded with `seed` so that the random baseline is
# reproducible across kernel restarts.
rng = np.random.RandomState(seed)
# Sampling the training labels with replacement gives 20 guesses per test
# image, distributed like the training-set label frequencies.
random_guess = rng.choice(train_df['landmark_id'].values,
                          size=(len(test_df), 20), replace=True)

In [10]:
# Report the top-k accuracy of the random baseline for each cutoff.
true_labels = test_df['landmark_id'].values
for label, k in (
    ('Top  1 accuracy:\t', 1),
    ('Top  5 accuracy:\t', 5),
    ('Top 10 accuracy:\t', 10),
    ('Top 20 accuracy:\t', 20),
):
    print(label, accuracy(true_labels, random_guess, top=k))

Top  1 accuracy:	 4.5130426933838797e-05
Top  5 accuracy:	 0.00031591298853687156
Top 10 accuracy:	 0.0005866955501399044
Top 20 accuracy:	 0.0014893040888166802


# 1. Implement KNN Model (unnormalized features)

In [5]:
# Implement KNN model
# Minkowski metric with p=2 (Euclidean), 20 neighbors per query.
knn_params = dict(n_neighbors=20, algorithm='auto', leaf_size=30,
                  metric='minkowski', p=2, n_jobs=-1)
knn = NearestNeighbors(**knn_params)
knn.fit(train_feature)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [15]:
# Search the first 20 neighbors
neighbor_search = knn.kneighbors(test_feature, return_distance=True)
distance, neighbor_index = neighbor_search

# Save the results
for out_path, out_array in (
    ('./result/knn_naive_distance.npy', distance),
    ('./result/knn_naive_neighbor_index.npy', neighbor_index),
):
    np.save(out_path, out_array)

## Search Neighbors

In [5]:
knn_distance = np.load('./result/knn_naive_distance.npy')
knn_neighbor = np.load('./result/knn_naive_neighbor_index.npy')

# Get the first 20 neighbors
# The saved neighbor indices are row positions in the fitted training matrix,
# so index the label array by position. A single fancy-indexing step maps the
# whole (n_test, n_neighbors) index matrix to labels without a Python loop,
# and it does not depend on the DataFrame's index labels.
train_labels = train_df['landmark_id'].values
predictions = train_labels[knn_neighbor]

## Calculate Accuracy

### Mode accuracy

In [46]:
# Most frequent label among each test image's neighbors.
prediction_mode = mode(predictions, axis=1)
# Depending on the SciPy version, the mode array is either (n, 1) or (n,).
# `accuracy` slices its second argument as a 2-D array, so force a column.
mode_label = np.asarray(prediction_mode[0]).reshape(-1, 1)
print('Mode accuracy:\t', accuracy(test_df['landmark_id'].values, mode_label, top=1))

Mode accuracy:	 0.291993862261937


### Top k accuracy

In [11]:
# Report the top-k accuracy of the KNN label predictions for each cutoff.
true_labels = test_df['landmark_id'].values
for label, k in (
    ('Top  1 accuracy:\t', 1),
    ('Top  5 accuracy:\t', 5),
    ('Top 10 accuracy:\t', 10),
    ('Top 20 accuracy:\t', 20),
):
    print(label, accuracy(true_labels, predictions, top=k))

Top  1 accuracy:	 0.3627132412672624
Top  5 accuracy:	 0.5096579113638415
Top 10 accuracy:	 0.5684177272316996
Top 20 accuracy:	 0.6281704124921021


# 2. Implement KNN Model (normalized features)

In [6]:
# Normalize features
train_norm = np.linalg.norm(train_feature, axis=1, keepdims=True)
test_norm = np.linalg.norm(test_feature, axis=1, keepdims=True)
train_feature = train_feature / train_norm
test_feature = test_feature / test_norm

In [10]:
# Implement KNN model
knn = NearestNeighbors(n_neighbors=20, algorithm='auto', leaf_size=30, 
                       metric='minkowski', p=2, n_jobs=-1)
knn.fit(train_feature)

NearestNeighbors(algorithm='auto', leaf_size=30, metric='minkowski',
         metric_params=None, n_jobs=-1, n_neighbors=20, p=2, radius=1.0)

In [12]:
# Search the first 20 neighbors
neighbor_search = knn.kneighbors(test_feature, return_distance=True)
distance, neighbor_index = neighbor_search

# Save the results
for out_path, out_array in (
    ('./result/knn_naive_distance_normalized.npy', distance),
    ('./result/knn_naive_neighbor_index_normalized.npy', neighbor_index),
):
    np.save(out_path, out_array)

## Search Neighbors

In [12]:
knn_distance = np.load('./result/knn_naive_distance_normalized.npy')
knn_neighbor = np.load('./result/knn_naive_neighbor_index_normalized.npy')

# Get the first 20 neighbors
# The saved neighbor indices are row positions in the fitted training matrix,
# so index the label array by position. A single fancy-indexing step maps the
# whole (n_test, n_neighbors) index matrix to labels without a Python loop,
# and it does not depend on the DataFrame's index labels.
train_labels = train_df['landmark_id'].values
predictions = train_labels[knn_neighbor]

## Calculate Accuracy

### Mode accuracy

In [17]:
# Most frequent label among each test image's neighbors.
prediction_mode = mode(predictions, axis=1)
# Depending on the SciPy version, the mode array is either (n, 1) or (n,).
# `accuracy` slices its second argument as a 2-D array, so force a column.
mode_label = np.asarray(prediction_mode[0]).reshape(-1, 1)
print('Mode accuracy:\t', accuracy(test_df['landmark_id'].values, mode_label, top=1))

Mode accuracy:	 0.32854950807834643


### Top k accuracy

In [14]:
# Report the top-k accuracy of the normalized-feature KNN predictions.
true_labels = test_df['landmark_id'].values
for label, k in (
    ('Top  1 accuracy:\t', 1),
    ('Top  5 accuracy:\t', 5),
    ('Top 10 accuracy:\t', 10),
    ('Top 20 accuracy:\t', 20),
):
    print(label, accuracy(true_labels, predictions, top=k))

Top  1 accuracy:	 0.3998104522068779
Top  5 accuracy:	 0.5481992959653398
Top 10 accuracy:	 0.6065529379907934
Top 20 accuracy:	 0.6650419712970485
