In [79]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
from scipy.stats import mode

%matplotlib inline

# KNN with Naive Inception-V3 Features

In [46]:
# Precomputed KNN search results (loaded as NumPy arrays).
# Presumably one row per test image, holding the distances to and the
# train-row indices of its retrieved neighbors -- TODO confirm against the
# script that produced these files.
knn_neighbor = np.load('./result/knn_naive_neighbor_index.npy')
knn_distance = np.load('./result/knn_naive_distance.npy')

# Submission template (only the `id` column is read) and the image tables.
# NOTE(review): the image tables are read relative to the parent directory
# ('../data/...') while every other path is relative to the current directory
# ('./data/...', './result/...') -- confirm that both roots are intended.
sample_submission = pd.read_csv('./data/all/sample_submission.csv', usecols=['id'])
test_df = pd.read_csv('../data/resized/test_resized.csv')
train_df = pd.read_csv('../data/resized/train_resized.csv')

In [47]:
# Report the shape of every loaded table/array, one labelled line each.
# The tab padding inside the labels keeps the shapes aligned in one column.
print('\n'.join(
    '{} {}'.format(label, loaded.shape)
    for label, loaded in (
        ('Train:\t\t\t', train_df),
        ('Test:\t\t\t', test_df),
        ('Sample Submission:\t', sample_submission),
        ('KNN Distance:\t\t', knn_distance),
        ('KNN Neighbor:\t\t', knn_neighbor),
    )
))

Train:			 (1223295, 3)
Test:			 (117224, 2)
Sample Submission:	 (117703, 1)
KNN Distance:		 (117224, 100)
KNN Neighbor:		 (117224, 100)


In [48]:
# Preview the first five rows of the training table.
train_df.head(n=5)

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231


In [49]:
# Preview the first five rows of the test table.
test_df.head(n=5)

Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...
2,0001bbb682d45002,https://lh3.googleusercontent.com/-kloLenz1xZk...
3,0002362830cfe3a3,https://lh3.googleusercontent.com/-N6z79jNZYTg...
4,000270c9100de789,https://lh3.googleusercontent.com/-keriHaVOq1U...


In [50]:
# Preview the first five ids of the submission template.
sample_submission.head(n=5)

Unnamed: 0,id
0,000088da12d664db
1,0001623c6d808702
2,0001bbb682d45002
3,0002362830cfe3a3
4,000270c9100de789


### Use the first neighbor

In [37]:
# Predict each query image's landmark as the label of its first retrieved
# neighbor (column 0 of `knn_neighbor`).
#
# The lookup is done positionally and in a single vectorized step rather than
# with a Python loop over `train_df.loc[...]`, which builds a whole row Series
# per query. This assumes the neighbor indices are 0-based row positions in
# `train_df` (equivalent to the label lookup while `train_df` keeps the default
# RangeIndex from `read_csv`) -- TODO confirm against the KNN search script.
train_landmark_ids = train_df['landmark_id'].values
first_neighbor = knn_neighbor[:, 0]

# NumPy would silently wrap negative indices around to the end of the array,
# so reject them explicitly; too-large indices already raise IndexError below.
if (first_neighbor < 0).any():
    raise IndexError('knn_neighbor contains negative neighbor indices')

prediction = train_landmark_ids[first_neighbor]

# One "<landmark_id> <confidence>" string per query; confidence is fixed at 1.0.
prediction_tuple = [str(idx) + ' ' + '1.0' for idx in prediction]

In [63]:
# Write the first-neighbor submission file.
# Each test id is paired with its prediction string, then left-joined onto the
# submission template so the output follows the template's ids; template ids
# that have no prediction are kept with a missing `landmarks` value.
submission = sample_submission.merge(
    pd.DataFrame({'id': test_df['id'].values, 'landmarks': prediction_tuple}),
    how='left',
    on='id',
)
submission.to_csv(
    './result/knn_naive_first_neighbor.csv',
    index=False,
    columns=['id', 'landmarks'],
)

### Use more neighbors (majority vote over all retrieved neighbors)

In [82]:
# Look up the landmark label of every retrieved neighbor of every query image.
# `predictions` has the same shape as `knn_neighbor`: one row per query, one
# column per neighbor.
#
# A single fancy-indexing step replaces the per-query `train_df.loc[...]` loop.
# This assumes the neighbor indices are 0-based row positions in `train_df`
# (equivalent to the label lookup while `train_df` keeps the default RangeIndex
# from `read_csv`) -- TODO confirm against the KNN search script.
train_landmark_ids = train_df['landmark_id'].values

# NumPy would silently wrap negative indices around to the end of the array,
# so reject them explicitly; too-large indices already raise IndexError below.
if (knn_neighbor < 0).any():
    raise IndexError('knn_neighbor contains negative neighbor indices')

predictions = train_landmark_ids[knn_neighbor]

In [102]:
# Majority vote: for each query image take the most frequent landmark label
# among its neighbors (row-wise mode of `predictions`).
# NOTE(review): ties are resolved by SciPy's `mode` (by value), not by
# neighbor rank or distance -- confirm this is acceptable.
prediction_mode = mode(predictions, axis=1)

# The modal values come back with shape (n, 1) on older SciPy but (n,) on
# SciPy >= 1.11, where the reduced axis is no longer kept by default and
# `[:, 0]` raises IndexError. Flattening works for both layouts.
prediction = np.ravel(prediction_mode[0])

# One "<landmark_id> <confidence>" string per query; confidence is fixed at 1.0.
prediction_tuple = [str(idx) + ' ' + '1.0' for idx in prediction]

In [103]:
# Write the majority-vote submission file.
# Each test id is paired with its prediction string, then left-joined onto the
# submission template so the output follows the template's ids; template ids
# that have no prediction are kept with a missing `landmarks` value.
submission = sample_submission.merge(
    pd.DataFrame({'id': test_df['id'].values, 'landmarks': prediction_tuple}),
    how='left',
    on='id',
)
submission.to_csv(
    './result/knn_naive_mode_neighbor.csv',
    index=False,
    columns=['id', 'landmarks'],
)