In [1]:
%matplotlib inline 
import time

import numpy as np # linear algebra 
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import cv2
from multiprocessing.pool import Pool, ThreadPool

from tqdm import tqdm

from data_utils import grab, sample

import matplotlib.pyplot as plt

In [2]:
# Folder that holds the Landmark Retrieval CSV files.
# Kept in one place so the notebook can be pointed at another location by editing a single line.
DATA_DIR = '/home/data/LandmarkRetrieval'

# index.csv -> images to retrieve from, test.csv -> query images,
# sample_submission.csv -> example of the expected submission layout.
train_data = pd.read_csv(DATA_DIR + '/index.csv')
test_data = pd.read_csv(DATA_DIR + '/test.csv')
submission = pd.read_csv(DATA_DIR + '/sample_submission.csv')

In [3]:
# Report (rows, columns) for both tables.
for label, frame in (("Training data size", train_data),
                     ("test data size", test_data)):
    print(label, frame.shape)

Training data size (1098461, 2)
test data size (117703, 2)


In [4]:
# Preview the first rows of the training index (head() shows 5 rows by default).
train_data.head()

Unnamed: 0,id,url
0,b09ea096f4daa42e,https://lh5.googleusercontent.com/-wBt6sklzbGs...
1,6648383c7b3a438c,https://lh3.googleusercontent.com/-OADWsOZq83E...
2,d485d9f770e40453,https://lh5.googleusercontent.com/-vtZFUTuFWGM...
3,e7cdfba9ec7e9dd5,https://lh4.googleusercontent.com/-Skq6Q-VUALk...
4,44806beb654e6410,https://lh4.googleusercontent.com/-6eIkjkr3j2w...


In [5]:
# Preview the first rows of the test (query) table (head() shows 5 rows by default).
test_data.head()

Unnamed: 0,id,url
0,000088da12d664db,https://lh3.googleusercontent.com/-k45wfamuhT8...
1,0001623c6d808702,https://lh3.googleusercontent.com/-OQ0ywv8KVIA...
2,0001bbb682d45002,https://lh3.googleusercontent.com/-kloLenz1xZk...
3,0002362830cfe3a3,https://lh3.googleusercontent.com/-N6z79jNZYTg...
4,000270c9100de789,https://lh3.googleusercontent.com/-keriHaVOq1U...


In [6]:
# Preview the sample submission: one row per query id with a space-separated list of image ids.
submission.head()

Unnamed: 0,id,images
0,000088da12d664db,0370c4c856f096e8 766677ab964f4311 e3ae4dcee813...
1,0001623c6d808702,b01175326ee19742 67eb4fc9ff184bd2 0f775e72c031...
2,0001bbb682d45002,5101d06f891261be ae9c548dcf7102e2 a42098eb2dd3...
3,0002362830cfe3a3,40cb31b754bb7249 f860d7fbb0073fab 492c16c0584d...
4,000270c9100de789,fe3a62d9b50ca221 5005114ed61af1bc 754e137d888d...


In [7]:
# Inspect one training record: print its id and its image URL (nothing is downloaded here).
temp = 4444  # arbitrary row label picked for the spot check
record = train_data.loc[temp]
print('id', record['id'])
print('url:', record['url'])

id 731bf321dc92bdfc
url: https://lh4.googleusercontent.com/-gQEUjiYZrAA/UhcFIFIAJXI/AAAAAAAAE_o/yZ_s7e_zWvg/s1600/


In [8]:
# Missing-value summary for the training data, one row per column.
# 'Total' is the number of nulls; 'Percent' is the null fraction in [0, 1] (not multiplied by 100).
train_nulls = train_data.isnull()
total = train_nulls.sum().sort_values(ascending=False)
percent = (train_nulls.sum() / train_nulls.count()).sort_values(ascending=False)
missing_train_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_train_data.head()

Unnamed: 0,Total,Percent
url,0,0.0
id,0,0.0


In [9]:
# Missing-value summary for the test data, one row per column.
# 'Total' is the number of nulls; 'Percent' is the null fraction in [0, 1] (not multiplied by 100).
test_nulls = test_data.isnull()
total = test_nulls.sum().sort_values(ascending=False)
percent = (test_nulls.sum() / test_nulls.count()).sort_values(ascending=False)
missing_test_data = pd.concat(
    [total, percent],
    axis=1,
    keys=['Total', 'Percent'],
)
missing_test_data.head()

Unnamed: 0,Total,Percent
url,0,0.0
id,0,0.0


In [10]:
# train_sample = train_data.sample(100)
# train_imgs = []

# pb = tqdm(total = 100)
# for row in tqdm(train_sample.iterrows()):
#     train_imgs += [grab(row[1]['url'])]
#     pb.update(1)

# pb.close()

def sample(i):
    """Download one randomly chosen training image, retrying until one succeeds.

    A random row of ``train_data`` is drawn and its ``url`` is passed to
    ``grab``. If ``grab`` raises or returns ``None``, a new row is drawn and
    the download is attempted again. There is no retry limit.

    NOTE: this definition rebinds the name ``sample`` that is also imported
    from ``data_utils`` at the top of the notebook.

    Parameters
    ----------
    i : int
        Ignored. Present only so the function can be mapped over ``range(n)``
        by a worker pool.

    Returns
    -------
    The first non-``None`` value returned by ``grab``.

    Raises
    ------
    ValueError
        Propagated from ``DataFrame.sample`` when ``train_data`` is empty
        (sampling is outside the retry guard so this cannot spin forever).
    """
    while True:
        # Drawn outside the try block: a failure here is a data problem, not a
        # flaky download, and retrying would never fix it.
        url = train_data.sample(1)['url'].iloc[0]
        try:
            img = grab(url)
        except Exception:
            # Best-effort download: any ordinary failure (dead link, decode
            # error, ...) just triggers another random draw. Catching only
            # Exception lets KeyboardInterrupt / SystemExit stop the loop.
            continue
        if img is not None:
            return img

# How many random training images to download.
max_samples = 500
# 20 worker threads; the pool is left open because the ORB extraction cell reuses `pool`.
pool = ThreadPool(20)
# imap_unordered yields images as they finish, so the order of train_imgs is arbitrary.
train_imgs = list(
    tqdm(
        pool.imap_unordered(sample, range(max_samples)),
        total=max_samples,
    )
)


100%|██████████| 500/500 [02:14<00:00,  3.72it/s]


In [11]:
# Create the ORB detector (default settings) used by every getOrb call.
orb = cv2.ORB_create()


# Usage pattern: kp, des = orb.detectAndCompute(img, None)
def getOrb(img):
    """Run ORB on ``img`` and return the result of ``detectAndCompute``.

    No mask is applied (second argument is ``None``). The result is the
    ``(keypoints, descriptors)`` pair that the matching cell unpacks.
    """
    keypoints_and_descriptors = orb.detectAndCompute(img, None)
    return keypoints_and_descriptors


print('Start Orb')
t0 = time.time()
# pool.map returns the results in the same order as train_imgs.
features = list(pool.map(getOrb, train_imgs))
print("Orb time: ", time.time() - t0)

Start Orb
Orb time:  24.05862259864807


In [12]:
max_vis = 100  # number of query images (taken from the front of `features`) to find a match for

# Single-threaded alternative for building `features`:
# features = [orb.detectAndCompute(img, None) for img in train_imgs]

# FLANN LSH index settings; the trailing comments keep the alternative values noted by the author.
FLANN_INDEX_LSH = 6
index_params = dict(algorithm=FLANN_INDEX_LSH,
                    table_number=6,        # 12
                    key_size=12,           # 20
                    multi_probe_level=1)   # 2
search_params = dict(checks=50)   # or pass empty dictionary

flann = cv2.FlannBasedMatcher(index_params, search_params)

# permutation[i]: index into `features` of the best match for query i (left at 0 when none qualifies).
# score[i]: score of that best match; 0 means "no match found" and is what downstream cells test for.
permutation = np.zeros(max_vis, dtype=np.int32)
score = np.zeros(max_vis, dtype=np.float32)
for i, (_, ds1) in tqdm(enumerate(features[:max_vis]), total=max_vis):
    if ds1 is None:
        # No descriptors for this query image (e.g. ORB found no keypoints): nothing to match.
        continue
    for j, (_, ds2) in enumerate(features):
        if j == i or ds2 is None:
            continue  # skip self-comparison and candidates without descriptors
        # Match the query against every other image in the sample.
        try:
            matches = flann.knnMatch(ds1, ds2, k=2)
        except cv2.error:
            # Best-effort: OpenCV can reject some descriptor sets; skip the pair.
            # Only cv2.error is caught so real bugs and Ctrl-C are not silently swallowed.
            continue
        # Keep only the good matches and fold them into one score for the image pair.
        score_img = 0
        for pair in matches:
            if len(pair) != 2:
                continue  # fewer than two neighbours returned: ratio test impossible
            best, second = pair
            if best.distance < 0.7 * second.distance:
                # score is the sum of reciprocals of the distances of all accepted matches
                score_img += 1.0 / (best.distance + 1e-7)

        # Keep the highest-scoring candidate, provided it clears the 0.4 acceptance floor.
        if score_img > score[i] and score_img > 0.4:
            score[i] = score_img
            permutation[i] = j

100%|██████████| 100/100 [02:45<00:00,  1.66s/it]


In [None]:
# For each query image i (< max_vis): index into `features` of its best-scoring match; stays 0 if none qualified.
print(permutation)

In [None]:
# Best match score per query image; 0 means no candidate passed the acceptance threshold.
print(score)

In [None]:
# Show each query image that found a match side by side with its best-matching image.
# A for loop guarantees the index always advances, so the cell terminates after max_vis queries.
for i in range(max_vis):
    if score[i] == 0:
        continue  # no match was recorded for this query image
    plt.figure(figsize = (10, 5))
    # Left panel: the query image, titled with its match score.
    plt.subplot(1, 2, 1)
    plt.title(score[i])
    plt.imshow(train_imgs[i])
    # Right panel: the best match. The grid must be 1 row x 2 columns to sit beside the query.
    plt.subplot(1, 2, 2)
    plt.imshow(train_imgs[int(permutation[i])])