In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os, time, gc
import requests, shutil, random
from skimage import io
from skimage.transform import resize
from PIL import Image

%matplotlib inline

# Load the data set

In [2]:
# Read the training and test metadata tables from disk
train = pd.read_csv('./data/all/train.csv')
test = pd.read_csv('./data/all/test.csv')

# Report the size of each table
for label, frame in (('Train:\t\t', train), ('Test:\t\t', test)):
    print(label, frame.shape)

# Number of distinct landmark classes present in the training table
n_landmarks = train['landmark_id'].unique().size
print('Landmarks:\t', n_landmarks)

Train:		 (1225029, 3)
Test:		 (117703, 2)
Landmarks:	 14951


In [3]:
# Preview the first rows of the training table (columns: id, url, landmark_id)
train.head()

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231


# Get Index for train, validation, and test set

For each landmark the images are shuffled and at most 15 valid images are kept; they are then split according to the number of valid images:

* images >= 15: use the first 10 as training images, the next 2 as test images, and the next 3 as validation images.
* 12 < images < 15: use the first 10 as training images, the next 2 as test images, and the remaining 1-2 as validation images.
* 10 < images <= 12: use the first 10 as training images and the remaining 1-2 as test images.
* 2 < images <= 10: use the last image as the test image and the rest as training images.
* images <= 2: use all images as training images.

In [4]:
def valid(path):
    """Return whether the image at `path` is usable for the dataset.

    An image is accepted only if PIL can open it, it is stored as a JPEG,
    it is at least 256x256 pixels, and it can be resized to 256x256 without
    raising (presumably to catch truncated or corrupt files -- the resized
    image itself is discarded).

    Parameters
    ----------
    path : str
        Location of the image file on disk.

    Returns
    -------
    bool
        True if the image passes every check; False if it is missing,
        unreadable, too small, or not a JPEG.
    """
    try:
        # The context manager closes the underlying file handle even on the
        # early return, so validating many files does not leak descriptors.
        with Image.open(path) as img:
            if img.width < 256 or img.height < 256 or img.format != 'JPEG':
                return False
            img.resize((256, 256))
    except Exception:
        # Best-effort check: any failure to read or decode means "not valid".
        # Unlike a bare `except:`, this lets KeyboardInterrupt / SystemExit
        # propagate so a long validation loop can still be interrupted.
        return False

    return True

In [5]:
# Choose the unique ids
unique_ids = sorted(train['landmark_id'].unique())
len(unique_ids)

14951

In [9]:
# Fix the random state so the per-landmark shuffle (and therefore the split)
# is reproducible.
np.random.seed(42)
random.seed(29)

# Split sizes per landmark: at most MAX_VALID_PER_LANDMARK valid images are
# collected; the first N_TRAIN go to training, the next N_TEST to test and
# whatever is left (at most 3) to validation.
MAX_VALID_PER_LANDMARK = 15
N_TRAIN = 10
N_TEST = 2

# Row labels of `train` assigned to each subset
train_idx = []
val_idx = []
test_idx = []

# Group the row labels by landmark once, instead of building a boolean mask
# over the whole frame for every single landmark.
rows_by_landmark = train.groupby('landmark_id').groups

for landmark_id in unique_ids:
    # Progress information
    if landmark_id % 3000 == 0:
        print('\nProcess: {:8d}'.format(landmark_id))
    if landmark_id % 40 == 0:
        print('=', end='')

    # Row labels belonging to this landmark, in random order
    index = list(rows_by_landmark[landmark_id])
    np.random.shuffle(index)

    # Collect valid images, stopping as soon as enough have been found so
    # that large landmarks do not require opening every file.
    valid_idx = []
    for idx in index:
        path = './data/all/train_images/' + str(idx) + '.jpg'
        if valid(path):
            valid_idx.append(idx)
            if len(valid_idx) >= MAX_VALID_PER_LANDMARK:
                break

    # Split according to the rules above. `extend` grows the lists in place;
    # `lst = lst + chunk` would copy the whole accumulated list each time.
    n_valid = len(valid_idx)
    if n_valid > N_TRAIN:
        # 11..15 valid images: slices past the end are simply empty, so one
        # branch covers the ">= 15", "13-14" and "11-12" cases.
        train_idx.extend(valid_idx[:N_TRAIN])
        test_idx.extend(valid_idx[N_TRAIN:N_TRAIN + N_TEST])
        val_idx.extend(valid_idx[N_TRAIN + N_TEST:])
    elif n_valid > 2:
        # 3..10 valid images: hold out the last one for testing
        train_idx.extend(valid_idx[:-1])
        test_idx.append(valid_idx[-1])
    else:
        # 0..2 valid images: too few to hold any out
        train_idx.extend(valid_idx)


Process:        0
Process:     3000
Process:     6000
Process:     9000
Process:    12000

# Train, Validation, and Test Set Split

In [19]:
# Get image information
ids = train['id'].values
urls = train['url'].values
landmark_ids = train['landmark_id'].values

In [21]:
# Split training set
train_image_id = []
train_id = []
train_url = []
train_landmark_id = []

for idx in train_idx:
    from_path = './data/all/train_images/' + str(idx) + '.jpg'
    to_path = './data/triplet/train/' + str(idx) + '.jpg'
    img = io.imread(from_path)
    resized = np.array(resize(img, (256, 256, 3), mode='reflect') * 255, dtype=np.uint8)
    io.imsave(to_path, resized)
    train_image_id.append(idx)
    train_id.append(ids[idx])
    train_url.append(urls[idx])
    train_landmark_id.append(landmark_ids[idx])

In [22]:
# Save to disk   
train_df = pd.DataFrame({'image_id': train_image_id, 'id': train_id, 
                         'url': train_url, 'landmark_id': train_landmark_id})
train_df.to_csv('./data/triplet/train.csv', index=False, 
                columns=['image_id', 'id', 'url', 'landmark_id'])

In [25]:
# Split validation set
val_image_id = []
val_id = []
val_url = []
val_landmark_id = []

for idx in val_idx:
    from_path = './data/all/train_images/' + str(idx) + '.jpg'
    to_path = './data/triplet/validation/' + str(idx) + '.jpg'
    img = io.imread(from_path)
    resized = np.array(resize(img, (256, 256, 3), mode='reflect') * 255, dtype=np.uint8)
    io.imsave(to_path, resized)
    val_image_id.append(idx)
    val_id.append(ids[idx])
    val_url.append(urls[idx])
    val_landmark_id.append(landmark_ids[idx])

In [26]:
# Save to disk   
val_df = pd.DataFrame({'image_id': val_image_id, 'id': val_id, 
                       'url': val_url, 'landmark_id': val_landmark_id})
val_df.to_csv('./data/triplet/validation.csv', index=False, 
              columns=['image_id', 'id', 'url', 'landmark_id'])

In [35]:
# Split test set
test_image_id = []
test_id = []
test_url = []
test_landmark_id = []

for idx in test_idx:
    from_path = './data/all/train_images/' + str(idx) + '.jpg'
    to_path = './data/triplet/test/' + str(idx) + '.jpg'
    img = io.imread(from_path)
    resized = np.array(resize(img, (256, 256, 3), mode='reflect') * 255, dtype=np.uint8)
    io.imsave(to_path, resized)
    test_image_id.append(idx)
    test_id.append(ids[idx])
    test_url.append(urls[idx])
    test_landmark_id.append(landmark_ids[idx])

In [36]:
# Save to disk   
test_df = pd.DataFrame({'image_id': test_image_id, 'id': test_id, 
                        'url': test_url, 'landmark_id': test_landmark_id})
test_df.to_csv('./data/triplet/test.csv', index=False, 
               columns=['image_id', 'id', 'url', 'landmark_id'])

In [37]:
# Summarise the size of each saved split
for label, frame in (
    ('Train:\t\t', train_df),
    ('Validation:\t', val_df),
    ('Test:\t\t', test_df),
):
    print(label, frame.shape)

Train:		 (113783, 4)
Validation:	 (22255, 4)
Test:		 (22391, 4)
