In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, requests, shutil, os
from urllib import request
from skimage import io
from skimage.transform import resize
import time, gc

%matplotlib inline

  (fname, cnt))
  (fname, cnt))


# Load the data set

* There are 1,225,029 training images and 117,703 test images in total.
* There are 14,951 unique landmarks in total.

In [2]:
# Read the train / test index files that ship with the competition data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Report the (rows, columns) shape of each table
for label, frame in (('Train:\t\t', train), ('Test:\t\t', test)):
    print(label, frame.shape)

# Number of distinct landmark ids present in the training table
print('Landmarks:\t', len(pd.unique(train['landmark_id'])))

Train:		 (1225029, 3)
Test:		 (117703, 2)
Landmarks:	 14951


In [3]:
# Preview the first five rows of the training table
train.iloc[:5]

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231


# Download Images

In [10]:
# Get a single image from the given url, modified from Kaggle kernel
# https://www.kaggle.com/abinesh100/easy-download-images-in-25-lines-py3/code
def fetch_image(url, path='./data/image.jpg', timeout=30):
    """Download the image at `url` and write its bytes to `path`.

    Parameters
    ----------
    url : str
        Address of the image to download.
    path : str, optional
        Destination file. Defaults to the scratch file './data/image.jpg'.
    timeout : float or tuple, optional
        Seconds to wait for the server (passed to ``requests.get``), so a
        stalled connection cannot block the caller forever.

    Raises
    ------
    requests.exceptions.RequestException
        On connection problems, timeouts, or a 4xx/5xx HTTP status.
        `path` is not opened when the status check fails.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    try:
        # Refuse error pages instead of saving them as if they were images
        response.raise_for_status()
        # Ask the raw stream to undo gzip/deflate content encoding while reading
        response.raw.decode_content = True
        with open(path, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
    finally:
        # Always release the connection, even when the download fails
        response.close()

In [127]:
# Download images to ./data/train_images/
urls = train['url'].values
errors = []   # positions of urls that could not be downloaded
t0 = time.time()

# os.rename below fails for every image if the target folder is missing
os.makedirs('./data/train_images/', exist_ok=True)

# Loop through urls to download images
for idx, url in enumerate(urls):
    target = './data/train_images/' + str(idx) + '.jpg'

    # Check if already downloaded
    if os.path.exists(target):
        continue

    # Get image from url; one dead link must not abort the whole run.
    # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupt working.
    try:
        fetch_image(url)
        os.rename('./data/image.jpg', target)
    except Exception:
        errors.append(idx)
        continue

    # Helpful information (only reported for freshly downloaded images)
    if idx % 100000 == 0:
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if idx % 1250 == 0:
        print('=', end='')

if errors:
    print('Errors:\t', errors)

In [None]:
# Download images to ./data/train_images/
urls = train['url'].values
errors = []   # positions of urls that could not be downloaded
t0 = time.time()

# Without the folder every os.rename fails and each image lands in `errors`
os.makedirs('./data/train_images/', exist_ok=True)

# Loop through urls to download images
for idx, url in enumerate(urls):
    # Helpful information
    if idx % 100000 == 0:
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if idx % 1250 == 0:
        print('=', end='')

    # Check if already downloaded
    target = './data/train_images/' + str(idx) + '.jpg'
    if os.path.exists(target):
        continue

    try:
        # Get image from url
        fetch_image(url)
        os.rename('./data/image.jpg', target)
    except Exception:
        # Record the failure and move on. A bare `except:` would also swallow
        # KeyboardInterrupt, making this long loop impossible to stop.
        errors.append(idx)

if errors:
    print('Errors:\t', errors)

In [None]:
# Download images to ./data/test_images/
urls = test['url'].values
errors = []   # positions of urls that could not be downloaded
t0 = time.time()

# Without the folder every os.rename fails and each image lands in `errors`
os.makedirs('./data/test_images/', exist_ok=True)

# Loop through urls to download images
for idx, url in enumerate(urls):
    # Helpful information
    if idx % 10000 == 0:
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if idx % 125 == 0:
        print('=', end='')

    # Check if already downloaded
    target = './data/test_images/' + str(idx) + '.jpg'
    if os.path.exists(target):
        continue

    try:
        # Get image from url
        fetch_image(url)
        os.rename('./data/image.jpg', target)
    except Exception:
        # Record the failure and move on. A bare `except:` would also swallow
        # KeyboardInterrupt, making this long loop impossible to stop.
        errors.append(idx)

if errors:
    print('Errors:\t', errors)

# Image Resize

In [28]:
# Iterate over all downloaded training images, resize each to (256, 256, 3)
# and save it under a new consecutive index that skips unreadable files.
ids = train['id'].values
urls = train['url'].values
landmark_ids = train['landmark_id'].values

# Metadata of the images that were resized successfully (aligned with `idx`)
train_id = []
train_url = []
train_label = []
errors = []   # original positions whose image could not be read or saved

# Without the folder every io.imsave fails and each image lands in `errors`
os.makedirs('./data/train_resized/', exist_ok=True)

t0 = time.time()
idx = 0   # next output file number

for i in range(len(train)):
    # Helpful information
    if i % 100000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 1250 == 0:
        print('=', end='')

    # Try to resize the valid images
    try:
        img = io.imread('./data/train_images/' + str(i) + '.jpg')
        # resize() returns floats in [0, 1]; scale back to 8-bit pixel values
        resized = np.array(resize(img, (256, 256, 3)) * 255, dtype=np.uint8)
        io.imsave('./data/train_resized/' + str(idx) + '.jpg', resized)
        train_id.append(ids[i])
        train_url.append(urls[i])
        train_label.append(landmark_ids[i])
        idx += 1
    except Exception:
        # Missing / corrupt file: remember it and continue. `Exception` rather
        # than a bare except so the loop can still be interrupted.
        errors.append(i)

train_df = pd.DataFrame({'id': train_id, 'url': train_url, 'landmark_id': train_label},
                        columns=['id', 'url', 'landmark_id'])
if errors:
    print('Errors:\t', errors)


Process:         0    Used time: 0.0 s
=

  warn("The default mode, 'constant', will be changed to 'reflect' in "


In [41]:
# Iterate over all downloaded test images, resize each to (256, 256, 3)
# and save it under a new consecutive index that skips unreadable files.
ids = test['id'].values
urls = test['url'].values

# Metadata of the images that were resized successfully (aligned with `idx`)
test_id = []
test_url = []
errors = []   # original positions whose image could not be read or saved

# Without the folder every io.imsave fails and each image lands in `errors`
os.makedirs('./data/test_resized/', exist_ok=True)

t0 = time.time()
idx = 0   # next output file number

for i in range(len(test)):
    # Helpful information
    if i % 10000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 125 == 0:
        print('=', end='')

    # Try to resize the valid images
    try:
        img = io.imread('./data/test_images/' + str(i) + '.jpg')
        # resize() returns floats in [0, 1]; scale back to 8-bit pixel values
        resized = np.array(resize(img, (256, 256, 3)) * 255, dtype=np.uint8)
        io.imsave('./data/test_resized/' + str(idx) + '.jpg', resized)
        test_id.append(ids[i])
        test_url.append(urls[i])
        idx += 1
    except Exception:
        # Missing / corrupt file: remember it and continue. `Exception` rather
        # than a bare except so the loop can still be interrupted.
        errors.append(i)

test_df = pd.DataFrame({'id': test_id, 'url': test_url}, columns=['id', 'url'])
if errors:
    print('Errors:\t', errors)

Unnamed: 0,id,url,landmark_id
0,cacf8152e2d2ae60,http://static.panoramio.com/photos/original/70...,4676
1,0a58358a2afd3e4e,http://lh6.ggpht.com/-igpT6wu0mIA/ROV8HnUuABI/...,6651
2,6b2bb500b6a38aa0,http://lh6.ggpht.com/-vKr5G5MEusk/SR6r6SJi6mI/...,11284
3,b399f09dee9c3c67,https://lh3.googleusercontent.com/-LOW2cjAqubA...,8429
4,19ace29d77a5be66,https://lh5.googleusercontent.com/-tnmSXwQcWL8...,6231
5,2c9c54b62f0a6a37,https://lh5.googleusercontent.com/-mEaSECO7D-4...,10400
6,0aac70a1de44ced6,http://lh6.ggpht.com/-cJMh9AYQGk8/SOkF_Q5PrjI/...,9779
7,de770bc720f8e714,https://lh4.googleusercontent.com/-Q_FvRlwaaa8...,11288
8,dc9457d703e612ad,https://lh3.googleusercontent.com/-Px33Q-wekRI...,13170
9,3060f5f75d936abb,http://lh3.ggpht.com/-KXyELwqwp_Q/Ry-qmQAqwUI/...,6051


# Data Exploration

In [None]:
# Plot how the landmark ids are distributed over the training set
fig = plt.figure(figsize=(10, 6))
ax = fig.add_subplot(1, 1, 1)
sns.distplot(train['landmark_id'], ax=ax)
ax.set_xlabel('Landmark ID', fontsize=12)
plt.show()

In [None]:
# Count the images per landmark_id, most frequent landmark first
grouped = (
    train[['landmark_id', 'id']]
    .groupby('landmark_id')
    .count()
    .reset_index()
    .sort_values('id', ascending=False)
    .rename(columns={'id': 'count'})
    .reset_index(drop=True)
)

In [None]:
# Landmarks with the most images (first five rows of the sorted counts)
grouped.iloc[:5]

In [None]:
# Landmarks with the fewest images (last five rows of the sorted counts)
grouped.iloc[-5:]

In [9]:
# Fraction of landmarks that have fewer than 10 images
int((grouped['count'] < 10).sum()) / len(grouped)

0.41234700020065546