In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os, time, gc
import requests, shutil
from PIL import Image

%matplotlib inline

# Transform JSON into DataFrame

In [2]:
# Transform the raw JSON dumps into flat CSV tables (one row per image)
def json_to_csv(json_path, csv_path, labeled=True):
    """Flatten one raw JSON split into a CSV file and return the flat table.

    The raw file is read with ``pd.read_json`` and is expected to yield an
    ``images`` column (dicts with a ``url`` list and, for unlabeled splits,
    an ``image_id``) plus, for labeled splits, an ``annotations`` column
    (dicts with ``image_id`` and ``label_id``).

    Parameters
    ----------
    json_path : str
        Path of the raw JSON file to read.
    csv_path : str
        Path of the CSV file to write (written without the index).
    labeled : bool, optional
        When True, ``id`` and ``label`` are taken from ``annotations`` and the
        output columns are ``id, label, url``. When False, ``id`` is taken
        from ``images`` and the output columns are ``id, url``.

    Returns
    -------
    pandas.DataFrame
        The flattened table that was written to ``csv_path``.
    """
    raw = pd.read_json(json_path)

    if labeled:
        # NOTE(review): assumes the i-th annotation and the i-th image entry
        # describe the same picture -- confirm against the raw JSON.
        columns = {
            'id': raw['annotations'].map(lambda x: x['image_id']),
            'label': raw['annotations'].map(lambda x: x['label_id']),
        }
    else:
        columns = {'id': raw['images'].map(lambda x: x['image_id'])}

    # Only the first URL of each image entry is kept
    columns['url'] = raw['images'].map(lambda x: x['url'][0])

    flat = pd.DataFrame(columns)
    flat.to_csv(csv_path, index=False, columns=list(columns))
    return flat


# Flip to True to regenerate the CSV files from the raw JSON
transform = False
if transform:
    # Training set
    train_df = json_to_csv('./data/raw_data/train.json', './data/train.csv')

    # Validation set
    val_df = json_to_csv('./data/raw_data/validation.json', './data/validation.csv')

    # Test set (no annotations, so no label column)
    test_df = json_to_csv('./data/raw_data/test.json', './data/test.csv', labeled=False)

# Exploration

In [3]:
# Load the three flat CSV splits
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/validation.csv')
test_df = pd.read_csv('./data/test.csv')

# Report each split's shape; labeled splits also show their distinct-label count
for title, frame in (('Train:\t\t', train_df), ('Validation:\t', val_df)):
    n_labels = len(frame['label'].unique())
    print(title, frame.shape, '\t', n_labels)
print('Test:\t\t', test_df.shape)

Train:		 (194828, 3) 	 128
Validation:	 (6400, 3) 	 128
Test:		 (12800, 2)


In [4]:
# Peek at the first five training rows to sanity-check the columns
train_df.head(n=5)

Unnamed: 0,id,label,url
0,1,5,https://img13.360buyimg.com/imgzone/jfs/t2857/...
1,2,5,http://www.tengdakeli.cn/350/timg01/uploaded/i...
2,3,5,https://img13.360buyimg.com/imgzone/jfs/t8899/...
3,4,5,http://img4.tbcdn.cn/tfscom/i1/2855447419/TB2S...
4,5,5,http://a.vpimg4.com/upload/merchandise/287883/...


# Download Images

### Training Set

In [5]:
# Fetch a single image from the given URL; adapted from a Kaggle kernel
# https://www.kaggle.com/abinesh100/easy-download-images-in-25-lines-py3/code
def fetch_image(url, out_path='./data/image.jpg', timeout=30):
    """Download the resource at ``url`` and write its bytes to ``out_path``.

    Parameters
    ----------
    url : str
        Address of the image to download.
    out_path : str, optional
        File the body is written to. Defaults to the scratch file
        ``./data/image.jpg``.
    timeout : float or tuple, optional
        Seconds to wait for the server, passed through to ``requests.get``,
        so a single unresponsive host cannot stall a bulk download forever.

    Raises
    ------
    requests.exceptions.RequestException
        On connection failures, timeouts, or a 4xx/5xx status (the latter via
        ``raise_for_status``), so an HTTP error page is never saved as an image.
    OSError
        If ``out_path`` cannot be opened for writing.
    """
    response = requests.get(url, stream=True, timeout=timeout)
    try:
        # Fail before touching the output file when the server reports an error
        response.raise_for_status()

        with open(out_path, 'wb') as out_file:
            # iter_content streams the body in chunks and applies any
            # content-encoding (gzip/deflate) decoding, unlike reading
            # response.raw directly.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                out_file.write(chunk)
    finally:
        # Always release the connection, even if the write fails
        response.close()

In [6]:
# # Download images to ./train_images/
# ids = train_df['id'].values
# urls = train_df['url'].values
# errors = []
# t0 = time.time()

# # Loop through urls to download images
# for idx in range(len(ids)):
#     tmp_id = ids[idx]
#     tmp_url = urls[idx]

#     # Helpful information
#     if idx % 2000 == 0:
#         _ = gc.collect()   # cleanup
#         t = time.time() - t0
#         print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
#         t0 = time.time()
#     if idx % 25 == 0:
#         print('=', end='')
    
#     # Check if already downloaded
#     if os.path.exists('./data/train_images/' + str(tmp_id) + '.jpg'):
#         continue
        
#     try:
#         # Get image from url
#         fetch_image(tmp_url)
#         os.rename('./data/image.jpg', './data/train_images/'+ str(tmp_id) + '.jpg')
#     except:
#         errors.append(idx)
    
# # Print error information
# if len(errors) > 0:
#     print('\nErrors:')
#     for j in errors:
#         print(j, '\t', ids[j], urls[j])

### Validation Set

In [None]:
# Download images to ./validation_images/
ids = val_df['id'].values
urls = val_df['url'].values
errors = []          # positions (into ids/urls) of downloads that failed
error_reasons = {}   # position -> repr of the exception that was raised
t0 = time.time()

# Without the destination directory every os.rename below would fail and be
# recorded as a download error, so create it up front.
os.makedirs('./data/validation_images/', exist_ok=True)

# Loop through urls to download images
for idx, (tmp_id, tmp_url) in enumerate(zip(ids, urls)):
    # Helpful information: elapsed time per 1000 images, one '=' per 20 images
    if idx % 1000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if idx % 20 == 0:
        print('=', end='')

    # Check if already downloaded (makes the cell safe to re-run / resume)
    target = './data/validation_images/' + str(tmp_id) + '.jpg'
    if os.path.exists(target):
        continue

    try:
        # Get image from url into the scratch file, then move it into place
        fetch_image(tmp_url)
        os.rename('./data/image.jpg', target)
    except Exception as exc:
        # Catch Exception rather than everything, so Ctrl-C (KeyboardInterrupt)
        # still stops the loop; keep the reason for the report below.
        errors.append(idx)
        error_reasons[idx] = repr(exc)

# Print error information
if len(errors) > 0:
    print('\nErrors:')
    for j in errors:
        print(j, '\t', ids[j], urls[j], '\t', error_reasons[j])


Process:         0    Used time: 0.0 s
Process:      1000    Used time: 299.0 s

### Test Set

In [None]:
# # Download images to ./test_images/
# ids = test_df['id'].values
# urls = test_df['url'].values
# errors = []
# t0 = time.time()

# # Loop through urls to download images
# for idx in range(len(ids)):
#     tmp_id = ids[idx]
#     tmp_url = urls[idx]

#     # Helpful information
#     if idx % 2000 == 0:
#         _ = gc.collect()   # cleanup
#         t = time.time() - t0
#         print('\nProcess: {:9d}'.format(idx), '   Used time: {} s'.format(np.round(t, 0)))
#         t0 = time.time()
#     if idx % 25 == 0:
#         print('=', end='')
    
#     # Check if already downloaded
#     if os.path.exists('./data/test_images/' + str(tmp_id) + '.jpg'):
#         continue
        
#     try:
#         # Get image from url
#         fetch_image(tmp_url)
#         os.rename('./data/image.jpg', './data/test_images/'+ str(tmp_id) + '.jpg')
#     except:
#         errors.append(idx)
    
# # Print error information
# if len(errors) > 0:
#     print('\nErrors:')
#     for j in errors:
#         print(j, '\t', ids[j], urls[j])

# Check Image Sizes and Validity

In [None]:
# sizes = []
# errors = []
# ids = train_df['id'].values

# for i in range(len(train_df)):
#     tmp_id = ids[i]
    
#     # Helpful information
#     if i % 10000 == 0:
#         _ = gc.collect()   # cleanup
#         t = time.time() - t0
#         print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
#         t0 = time.time()
#     if i % 125 == 0:
#         print('=', end='')
        
#     # Try to resize the valid images
#     try:
#         img = Image.open('./data/train_images/' + str(tmp_id) + '.jpg')
#         sizes.append(img.size)
#         _ = img.resize((224, 224))
#     except:
#         errors.append(i)
        
# # Print error information
# if len(errors) > 0:
#     print('\nErrors:')
#     for j in errors:
#         print(j, '\t', ids[j], urls[j])

# Re-organize Images According to Labels

In [None]:
# # Check whether or not the destination directories exist
# if not os.path.exists('./data/train'):
#     os.mkdir('./data/train')
# for i in range(1, 129):
#     if not os.path.exists('./data/train/' + str(i)):
#         os.mkdir('./data/train/' + str(i))
        
# if not os.path.exists('./data/validation'):
#     os.mkdir('./data/validation')
# for i in range(1, 129):
#     if not os.path.exists('./data/validation/' + str(i)):
#         os.mkdir('./data/validation/' + str(i))
        
# if not os.path.exists('./data/test'):
#     os.mkdir('./data/test')

In [None]:
# Put training images into sub-directories according to class label


In [None]:
# Put validation images into sub-directories according to class label


In [None]:
# Put test images into test directory
