In [1]:
import warnings
warnings.simplefilter('ignore')

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import sys, os, time, gc
import requests, shutil
from PIL import Image

%matplotlib inline

# 1. Download Raw Images

Get single from the given url, modified from Kaggle kernel https://www.kaggle.com/syltruong/img-download-multi-proc-bar-resume-fail-logs

In [2]:
# Change to True if the images are not already downloaded
download = False

In [3]:
# Download Train Images
if download is True:
    !python downloader.py './data/raw_data/train.json' './data/raw_data/train_images'

In [4]:
# Download Validation Images
if download is True:
    !python downloader.py './data/raw_data/validation.json' './data/raw_data/validation_images'

In [5]:
# Download Test Images
if download is True:
    !python downloader.py './data/raw_data/test.json' './data/raw_data/test_images'

# 2. Transform Json into Dataframe

In [6]:
# Transform json to dataframe
transform = False
if transform is True:
    # Training set
    train_df = pd.read_json('./data/raw_data/train.json')
    train_df['id'] = train_df['annotations'].map(lambda x: x['image_id'])
    train_df['label'] = train_df['annotations'].map(lambda x: x['label_id'])
    train_df['url'] = train_df['images'].map(lambda x: x['url'][0])
    train_df = train_df.drop(columns=['annotations', 'images'])
    train_df.to_csv('./data/train.csv', index=False, columns=['id', 'label', 'url'])

    # Validation set
    val_df = pd.read_json('./data/raw_data/validation.json')
    val_df['id'] = val_df['annotations'].map(lambda x: x['image_id'])
    val_df['label'] = val_df['annotations'].map(lambda x: x['label_id'])
    val_df['url'] = val_df['images'].map(lambda x: x['url'][0])
    val_df = val_df.drop(columns=['annotations', 'images'])
    val_df.to_csv('./data/validation.csv', index=False, columns=['id', 'label', 'url'])

    # Test set
    test_df = pd.read_json('./data/raw_data/test.json')
    test_df['id'] = test_df['images'].map(lambda x: x['image_id'])
    test_df['url'] = test_df['images'].map(lambda x: x['url'][0])
    test_df = test_df.drop(columns=['images'])
    test_df.to_csv('./data/test.csv', index=False, columns=['id', 'url'])

# 3. Exploration

In [7]:
train_df = pd.read_csv('./data/train.csv')
val_df = pd.read_csv('./data/validation.csv')
test_df = pd.read_csv('./data/test.csv')

print('Train:\t\t', train_df.shape, '\t', len(train_df['label'].unique()))
print('Validation:\t', val_df.shape, '\t', len(val_df['label'].unique()))
print('Test:\t\t', test_df.shape)

Train:		 (194828, 3) 	 128
Validation:	 (6400, 3) 	 128
Test:		 (12800, 2)


In [8]:
!ls ./data/raw_data/train_images/ | wc -l
!ls ./data/raw_data/validation_images/ | wc -l
!ls ./data/raw_data/test_images/ | wc -l

191711
6297
12593


In [9]:
train_df.head()

Unnamed: 0,id,label,url
0,1,5,https://img13.360buyimg.com/imgzone/jfs/t2857/...
1,2,5,http://www.tengdakeli.cn/350/timg01/uploaded/i...
2,3,5,https://img13.360buyimg.com/imgzone/jfs/t8899/...
3,4,5,http://img4.tbcdn.cn/tfscom/i1/2855447419/TB2S...
4,5,5,http://a.vpimg4.com/upload/merchandise/287883/...


# 4. Check Image Sizes and Validity

In [10]:
# Check training set
sizes = []
errors = []
index = []
missing = []

ids = train_df['id'].values
urls = train_df['url'].values
t0 = time.time()

for i in range(len(train_df)):
    tmp_id = ids[i]
    
    # Helpful information
    if i % 20000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 250 == 0:
        print('=', end='')
        
    # Check if exists
    if not os.path.exists('./data/raw_data/train_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/train_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            sizes.append(img.size)
            index.append(tmp_id)
    except:
        errors.append(i)

# Helpful information
sizes = np.array(sizes)
print('\n\nMissing:\t', len(missing))
print('Errors:\t\t', len(errors))
print('Image Width:\t', min(sizes[:, 0]), max(sizes[:, 0]))
print('Image Height:\t', min(sizes[:, 1]), max(sizes[:, 1]))

# Print error information
if len(errors) > 0:
    print('\nErrors:')
    for j in errors:
        print(j, '\t', ids[j], '\t', urls[j])


Process:         0    Used time: 0.0 s
Process:     20000    Used time: 244.0 s
Process:     40000    Used time: 254.0 s
Process:     60000    Used time: 248.0 s
Process:     80000    Used time: 243.0 s
Process:    100000    Used time: 242.0 s
Process:    120000    Used time: 268.0 s
Process:    140000    Used time: 280.0 s
Process:    160000    Used time: 296.0 s
Process:    180000    Used time: 270.0 s

Missing:	 3117
Errors:		 31
Image Width:	 260 7874
Image Height:	 260 6614

Errors:
16456 	 16457 	 http://www.lyh8.cn/upload/news/2011-4-22/20114221713493373.jpg
29320 	 29321 	 http://i.ebayimg.com/00/s/MTIwMFgxNjAw/z/BMEAAOSwvihY-WBx/%24_57.JPG?set_id=8800005007
40458 	 40459 	 http://image.cn.made-in-china.com/cnimg/prod_0d0qcnapec52/0/%E9%92%A2%E6%9C%A8%E7%BB%93%E6%9E%84%E8%A1%A3%E5%B8%BD%E6%9E%B6-SRA_270x270.jpg
41837 	 41838 	 http://www.lyh8.cn/upload/news/2014-8-31/20148311211159880z1jk.jpg
48201 	 48202 	 http://www.lyh8.cn/upload/news/2011-4-22/20114221536529854.jpg
49642 

In [11]:
# Check validation set
sizes = []
errors = []
index = []
missing = []

ids = val_df['id'].values
urls = val_df['url'].values
t0 = time.time()

for i in range(len(val_df)):
    tmp_id = ids[i]
    
    # Helpful information
    if i % 2000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 25 == 0:
        print('=', end='')
        
    # Check if exists
    if not os.path.exists('./data/raw_data/validation_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/validation_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            sizes.append(img.size)
            index.append(tmp_id)
    except:
        errors.append(i)

# Helpful information
sizes = np.array(sizes)
print('\n\nMissing:\t', len(missing))
print('Errors:\t\t', len(errors))
print('Image Width:\t', min(sizes[:, 0]), max(sizes[:, 0]))
print('Image Height:\t', min(sizes[:, 1]), max(sizes[:, 1]))

# Print error information
if len(errors) > 0:
    print('\nErrors:')
    for j in errors:
        print(j, '\t', ids[j], '\t', urls[j])


Process:         0    Used time: 0.0 s
Process:      2000    Used time: 25.0 s
Process:      4000    Used time: 24.0 s
Process:      6000    Used time: 25.0 s

Missing:	 103
Errors:		 0
Image Width:	 260 6016
Image Height:	 260 5792


In [12]:
# Check test set
sizes = []
errors = []
index = []
missing = []

ids = test_df['id'].values
urls = test_df['url'].values
t0 = time.time()

for i in range(len(test_df)):
    tmp_id = ids[i]
    
    # Helpful information
    if i % 2000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 25 == 0:
        print('=', end='')
        
    # Check if exists
    if not os.path.exists('./data/raw_data/test_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/test_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            sizes.append(img.size)
            index.append(tmp_id)
    except:
        errors.append(i)

# Helpful information
sizes = np.array(sizes)
print('\n\nMissing:\t', len(missing))
print('Errors:\t\t', len(errors))
print('Image Width:\t', min(sizes[:, 0]), max(sizes[:, 0]))
print('Image Height:\t', min(sizes[:, 1]), max(sizes[:, 1]))

# Print error information
if len(errors) > 0:
    print('\nErrors:')
    for j in errors:
        print(j, '\t', ids[j], '\t', urls[j])


Process:         0    Used time: 0.0 s
Process:      2000    Used time: 24.0 s
Process:      4000    Used time: 26.0 s
Process:      6000    Used time: 28.0 s
Process:      8000    Used time: 28.0 s
Process:     10000    Used time: 30.0 s
Process:     12000    Used time: 32.0 s

Missing:	 207
Errors:		 0
Image Width:	 260 5760
Image Height:	 260 5616


# 5. Reorganize Images according Labels

In [13]:
# Check whether or not the destination direction exists
if not os.path.exists('./data/train'):
    os.mkdir('./data/train')
for i in range(1, 129):
    if not os.path.exists('./data/train/' + str(i)):
        os.mkdir('./data/train/' + str(i))
        
if not os.path.exists('./data/validation'):
    os.mkdir('./data/validation')
for i in range(1, 129):
    if not os.path.exists('./data/validation/' + str(i)):
        os.mkdir('./data/validation/' + str(i))
        
if not os.path.exists('./data/test'):
    os.mkdir('./data/test')

In [14]:
# Re-order training set
errors = []
missing = []
ids = train_df['id'].values
labels = train_df['label'].values
count = 0
t0 = time.time()

for i in range(len(train_df)):
    tmp_id = ids[i]
    tmp_label = labels[i]
    
    # Helpful information
    if i % 20000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 250 == 0:
        print('=', end='')
        
    # Check if image exists
    if not os.path.exists('./data/raw_data/train_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/train_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            from_path = './data/raw_data/train_images/' + str(tmp_id) + '.jpg'
            to_path = './data/train/' + str(tmp_label) + '/' + str(tmp_id) + '.jpg'
            if os.path.exists(to_path):
                continue
            shutil.copyfile(from_path, to_path)
            count += 1
    except:
        errors.append(i)


Process:         0    Used time: 0.0 s
Process:     20000    Used time: 271.0 s
Process:     40000    Used time: 287.0 s
Process:     60000    Used time: 286.0 s
Process:     80000    Used time: 278.0 s
Process:    100000    Used time: 276.0 s
Process:    120000    Used time: 300.0 s
Process:    140000    Used time: 309.0 s
Process:    160000    Used time: 322.0 s
Process:    180000    Used time: 293.0 s

In [17]:
# Helpful information
print('Missing:\t\t', len(missing))
print('Errors:\t\t\t', len(errors))
print('Totally Moved:\t\t', count)

# Double check
count = 0
for i in range(1, 129):
    count += len(os.listdir('./data/train/' + str(i)))
print('Training Images:\t', count)

Missing:		 3117
Errors:			 31
Totally Moved:		 191680
Training Images:	 191680


In [18]:
# Re-order validation set
errors = []
missing = []
ids = val_df['id'].values
labels = val_df['label'].values
count = 0
t0 = time.time()

for i in range(len(val_df)):
    tmp_id = ids[i]
    tmp_label = labels[i]
    
    # Helpful information
    if i % 2000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 25 == 0:
        print('=', end='')
        
    # Check if image exists
    if not os.path.exists('./data/raw_data/validation_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/validation_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            from_path = './data/raw_data/validation_images/' + str(tmp_id) + '.jpg'
            to_path = './data/validation/' + str(tmp_label) + '/' + str(tmp_id) + '.jpg'
            if os.path.exists(to_path):
                continue
            shutil.copyfile(from_path, to_path)
            count += 1
    except:
        errors.append(i)


Process:         0    Used time: 0.0 s
Process:      2000    Used time: 26.0 s
Process:      4000    Used time: 26.0 s
Process:      6000    Used time: 26.0 s

In [19]:
# Helpful information
print('Missing:\t\t', len(missing))
print('Errors:\t\t\t', len(errors))
print('Totally Moved:\t\t', count)

# Double check
count = 0
for i in range(1, 129):
    count += len(os.listdir('./data/validation/' + str(i)))
print('Validation Images:\t', count)

Missing:		 103
Errors:			 0
Totally Moved:		 6297
Validation Images:	 6297


In [20]:
# Copy test set
errors = []
missing = []
ids = test_df['id'].values
count = 0
t0 = time.time()

for i in range(len(test_df)):
    tmp_id = ids[i]
    
    # Helpful information
    if i % 20000 == 0:
        _ = gc.collect()   # cleanup
        t = time.time() - t0
        print('\nProcess: {:9d}'.format(i), '   Used time: {} s'.format(np.round(t, 0)))
        t0 = time.time()
    if i % 250 == 0:
        print('=', end='')
        
    # Check if image exists
    if not os.path.exists('./data/raw_data/test_images/' + str(tmp_id) + '.jpg'):
        missing.append(i)
        continue
        
    # Try to resize the valid images
    try:
        img = Image.open('./data/raw_data/test_images/' + str(tmp_id) + '.jpg')
        _ = img.resize((224, 224))
        if img.width < 260 or img.height < 260:
            errors.append(i)
        else:
            from_path = './data/raw_data/test_images/' + str(tmp_id) + '.jpg'
            to_path = './data/test/' + str(tmp_id) + '.jpg'
            if os.path.exists(to_path):
                continue
            shutil.copyfile(from_path, to_path)
            count += 1
    except:
        errors.append(i)


Process:         0    Used time: 0.0 s

In [22]:
# Helpful information
print('Missing:\t\t', len(missing))
print('Errors:\t\t\t', len(errors))
print('Totally Moved:\t\t', count)

# Double check
print('Test Images:\t\t', len(os.listdir('./data/test')))

Missing:		 207
Errors:			 0
Totally Moved:		 12593
Test Images:		 12593
