In [19]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import io
import os
import sys
import json
import urllib3
import multiprocessing

from PIL import Image
from tqdm import tqdm
from urllib3.util import Retry

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

In [20]:
def download_image(fnames_and_urls):
    """
    download image and save its with 90% quality as JPG format
    skip image downloading if image already exists at given path
    :param fnames_and_urls: tuple containing absolute path and url of image
    """
    fname, url = fnames_and_urls
    if not os.path.exists(fname):
        http = urllib3.PoolManager(retries=Retry(connect=3, read=2, redirect=3))
        response = http.request("GET", url)
        image = Image.open(io.BytesIO(response.data))
        image_rgb = image.convert("RGB")
        image_rgb.save(fname, format='JPEG', quality=90)


def parse_dataset(_dataset, _outdir, _max=10000):
    """
    parse the dataset to create a list of tuple containing absolute path and url of image
    :param _dataset: dataset to parse
    :param _outdir: output directory where data will be saved
    :param _max: maximum images to download (change to download all dataset)
    :return: list of tuple containing absolute path and url of image
    """
    _fnames_urls = []
    with open(_dataset, 'r') as f:
        data = json.load(f)
        for image in data["images"]:
            url = image["url"]
            fname = os.path.join(outdir, "{}.jpg".format(image["imageId"]))
            _fnames_urls.append((fname, url))
    return _fnames_urls[:_max]


In [21]:
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print("error: not enough arguments")
        sys.exit(0)

    # get args and create output directory
    dataset, outdir = sys.argv[1:]
    if not os.path.exists(outdir):
        os.makedirs(outdir)

    # parse json dataset file
    fnames_urls = parse_dataset(f'train.json', 'data/train')

    # download data
    pool = multiprocessing.Pool(processes=12)
    with tqdm(total=len(fnames_urls)) as progress_bar:
        for _ in pool.imap_unordered(download_image, fnames_urls):
            progress_bar.update(1)

    sys.exit(1)

  0%|          | 0/10000 [00:00<?, ?it/s]


NotADirectoryError: [Errno 20] Not a directory: '/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/1.jpg'

In [14]:
len(list(open(f'train.json')))

15052768

In [15]:
dataset

'-f'

In [22]:
fnames_urls

[('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/1.jpg',
  'https://contestimg.wish.com/api/webimage/570f35feb2f4b95d223aa9b1-large'),
 ('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/2.jpg',
  'https://contestimg.wish.com/api/webimage/5468f1c0d96b290ff8e5c805-large'),
 ('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/3.jpg',
  'https://contestimg.wish.com/api/webimage/546410237d57f323e72ca414-large'),
 ('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/4.jpg',
  'https://contestimg.wish.com/api/webimage/550b955fdd699c1a0351f84e-large'),
 ('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/5.jpg',
  'https://contestimg.wish.com/api/webimage/54451f33355b4e0fd3028a30-large'),
 ('/run/user/1000/jupyter/kernel-f8337a69-8748-4f93-9207-ba5ba56f7fba.json/6.jpg',
  'https://contestimg.wish.com/api/webimage/571e0b1cea3cc75d8a004f37-large'),
 ('/run/user/1000/jupyter/kernel-f