# Imports

In [1]:
import os
import requests
import shutil
from PIL import Image, UnidentifiedImageError
from requests.exceptions import HTTPError
from io import BytesIO
from tqdm.auto import tqdm

# Sequential download

In [2]:
# Directory that receives the downloaded images. It is created when missing and
# left untouched when it already exists (exist_ok=True), so `dest` may also be
# pointed at any existing directory.
dest = './huggingpics/'
os.makedirs(dest, exist_ok=True)

In [3]:
# shamelessly borrowed from https://github.com/nateraw/huggingpics
# slightly modified

SEARCH_URL = "https://huggingface.co/api/experimental/images/search"

def get_image_urls_by_term(search_term: str, count=150, thumbnail=True, timeout=10):
    """Query the image search endpoint at ``SEARCH_URL`` and return image URLs.

    Args:
        search_term: Text sent as the ``q`` query parameter.
        count: Value sent as the ``count`` query parameter.
        thumbnail: If True, return each result's ``thumbnailUrl`` (small
            thumbnail); otherwise return its ``contentUrl`` (large image).
        timeout: Timeout in seconds forwarded to ``requests.get`` so that an
            unresponsive endpoint cannot block the notebook indefinitely.

    Returns:
        A list with one URL per entry of the response's ``value`` list, or an
        empty list when the response carries no ``value`` field.

    Raises:
        requests.HTTPError: If the endpoint answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    params = {"q": search_term, "license": "public", "imageType": "photo", "count": count}
    response = requests.get(SEARCH_URL, params=params, timeout=timeout)
    response.raise_for_status()

    # Each search hit exposes both a small thumbnail URL and the full-size image URL.
    key = 'thumbnailUrl' if thumbnail else 'contentUrl'

    # A payload without 'value' is treated as "no results" instead of a KeyError.
    results = response.json().get('value', [])
    return [img[key] for img in results]


def gen_images_from_urls(urls, dest, timeout=10):
    """Download every URL in ``urls`` sequentially and save each image as a JPEG.

    Images are written to ``dest`` as ``<index>.jpg``, where ``<index>`` is the
    position of the URL in ``urls``. A URL is skipped (and counted as skipped
    exactly once) when the request fails, the status code is not 200, or the
    payload cannot be decoded as an image. A summary line is printed at the end.

    Args:
        urls: Iterable of image URLs.
        dest: Existing directory the images are saved into.
        timeout: Timeout in seconds forwarded to ``requests.get``.
    """
    urls = list(urls)  # materialize so len() works for the summary, even for generators
    # Modes that are saved unchanged; anything else (e.g. RGBA, P) is converted
    # to RGB first because it cannot be written as JPEG.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")
    num_skipped = 0

    for i, url in enumerate(tqdm(urls)):
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # connection errors and timeouts skip this URL instead of aborting the run
            num_skipped += 1
            continue

        if response.status_code != 200:
            # do not fall through: an error body must be neither decoded nor counted twice
            num_skipped += 1
            continue

        try:
            img = Image.open(BytesIO(response.content))
            if img.mode in jpeg_modes:
                img.load()  # force decoding now so truncated data is detected here
            else:
                img = img.convert('RGB')
        except (UnidentifiedImageError, OSError):
            # not an image, or the image data is corrupt
            num_skipped += 1
            continue

        img.save(os.path.join(dest, f'{i}.jpg'))

    print(f"Retrieved {len(urls) - num_skipped} images. Skipped {num_skipped}.")

In [4]:
# Ask the search endpoint for thumbnail URLs matching "northern lights",
# passing count=50 as the requested number of results.
urls = get_image_urls_by_term(search_term='northern lights', count=50, thumbnail=True)

In [5]:
urls[:5]

['https://tse3.mm.bing.net/th?id=OIP.GRPWYjmpKFK2zcxTyCgDtQHaEo&pid=Api',
 'https://tse2.mm.bing.net/th?id=OIP.ASrM6kHRQFLnwRtJ9uKrDgHaDl&pid=Api',
 'https://tse3.mm.bing.net/th?id=OIP.WLg05vsBc3tkxp8qU57ZVQHaCF&pid=Api',
 'https://tse3.mm.bing.net/th?id=OIP.wYgB4CwHcQGx8SnFJ_sCVQHaEo&pid=Api',
 'https://tse4.mm.bing.net/th?id=OIP.Q6XA_lGFgAW2L5uF-jjGlQHaE7&pid=Api']

In [6]:
# Download the images one after another (one blocking request per URL)
# and save them into `dest`.
gen_images_from_urls(urls, dest)

  0%|          | 0/50 [00:00<?, ?it/s]

Retrieved 50 images. Skipped 0.


# Parallel downloads

Using joblib to parallelize the download operation.

In [7]:
import uuid
import joblib
import contextlib

In [8]:
@contextlib.contextmanager
def tqdm_parallel(tqdm_object):
    """Temporarily redirect joblib's progress reporting to a tqdm progress bar.

    While the ``with`` block is active, ``joblib.parallel.Parallel.print_progress``
    is replaced by a hook that advances ``tqdm_object`` up to the number of
    tasks the running ``Parallel`` instance reports as completed. On exit
    (normal or via an exception) the original method is put back and the
    progress bar is closed.

    Yields:
        The ``tqdm_object`` that was passed in.
    """
    parallel_cls = joblib.parallel.Parallel
    saved_hook = parallel_cls.print_progress

    def _advance_bar(self):
        # `self` is the Parallel instance; the bar only ever moves forward.
        pending = self.n_completed_tasks - tqdm_object.n
        if pending > 0:
            tqdm_object.update(n=pending)

    parallel_cls.print_progress = _advance_bar
    try:
        yield tqdm_object
    finally:
        parallel_cls.print_progress = saved_hook
        tqdm_object.close()

In [9]:
def download_single_image(url, dest, timeout=10):
    """Download one image from ``url`` and save it in ``dest`` as ``<uuid4 hex>.jpg``.

    Written to be run from parallel workers: every failure that concerns only
    this URL is reported through the return value instead of an exception, so
    one bad URL cannot abort the whole batch.

    Args:
        url: Address of the image to fetch.
        dest: Existing directory the image is saved into.
        timeout: Timeout in seconds forwarded to ``requests.get``.

    Returns:
        True if the image was downloaded and saved. False if the request
        failed, the status code was not 200, or the payload could not be
        decoded as an image.
    """
    # Modes that are saved unchanged; anything else (e.g. RGBA, P) is converted
    # to RGB first because it cannot be written as JPEG.
    jpeg_modes = ("1", "L", "RGB", "RGBX", "CMYK", "YCbCr")

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException:
        # connection errors and timeouts count as a skipped image
        return False
    if response.status_code != 200:
        return False

    try:
        img = Image.open(BytesIO(response.content))
        if img.mode in jpeg_modes:
            img.load()  # force decoding now so truncated data is detected here
        else:
            img = img.convert('RGB')
    except (UnidentifiedImageError, OSError):
        # not an image, or the image data is corrupt
        return False

    # Random uuid-based filename; regenerate in the unlikely case it is already taken.
    file_path = os.path.join(dest, f'{uuid.uuid4().hex}.jpg')
    while os.path.exists(file_path):
        file_path = os.path.join(dest, f'{uuid.uuid4().hex}.jpg')

    img.save(file_path)
    return True

In [10]:
def parallel_gen_images_from_urls(urls, dest):
    """Download all ``urls`` into ``dest`` using joblib workers, with a progress bar.

    One ``download_single_image(url, dest)`` task is scheduled per URL on a
    ``joblib.Parallel`` pool sized to ``os.cpu_count()``. Progress is shown
    through a tqdm bar hooked up via ``tqdm_parallel``. When all tasks have
    finished, a summary of retrieved versus skipped images is printed.

    Args:
        urls: Sized collection of image URLs (``len(urls)`` is used).
        dest: Directory passed through to ``download_single_image``.
    """
    fetch = joblib.delayed(download_single_image)
    progress_bar = tqdm(total=len(urls))

    with tqdm_parallel(progress_bar), joblib.Parallel(n_jobs=os.cpu_count()) as pool:
        outcomes = pool(fetch(link, dest) for link in urls)

    # each task reports a truthy value on success
    n_ok = sum(1 for succeeded in outcomes if succeeded)
    print(f"Retrieved {n_ok} images. Skipped {len(urls) - n_ok}.")

In [11]:
# Download the same URLs again, this time spread across joblib workers;
# the files are saved into `dest` under random uuid-based names.
parallel_gen_images_from_urls(urls, dest)

  0%|          | 0/50 [00:00<?, ?it/s]

Retrieved 50 images. Skipped 0.


In [12]:
os.cpu_count()

4

# Performance comparison

Note: My PC has a **4** Core CPU
* Sequential download took 14-15 seconds to download 50 images.
* Parallel download took 4-5 seconds to download 50 images. Approximately 3X speedup over sequential download