In [3]:
# Author: Helal Chowdhury

In [4]:
# Import libraries

In [5]:
import requests
import math
import shutil
from getpass import getpass
from PIL import Image, UnidentifiedImageError
from requests.exceptions import HTTPError
from io import BytesIO
from pathlib import Path
import itertools
import glob


In [6]:
# Download images from the web. In this example, three search terms are chosen: Cat, Dog, and Lion.

In [7]:
# Up to five search terms; leave a slot as an empty string to ignore it.
term_1 = "Cat"
term_2 = "Dog"
term_3 = "Lion"
term_4 = ""
term_5 = ""

# Drop blank / whitespace-only slots and keep the remaining terms in sorted order.
search_terms = sorted(
    term
    for term in (term_1, term_2, term_3, term_4, term_5)
    if term.strip()
)

In [8]:
# Huggingface search API 

In [4]:
SEARCH_URL = "https://huggingface.co/api/experimental/images/search"

def get_image_urls_by_term(search_term: str, count=50, timeout=30):
    """Query the image-search endpoint and return thumbnail URLs for a term.

    Args:
        search_term: Text sent as the ``q`` query parameter.
        count: Value sent as the ``count`` query parameter.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        A list with the ``thumbnailUrl`` of every entry in the response's
        ``value`` array.

    Raises:
        requests.exceptions.HTTPError: If the server answers with an error status.
        requests.exceptions.Timeout: If no response arrives within ``timeout``.
        KeyError: If the response JSON lacks ``value`` or an entry lacks
            ``thumbnailUrl``.
    """
    params  = {"q": search_term, "license": "public", "imageType": "photo", "count": count}
    response = requests.get(SEARCH_URL, params=params, timeout=timeout)
    response.raise_for_status()
    response_data = response.json()
    image_urls = [img['thumbnailUrl'] for img in response_data['value']]
    return image_urls


def gen_images_from_urls(urls, timeout=30):
    """Download each URL and yield the ones that open as images.

    A URL is skipped (and counted as skipped exactly once) when the request
    fails, the response status is not 200, or the payload is not an image
    PIL can identify. A one-line summary is printed once the generator has
    been fully consumed.

    Args:
        urls: Any iterable of image URLs (lists and generators both work).
        timeout: Seconds to wait for each download before skipping it.

    Yields:
        ``PIL.Image.Image`` objects, in the order of ``urls``.
    """
    num_retrieved = 0
    num_skipped = 0
    for url in urls:
        try:
            response = requests.get(url, timeout=timeout)
        except requests.RequestException:
            # One unreachable host should not abort the whole batch.
            num_skipped += 1
            continue

        if response.status_code != 200:
            # Do not try to parse an error page as an image.
            num_skipped += 1
            continue

        try:
            img = Image.open(BytesIO(response.content))
        except UnidentifiedImageError:
            num_skipped += 1
            continue

        num_retrieved += 1
        yield img

    # Count explicitly instead of using len(urls) so that iterators are supported.
    print(f"Retrieved {num_retrieved} images. Skipped {num_skipped}.")


def urls_to_image_folder(urls, save_directory):
    """Download the images behind ``urls`` and save them as numbered JPEG files.

    Files are named ``0.jpg``, ``1.jpg``, ... in the order the images are
    produced by ``gen_images_from_urls``, so skipped URLs leave no gaps.

    Args:
        urls: Iterable of image URLs, passed on to ``gen_images_from_urls``.
        save_directory: Directory (``str`` or ``Path``) that receives the
            files; it must already exist.
    """
    save_directory = Path(save_directory)
    for i, image in enumerate(gen_images_from_urls(urls)):
        # JPEG has no alpha channel or palette; saving e.g. an RGBA or P image
        # as .jpg raises OSError, so normalise those modes to RGB first.
        if image.mode not in ("L", "RGB", "CMYK"):
            image = image.convert("RGB")
        image.save(save_directory / f'{i}.jpg')

In [None]:
# Create the "images" folder and, using the Hugging Face search API, download the images for each search term into a subfolder named after that term

In [5]:
data_dir = Path('images')

# Start from a clean slate: remove anything left over from a previous run.
if data_dir.exists():
    shutil.rmtree(data_dir)

# One subfolder per search term, filled with that term's search results.
for search_term in search_terms:
    search_term_dir = data_dir / search_term
    search_term_dir.mkdir(parents=True, exist_ok=True)

    urls = get_image_urls_by_term(search_term)

    print(f"Saving images of {search_term} to {str(search_term_dir)}...")
    urls_to_image_folder(urls, search_term_dir)

Saving images of Cat to images/Cat...
Retrieved 50 images. Skipped 0.
Saving images of Dog to images/Dog...
Retrieved 50 images. Skipped 0.
Saving images of Lion to images/Lion...
Retrieved 50 images. Skipped 0.


In [None]:
# Create the "Output" folder that will hold the split dataset

In [6]:
# Create the output folder in pure Python: unlike `!mkdir Output`, this does not
# complain when the folder already exists and does not depend on the shell.
Path("Output").mkdir(parents=True, exist_ok=True)

In [None]:
# import splitfolders library

In [7]:
import splitfolders

In [8]:
# Print the current directory as a tree to inspect the downloaded images
# (shell escape; needs the `tree` command-line tool to be installed).
! tree .

[01;34m.[00m
├── Data_Collection_Split.ipynb
├── [01;34mimages[00m
│   ├── [01;34mCat[00m
│   │   ├── [01;35m0.jpg[00m
│   │   ├── [01;35m10.jpg[00m
│   │   ├── [01;35m11.jpg[00m
│   │   ├── [01;35m12.jpg[00m
│   │   ├── [01;35m13.jpg[00m
│   │   ├── [01;35m14.jpg[00m
│   │   ├── [01;35m15.jpg[00m
│   │   ├── [01;35m16.jpg[00m
│   │   ├── [01;35m17.jpg[00m
│   │   ├── [01;35m18.jpg[00m
│   │   ├── [01;35m19.jpg[00m
│   │   ├── [01;35m1.jpg[00m
│   │   ├── [01;35m20.jpg[00m
│   │   ├── [01;35m21.jpg[00m
│   │   ├── [01;35m22.jpg[00m
│   │   ├── [01;35m23.jpg[00m
│   │   ├── [01;35m24.jpg[00m
│   │   ├── [01;35m25.jpg[00m
│   │   ├── [01;35m26.jpg[00m
│   │   ├── [01;35m27.jpg[00m
│   │   ├── [01;35m28.jpg[00m
│   │   ├── [01;35m29.jpg[00m
│   │   ├── [01;35m2.jpg[00m
│   │   ├── [01;35m30.jpg[00m
│   │   ├── [01;35m31.jpg[00m
│   │   ├── [01;35m32.jpg[00m
│   │   ├── [01;35m33.jpg[00m
│   │   ├── 

In [None]:
# Split the image folder into train and validation folders

In [9]:
input_dir = "./images/"    # dataset root: one subfolder per class
output_dir = "./Output/"   # destination of the split dataset

splitfolders.ratio(
    input_dir,
    output=output_dir,
    seed=42,            # fixed seed so the split is the same on every run
    ratio=(0.8, 0.2),   # 80% train / 20% validation
    group_prefix=None,  # files are handled individually, not as grouped sets
    move=False,         # copy the files; set to True to move them instead
)

Copying files: 150 files [00:00, 5155.62 files/s]
