# Dataset downloading

In [None]:
import json

import requests

# Pixabay search endpoint and the query parameters shared by every request
# made in this notebook (later cells reuse `api_url` and `parameters`).
api_url = "https://pixabay.com/api"
parameters = {
    "key": "<API_KEY>",
    "q": "<API_QUERY>",
    "image_type": "photo",
    "category": "<API_CATEGORY>",
    "per_page": "200",
}

# Let requests build the query string so that values containing spaces or
# reserved characters (e.g. a multi-word search term) are URL-encoded instead
# of being pasted verbatim into the URL.
response = requests.get(api_url, params=parameters, timeout=30)
# Fail loudly on an HTTP error (bad key, rate limit, ...) rather than trying
# to parse an error body as JSON.
response.raise_for_status()
# Kept for backward compatibility: the encoded URL that was actually requested.
request_url = response.url

json_response = response.json()
print(f'Found {json_response["total"]} pictures about {parameters["q"]} on Pixabay')


In [None]:
import io
import os
from time import sleep, time

from PIL import Image
from tqdm import tqdm

# Download every image returned for the query into ./dataset/<query>/,
# resized to 800x600 and saved as "<query>-<id>.jpg". Files that already
# exist are skipped, so the cell can be re-run to resume a partial download.
dataset_path = os.path.join(os.getcwd(), "dataset", parameters["q"])
# makedirs also creates the parent "dataset" directory on a first run, and
# exist_ok makes the call a no-op when the directory is already there.
os.makedirs(dataset_path, exist_ok=True)

per_page = int(parameters["per_page"])
# Per the Pixabay API docs, "totalHits" is the number of results reachable
# through the API, which can be lower than "total"; asking for pages beyond it
# returns an error instead of JSON. Fall back to "total" if the field is absent.
accessible_hits = json_response.get("totalHits", json_response["total"])
# Ceiling division: an exact multiple of per_page must not yield an extra page.
number_of_pages = -(-accessible_hits // per_page)

# Minimum time budget per image, in seconds (keeps us under 100 calls per minute)
min_seconds_per_image = 0.66

# For each result page
for page in range(1, number_of_pages + 1):
    parameters["page"] = page
    # requests URL-encodes the parameters (the query may contain spaces)
    response = requests.get(api_url, params=parameters, timeout=30)
    response.raise_for_status()
    request_url = response.url
    json_response = response.json()
    # For each image from that page; tqdm takes the total from the list
    # itself, so the bar also completes on a last page with fewer hits.
    for result in tqdm(json_response["hits"],
                       desc=f"Page {page}/{number_of_pages}"):
        image_path = os.path.join(dataset_path, f'{parameters["q"]}-{result["id"]}.jpg')
        if os.path.isfile(image_path):
            continue
        start = time()
        try:
            # Download the image
            image_response = requests.get(result["largeImageURL"],
                                          headers={"User-Agent": "Chrome"},
                                          timeout=30)
            image_response.raise_for_status()
            # Resize and save the image; decoding happens inside the try so
            # that a body that is not a valid image only skips this result.
            with Image.open(io.BytesIO(image_response.content)) as raw_image:
                raw_image.resize((800, 600)).save(image_path)
        except (OSError, requests.RequestException) as error:
            # Best effort: report the failure and move on to the next image.
            tqdm.write(f'Skipping image {result["id"]}: {error}')
        # Only wait if the process took less than 0.66 seconds (100 calls per
        # minute). This runs for failed downloads too, since they also count
        # as calls.
        elapsed = time() - start
        if elapsed < min_seconds_per_image:
            sleep(min_seconds_per_image - elapsed)


# Model training