<a href="https://colab.research.google.com/github/M-Nitsche/BeeWatch/blob/main/dataset/flickr_dataset_collection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collect an image dataset from Flickr

In [25]:
# Uncomment on first run (e.g. on Colab) to install the Flickr API client:
# %pip install flickrapi

In [7]:
from flickrapi import FlickrAPI

import requests
import json
import os
import sys
import time

In [8]:
# Read the Flickr credentials from a local JSON file so they never appear in the notebook.
# The encoding is given explicitly so the read does not depend on the machine's locale.
with open('flickr_api_key.json', 'r', encoding='utf-8') as f:
    api_keys = json.load(f)
# Show only the key names, never the values, so the saved output does not leak secrets.
print(api_keys.keys())

dict_keys(['KEY', 'SECRET'])


In [9]:
# Credentials taken from the `api_keys` dict loaded from flickr_api_key.json.
# 'KEY' is mandatory (a missing entry raises KeyError); 'SECRET' is optional and
# becomes None when the file has no such entry.
KEY = api_keys['KEY']
SECRET = api_keys.get('SECRET')

# Photo-size URL fields to request, in order of preference: get_url() returns the
# first one a photo actually provides. Per the original note, "url_l" is roughly
# 1024 x 732 and "url_c" roughly 800 x 572; other size fields are "url_o", "url_k"
# and "url_h".
SIZES = ["url_l", "url_c"]

In [10]:
def get_photos(image_tag):
    """Start a Flickr text search for ``image_tag`` and return its results.

    A new ``FlickrAPI`` client is built from the module-level ``KEY`` and
    ``SECRET`` on every call, and the value returned by its ``walk`` method is
    handed back unchanged (``get_urls`` iterates over it photo by photo).

    Args:
        image_tag: Free-text search phrase, passed to Flickr as ``text``.

    Returns:
        Whatever ``FlickrAPI.walk`` returns for the search.
    """
    search_options = {
        "text": image_tag,
        # Ask Flickr to attach the URL fields named in SIZES to every photo.
        "extras": ",".join(SIZES),
        # 1 = public photos only (per the original author's note).
        # NOTE(review): that note also mentioned Creative Commons, but no
        # `license` argument is passed here, so results are not filtered by
        # licence -- confirm whether a licence filter is wanted.
        "privacy_filter": 1,
        "per_page": 50,
        "sort": "relevance",
    }
    return FlickrAPI(KEY, SECRET).walk(**search_options)

In [11]:
def get_url(photo):
    """Return the first available image URL for ``photo``, or ``None``.

    The size fields listed in ``SIZES`` are tried in order, so earlier entries
    take precedence over later ones.

    Args:
        photo: Any object exposing a mapping-style ``get(name)`` method
            (``get_urls`` passes the items produced by ``get_photos``).

    Returns:
        The first truthy value found among the ``SIZES`` fields, or ``None``
        when the photo offers none of them.
    """
    for size_field in SIZES:
        url = photo.get(size_field)
        if url:
            return url
    return None

In [12]:
def get_urls(image_tag, max):
    """Collect up to ``max`` image URLs for the search phrase ``image_tag``.

    Photos that expose none of the requested size URLs are skipped and do not
    count towards the limit.

    Args:
        image_tag: Search phrase forwarded to ``get_photos``.
        max: Upper bound on the number of URLs returned. The name shadows the
            built-in ``max`` inside this function; it is kept because it is
            part of the existing signature.

    Returns:
        A list of at most ``max`` URLs, in the order the search yielded them.
    """
    urls = []
    if max <= 0:
        # Nothing was asked for, so do not even start a search.
        return urls

    for photo in get_photos(image_tag):
        url = get_url(photo)
        if not url:
            continue
        urls.append(url)
        # Stop as soon as the quota is met, before pulling another photo:
        # advancing the iterator once more could trigger a needless request.
        if len(urls) >= max:
            break

    return urls

In [13]:
def create_folder(path):
    """Create the directory ``path`` (and any missing parents) if it is absent.

    Calling it for a directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok avoids the check-then-create race of a separate isdir() test.
    os.makedirs(path, exist_ok=True)

In [14]:
def download_images(urls, path, timeout=30):
    """Download every URL in ``urls`` into the directory ``path``.

    The file name is the last ``/``-separated segment of the URL. URLs whose
    target file already exists are skipped, so the function can be re-run to
    resume an interrupted download.

    Args:
        urls: Iterable of image URLs.
        path: Destination directory; created if missing.
        timeout: Seconds to wait for the server before giving up on a single
            URL, so one stalled connection cannot hang the whole run.
    """
    create_folder(path)  # makes sure path exists

    for url in urls:
        image_name = url.split("/")[-1]
        image_path = os.path.join(path, image_name)

        if os.path.isfile(image_path):  # ignore if already downloaded
            continue

        try:
            response = requests.get(url, timeout=timeout)
            # Without this check an HTTP error page would be saved as an image.
            response.raise_for_status()
        except requests.RequestException as error:
            # One bad URL should not abort the rest of the batch.
            print('Skipping', url, '-', error)
            continue

        # Write to a temporary name and rename afterwards: an interrupted write
        # then never leaves a truncated file that later runs would treat as
        # "already downloaded".
        partial_path = image_path + '.part'
        with open(partial_path, 'wb') as outfile:
            outfile.write(response.content)
        os.replace(partial_path, image_path)

In [15]:
def download(image_tag, no_of_images=10, folder_name=None):
    """Search Flickr for ``image_tag`` and save the matching images locally.

    Progress and the elapsed wall-clock time are printed.

    Args:
        image_tag: Search phrase; also used to name the output folder when
            ``folder_name`` is not given.
        no_of_images: Maximum number of image URLs to collect.
        folder_name: Optional base name for the output folder.

    The images are written to ``<base>_data`` relative to the working
    directory, where ``<base>`` is ``folder_name`` with spaces replaced by
    ``-`` or, when no folder name is given, ``image_tag`` with spaces replaced
    by ``_``.
    """
    started_at = time.time()

    print('Getting urls for:', image_tag)
    urls = get_urls(image_tag, no_of_images)

    print('Downloading images for', image_tag)
    # NOTE(review): an explicit folder_name gets "-" for spaces while the
    # tag-derived name gets "_"; kept unchanged here -- confirm this is intended.
    base = folder_name.replace(" ", "-") if folder_name else image_tag.replace(" ", "_")
    download_images(urls, base + "_data")

    elapsed = round(time.time() - started_at, 2)
    print('Took', elapsed, 'seconds')

Collect the bees-on-flowers dataset from [Flickr](https://www.flickr.com/search/?media=photos&advanced=1&text=):

In [60]:
# Fetch up to 1000 images for the search phrase; files are written to ./bee_flowers_data
# and images already present there are not downloaded again.
download('bee flowers', no_of_images=1000)

Getting urls for: bee flowers
Downloading images for bee flowers
Took 0.4 seconds


[Mosaic Augmentation](https://iopscience.iop.org/article/10.1088/1742-6596/1684/1/012094/pdf)

In [17]:
# Fetch up to 1000 images for the search phrase; files are written to ./flower_bushes_data
download('flower bushes', no_of_images=1000)

Getting urls for: flower bushes
Downloading images for flower bushes
Took 502.35 seconds


In [16]:
# Fetch up to 1000 images for the search phrase; files are written to ./flowers_data
download('flowers', no_of_images=1000)

Getting urls for: flowers
Downloading images for flowers
Took 460.2 seconds


In [66]:
#download('blumen', no_of_images=10, folder_name="flowers")

Getting urls for: blumen
Downloading images for blumen
Took 0.37 seconds


Ref: https://medium.com/@adrianmrit/creating-simple-image-datasets-with-flickr-api-2f19c164d82f