In [1]:
import pandas as pd
import requests
from io import BytesIO
from PIL import Image
import os
import time
import concurrent.futures

In [2]:
image_dir = 'datasets/images/'
os.makedirs(image_dir, exist_ok=True)

In [3]:
# csv_file_path = 'datasets/tester.csv'
csv_file_path = 'datasets/keklik_clear.csv'

df = pd.read_csv(csv_file_path)
df.head()

Unnamed: 0,id,image_url,scientific_name,common_name,iconic_taxon_name,taxon_id
0,8292,https://inaturalist-open-data.s3.amazonaws.com...,Alectoris chukar,Азиатский кеклик,Aves,846
1,41520,https://inaturalist-open-data.s3.amazonaws.com...,Alectoris chukar,Азиатский кеклик,Aves,846
2,44190,https://inaturalist-open-data.s3.amazonaws.com...,Alectoris chukar,Азиатский кеклик,Aves,846
3,49645,https://static.inaturalist.org/photos/79346/sm...,Alectoris chukar,Азиатский кеклик,Aves,846
4,65918,https://inaturalist-open-data.s3.amazonaws.com...,Alectoris chukar,Азиатский кеклик,Aves,846


In [4]:
df.scientific_name.unique()

array(['Alectoris chukar', 'Alectoris rufa', 'Alectoris graeca',
       'Alectoris rufa rufa', 'Alectoris rufa intercedens', 'Alectoris',
       'Alectoris barbara koenigi', 'Alectoris barbara spatzi',
       'Alectoris melanocephala', 'Alectoris barbara',
       'Alectoris chukar falki', 'Alectoris graeca whitakeri',
       'Alectoris chukar cypriotes', 'Alectoris magna',
       'Alectoris graeca orlandoi', 'Alectoris rufa hispanica',
       'Alectoris graeca saxatilis', 'Alectoris graeca graeca',
       'Alectoris chukar chukar', 'Alectoris barbara barbara',
       'Alectoris philbyi', 'Alectoris chukar subpallida'], dtype=object)

In [5]:
print(df.shape[0])
df = df.drop(index = df[df['scientific_name'] == 'Alectoris'].index).reset_index(drop=True)
print(df.shape[0])

8393
8389


In [6]:
df.loc[df['scientific_name'] == 'Alectoris rufa rufa', 'scientific_name'] = 'Alectoris rufa'
df.loc[df['scientific_name'] == 'Alectoris rufa intercedens', 'scientific_name'] = 'Alectoris rufa'
df.loc[df['scientific_name'] == 'Alectoris rufa intercedens', 'scientific_name'] = 'Alectoris rufa hispanica'

df.loc[df['scientific_name'] == 'Alectoris barbara koenigi', 'scientific_name'] = 'Alectoris barbara'
df.loc[df['scientific_name'] == 'Alectoris barbara spatzi', 'scientific_name']  = 'Alectoris barbara'
df.loc[df['scientific_name'] == 'Alectoris rufa hispanica', 'scientific_name']  = 'Alectoris barbara'
df.loc[df['scientific_name'] == 'Alectoris barbara barbara', 'scientific_name'] = 'Alectoris barbara'

df.loc[df['scientific_name'] == 'Alectoris chukar falki', 'scientific_name']      = 'Alectoris chukar'
df.loc[df['scientific_name'] == 'Alectoris chukar cypriotes', 'scientific_name']  = 'Alectoris chukar'
df.loc[df['scientific_name'] == 'Alectoris chukar chukar', 'scientific_name']     = 'Alectoris chukar'
df.loc[df['scientific_name'] == 'Alectoris chukar subpallida', 'scientific_name'] = 'Alectoris chukar'

df.loc[df['scientific_name'] == 'Alectoris graeca orlandoi', 'scientific_name']  = 'Alectoris graeca'
df.loc[df['scientific_name'] == 'Alectoris graeca saxatilis', 'scientific_name'] = 'Alectoris graeca'
df.loc[df['scientific_name'] == 'Alectoris graeca graeca', 'scientific_name']    = 'Alectoris graeca'
df.loc[df['scientific_name'] == 'Alectoris graeca whitakeri', 'scientific_name'] = 'Alectoris graeca'

df.scientific_name.nunique()

7

In [7]:
df['scientific_name'] = [s.replace(' ', '_') for s in df['scientific_name']]
unique_species = df.scientific_name.unique()
unique_species

array(['Alectoris_chukar', 'Alectoris_rufa', 'Alectoris_graeca',
       'Alectoris_barbara', 'Alectoris_melanocephala', 'Alectoris_magna',
       'Alectoris_philbyi'], dtype=object)

In [8]:
row_counts = df.groupby('scientific_name').size()
row_counts

scientific_name
Alectoris_barbara           535
Alectoris_chukar           3447
Alectoris_graeca            193
Alectoris_magna               5
Alectoris_melanocephala      36
Alectoris_philbyi             4
Alectoris_rufa             4169
dtype: int64

In [8]:
for name in unique_species:
    os.makedirs(image_dir + name + '/', exist_ok=True)

In [10]:
def download_image(img_url, img_name, img_folder_name, i = -1):
    try:
        img_bytes = requests.get(img_url).content
        img_name = f'{image_dir}{img_folder_name}/{img_name}.jpg'
        with open(img_name, 'wb') as img_file:
            img_file.write(img_bytes)
            if i % 100 == 0:
                print(f'Image {i} was downloaded...')
            # print(f'{img_name} was downloaded...')
            # print(f'Image {i} was downloaded...')
    except Exception as e:
        print('Exception in download_url():', e)

t1 = time.perf_counter()

with concurrent.futures.ThreadPoolExecutor() as executor:
    for i, (url, name, folder) in enumerate(zip(df['image_url'], df.id, df['scientific_name'])):
        executor.submit(download_image, url, name, folder, i)

t2 = time.perf_counter()

print(f'Finished in {t2-t1} seconds')

Image 0 was downloaded...
Image 100 was downloaded...
Image 200 was downloaded...
Image 300 was downloaded...
Image 400 was downloaded...
Image 500 was downloaded...
Image 600 was downloaded...
Image 700 was downloaded...
Image 800 was downloaded...
Image 900 was downloaded...
Image 1000 was downloaded...
Image 1100 was downloaded...
Image 1200 was downloaded...
Image 1300 was downloaded...
Image 1400 was downloaded...
Image 1500 was downloaded...
Image 1600 was downloaded...
Image 1700 was downloaded...
Image 1800 was downloaded...
Image 1900 was downloaded...
Image 2000 was downloaded...
Image 2100 was downloaded...
Image 2200 was downloaded...
Image 2300 was downloaded...
Image 2400 was downloaded...
Image 2500 was downloaded...
Image 2600 was downloaded...
Image 2700 was downloaded...
Image 2800 was downloaded...
Image 2900 was downloaded...
Image 3000 was downloaded...
Image 3100 was downloaded...
Image 3200 was downloaded...
Image 3300 was downloaded...
Image 3400 was downloaded.

---

In [131]:
# from multiprocessing import cpu_count
# from multiprocessing.pool import ThreadPool

# def download_parallel(args):
#     url, name, folder = args
#     download_image(url, name, folder)

# args_list = list(zip(df['image_url'], df['id'], df['scientific_name']))

# cpus = cpu_count()

# t1 = time.perf_counter()

# with ThreadPool(cpus - 1) as pool:
#     print("Starting parallel image downloads...")
#     pool.imap_unordered(download_parallel, args_list)
#     pool.close()
#     pool.join()

# t2 = time.perf_counter()
# print("All downloads completed.")
# print(f'Finished in {t2-t1} seconds')