In [1]:
import pandas as pd
from tqdm import tqdm 
import os
from shutil import copyfile as copy

In [2]:
# Read the image metadata table from images.csv in the working directory,
# then show it as the cell output.
df = pd.read_csv(filepath_or_buffer='images.csv')
df

Unnamed: 0,img_link,tags,likes,comments,path
0,https://cdn.pixabay.com/photo/2022/03/06/05/30...,"Clouds, Sky, Atmosphere, Blue Sky",196,55,Imgs/clouds-7050884__480.jpg
1,https://cdn.pixabay.com/photo/2022/04/07/11/45...,"Bird, Ornithology, Hummingbird",76,20,Imgs/bird-7117346__340.jpg
2,https://cdn.pixabay.com/photo/2022/02/28/15/28...,"Sea, Rainbow, Rainfall, Subtropical",282,106,Imgs/sea-7039471__340.jpg
3,https://cdn.pixabay.com/photo/2022/04/04/02/52...,"Cherry Blossoms, Road, Japan, Sakura",42,11,Imgs/cherry-blossoms-7110279__340.jpg
4,https://cdn.pixabay.com/photo/2022/04/09/18/06...,"Cape Marguerite, Flower, Plant",39,15,Imgs/cape-marguerite-7121992__340.jpg
...,...,...,...,...,...
657,https://cdn.pixabay.com/photo/2022/03/19/12/13...,"Flower, Plant, Petals, Snowdrop, Blossom",41,31,Imgs/flower-7078412__340.jpg
658,https://cdn.pixabay.com/photo/2022/04/06/13/11...,"Flower, Flora, Nature, Garden, Blooming",5,4,Imgs/flower-7115571__340.jpg
659,https://cdn.pixabay.com/photo/2022/03/28/03/00...,Foto vom 25.03.2022,25,24,Imgs/insect-7096440__340.jpg
660,https://cdn.pixabay.com/photo/2015/03/03/05/56...,"Avenue, Trees, Road, Tree Lined",1264,311,Imgs/avenue-656969__340.jpg


### 1. Removing missing rows and unnecessary columns

In [3]:
# Per-column count of missing values.
df.isnull().sum()

img_link    0
tags        1
likes       0
comments    0
path        0
dtype: int64

In [4]:
# Discard every row that has a missing value in any column.
df = df.dropna(axis=0, how='any')

In [5]:
# Keep only the two columns the rest of the notebook uses. Selecting by name
# (instead of dropping the others) makes this cell safe to re-run -- a second
# drop() would raise KeyError -- and pins the (tags, path) column order that
# the later row-wise unpacking of df.values relies on.
df = df[['tags', 'path']]
df

Unnamed: 0,tags,path
0,"Clouds, Sky, Atmosphere, Blue Sky",Imgs/clouds-7050884__480.jpg
1,"Bird, Ornithology, Hummingbird",Imgs/bird-7117346__340.jpg
2,"Sea, Rainbow, Rainfall, Subtropical",Imgs/sea-7039471__340.jpg
3,"Cherry Blossoms, Road, Japan, Sakura",Imgs/cherry-blossoms-7110279__340.jpg
4,"Cape Marguerite, Flower, Plant",Imgs/cape-marguerite-7121992__340.jpg
...,...,...
657,"Flower, Plant, Petals, Snowdrop, Blossom",Imgs/flower-7078412__340.jpg
658,"Flower, Flora, Nature, Garden, Blooming",Imgs/flower-7115571__340.jpg
659,Foto vom 25.03.2022,Imgs/insect-7096440__340.jpg
660,"Avenue, Trees, Road, Tree Lined",Imgs/avenue-656969__340.jpg


### 2. Finding all the tags


In [10]:
# Collect every distinct tag across all rows. Each row stores its tags as a
# single ', '-separated string.
unique_tags = set()
for tag in tqdm(df['tags']):
    # Strip stray whitespace and skip blank pieces (e.g. from a trailing
    # separator) so an empty tag name never reaches the folder-creation step.
    unique_tags.update(t.strip() for t in tag.split(', ') if t.strip())
# Sorted so the tag list comes out in the same order on every run
# (iteration order of a set of strings is not stable between runs).
tags = sorted(unique_tags)

100%|████████████████████████████████████████████████████████████████████████████████████████| 661/661 [00:00<?, ?it/s]


### 3. Creating Folders for each Tag


In [17]:
# Create one folder per tag under Dataset/. Already-existing folders are
# accepted (exist_ok), so the cell can be re-run. Tags that cannot be turned
# into a folder are collected and reported rather than silently ignored, and
# only filesystem/path errors are caught so Ctrl-C still interrupts the loop.
failed_tags = []
for tag in tqdm(tags):
    try:
        os.makedirs('Dataset/' + tag, exist_ok=True)
    except (OSError, ValueError) as err:
        failed_tags.append((tag, err))

if failed_tags:
    print('Could not create', len(failed_tags), 'folder(s):')
    for tag, err in failed_tags:
        print(' ', repr(tag), ':', err)
    

100%|████████████████████████████████████████████████████████████████████████████| 1168/1168 [00:00<00:00, 5122.28it/s]


### 4. Saving Images in Specific Folders


In [36]:
# Copy each image into the folder of every tag it carries. Columns are read
# by name so the loop does not depend on the DataFrame's column order.
# Failed copies are collected and summarised instead of being discarded.
failed_copies = []
for tag, src in tqdm(zip(df['tags'], df['path']), total=len(df)):
    filename = src.split('/')[-1]  # identical for every destination of this image
    # Skip blank tag names: 'Dataset/' + '' + '/' would place the file
    # directly inside Dataset/ instead of inside a tag folder.
    names = [t.strip() for t in tag.split(', ') if t.strip()]

    for name in names:
        dst = 'Dataset/' + name + '/' + filename
        try:
            copy(src, dst)
        except (OSError, ValueError) as err:
            # Best effort: keep going, but remember what went wrong.
            failed_copies.append((src, dst, err))

if failed_copies:
    print(len(failed_copies), 'copy operation(s) failed, e.g.:')
    for src, dst, err in failed_copies[:5]:
        print(' ', src, '->', dst, ':', err)


100%|███████████████████████████████████████████████████████████████████████████████| 661/661 [00:01<00:00, 371.71it/s]


### 5. Checking Number of Folders


In [37]:
# List only the sub-directories of Dataset/. os.listdir also returns plain
# files, which would inflate a count that is meant to cover folders only.
# Sorted because os.listdir returns entries in arbitrary order.
folders = sorted(
    entry for entry in os.listdir('Dataset')
    if os.path.isdir('Dataset/' + entry)
)
print(len(folders))

1163


### 6. Checking number of Images in Each Folder


In [38]:
# Build two parallel lists: folder names and the number of entries in each.
folder = []
freq = []

for f in tqdm(folders):
    try:
        n_entries = len(os.listdir('Dataset/' + f))
    except OSError:
        # Entry is not a listable directory (e.g. a plain file, or access was
        # denied); skip it. Only filesystem errors are ignored, so Ctrl-C and
        # genuine programming errors still surface.
        continue
    # Append to both lists only after the listing succeeded, keeping them
    # the same length and aligned.
    folder.append(f)
    freq.append(n_entries)

100%|███████████████████████████████████████████████████████████████████████████| 1163/1163 [00:00<00:00, 11899.07it/s]


### 7. Top 10 Folders with the Most Images

In [41]:
# Pair each folder name with its entry count, then show the ten rows with
# the highest counts.
df_cur = pd.DataFrame(dict(folder=folder, freq=freq))
df_cur.sort_values('freq', ascending=False).iloc[:10]

Unnamed: 0,folder,freq
694,Nature,70
414,Flowers,64
412,Flower,59
94,Bird,58
29,Animal,41
967,Spring,34
798,Plant,29
107,Bloom,27
725,Ornithology,27
110,Blossom,26
