In [1]:
import os
import glob
import shutil

import numpy as np
import pandas as pd
from PIL import Image

import tensorflow as tf
import torch 

from sklearn.model_selection import train_test_split

In [2]:
# download the data
data_path = 'https://he-public-data.s3.ap-southeast-1.amazonaws.com/dataset.zip'
data_path = tf.keras.utils.get_file('full-dataset', data_path, cache_dir='.', extract=True)
data_path

Downloading data from https://he-public-data.s3.ap-southeast-1.amazonaws.com/dataset.zip


'./datasets/full-dataset'

In [3]:
images_path = os.listdir('/content/datasets/dataset/images')
images_path[:5]

['dc3a705b1f27cd4a660a1bb1c53b898d.jpg',
 'f26a657e0e25b351503adb12059525e8.jpg',
 'c5e6d669cb50585bc4e911ac32485991.jpg',
 'fb3ab7a23f98bd3c884b7b77e3deb2f0.jpg',
 '7ce059e0f1f5254f018be9572654c4fd.jpg']

In [4]:
train = pd.read_csv('/content/datasets/dataset/train.csv')
train.head()

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,797.0,701.0,262.0,211.0
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,932.0,786.0,329.0,238.0
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,736.0,657.0,275.0,229.0
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,986.0,786.0,136.0,0.0
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,667.0,549.0,228.0,179.0


In [5]:
test = pd.read_csv('/content/datasets/dataset/test.csv')
test.head()

Unnamed: 0,image_path
0,953ab1447c46ecfef67ab14629cd70c7.jpg
1,e4ddbaa7970fca225a51288ce5f7d3f9.jpg
2,5b8120d69607a077b7583334be3ba18b.jpg
3,138b1dc82005b4c33e4886260649d313.jpg
4,0f91ec1533b845b13089f8cf4e0a36f7.jpg


In [6]:
classes = list(train['name'].value_counts().index)
classes_num = list(train['class'].value_counts().index)

labels = dict(zip(classes, classes_num))
labels

{'GARBAGE': 3.0,
 'CONSTRUCTION_ROAD': 4.0,
 'POTHOLES': 2.0,
 'CLUTTER_SIDEWALK': 9.0,
 'BAD_BILLBOARD': 7.0,
 'GRAFFITI': 0.0,
 'SAND_ON_ROAD': 8.0,
 'UNKEPT_FACADE': 10.0,
 'FADED_SIGNAGE': 1.0,
 'BROKEN_SIGNAGE': 5.0,
 'BAD_STREETLIGHT': 6.0}

In [7]:
%cd /content/datasets/dataset

/content/datasets/dataset


In [8]:
os.makedirs('train/images')
os.makedirs('train/labels')

os.makedirs('val/images')
os.makedirs('val/labels')

os.makedirs('test')

In [9]:
train['class'].value_counts()

3.0     8597
4.0     2730
2.0     2625
9.0     2253
7.0     1555
0.0     1124
8.0      748
10.0     127
1.0      107
5.0       83
6.0        1
Name: class, dtype: int64

In [10]:
train = train.drop(train[train['class']==6].index, axis=0)

In [11]:
width , height = Image.open(glob.glob('/content/datasets/dataset/images/*')[0]).size
print(width, height)

1920 1080


In [12]:
# normalize the boundary box 
train['xc'] = round((train['xmax'] + train['xmin']) / (2*width), 2).values
train['yc'] = round((train['ymax'] + train['ymin']) / (2*height), 2).values

train['w'] = round((train['xmax'] - train['xmin']) / width, 2).values
train['h'] = round((train['ymax'] - train['ymin']) / height, 2).values

In [13]:
train.head()

Unnamed: 0,class,image_path,name,xmax,xmin,ymax,ymin,xc,yc,w,h
0,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,797.0,701.0,262.0,211.0,0.39,0.22,0.05,0.05
1,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,932.0,786.0,329.0,238.0,0.45,0.26,0.08,0.08
2,3.0,4a48c42c9579ec0399e6c5a3e825e765.jpg,GARBAGE,736.0,657.0,275.0,229.0,0.36,0.23,0.04,0.04
3,7.0,ea906a663da6321bcef78be4b7d1afff.jpg,BAD_BILLBOARD,986.0,786.0,136.0,0.0,0.46,0.06,0.1,0.13
4,8.0,1c7d48005a12d1b19261b8e71df7cafe.jpg,SAND_ON_ROAD,667.0,549.0,228.0,179.0,0.32,0.19,0.06,0.05


In [14]:
# split data according to classes
train_df, val_df = train_test_split(train, test_size=0.2, stratify=train['class'])

In [15]:
train_df['class'].value_counts(normalize=True)*100

3.0     43.097938
4.0     13.685068
2.0     13.158719
9.0     11.291434
7.0      7.794975
0.0      5.633185
8.0      3.747102
10.0     0.639138
1.0      0.538881
5.0      0.413560
Name: class, dtype: float64

In [16]:
val_df['class'].value_counts(normalize=True)*100

3.0     43.082707
4.0     13.684211
2.0     13.157895
9.0     11.303258
7.0      7.794486
0.0      5.639098
8.0      3.759398
10.0     0.626566
1.0      0.526316
5.0      0.426065
Name: class, dtype: float64

In [17]:
train_imgs = list(set(train_df['image_path']))
val_imgs = list(set(val_df['image_path']))
test_imgs = list(set(test['image_path']))

In [18]:
print(f'Train images num : {len(train_imgs)}')
print(f'Val images num : {len(val_imgs)}')
print(f'Test images num : {len(test_imgs)}')

Train images num : 7232
Val images num : 3092
Test images num : 2092


In [19]:
def copy_files(paths_list, dist, test, folder=None):
  """
  A function takes image paths and copy them in distination path and create txt files with normalized boundary boxes.
  :param paths_list : list
  :param dist       : str
  :param test       : bool
  :param folder     : optional and its values is ['train', 'val', None]. Defult is None.
  :return           : images and text files
  """
  for path in paths_list:
    img_name = path
    path = os.path.join('/content/datasets/dataset/images', path)
    shutil.copy(path, dist)
    
    if not test:
      if folder=='train':
        os.chdir('/content/datasets/dataset/train/labels')
        new_df = train_df[train_df['image_path'] == img_name].drop(['name', 'xmax', 'xmin', 'ymin', 'ymax', 'image_path'], axis=1)

      elif folder=='val':
        os.chdir('/content/datasets/dataset/val/labels')
        new_df = val_df[val_df['image_path'] == img_name].drop(['name', 'xmax', 'xmin', 'ymin', 'ymax', 'image_path'], axis=1)


      with open(f"{img_name.split('.')[0]}" + '.txt', "w") as file:
        for i in range(len(new_df)):
          output = list(new_df.iloc[i, :].values)
          file.write(f'{output[0]} {output[1]} {output[2]} {output[3]} {output[4]}\n')  
      
      os.chdir('/content/datasets/dataset')
    
    else:
      continue    
  

In [20]:
copy_files(train_imgs, '/content/datasets/dataset/train/images', test=False, folder='train')
copy_files(val_imgs, '/content/datasets/dataset/val/images', test=False, folder='val')
copy_files(test_imgs, '/content/datasets/dataset/test', test=True)

In [21]:
paths = ['/content/datasets/dataset/train/labels', '/content/datasets/dataset/val/labels',
         '/content/datasets/dataset/test']

In [22]:
[len(os.listdir(path)) for path in paths]

[7232, 3092, 2092]