## ***Import necessary packages***

In [None]:
import os, glob, shutil
import cv2
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import plotly.express as px
from PIL import Image
from google.colab import drive
from xml.dom import minidom
from IPython.display import clear_output

## ***Data Download***
***Data Sources:***
> *   3002 Images from dataset released by organizer
> *   272 Images from [Dhaka-Traffic repository](https://github.com/Morshed-Alam/Dhaka-Traffic.git)
> *   499 Images from 1st round test1 data. (Annotated manually)

***Download data released by the organizer***

> Corrected incorrect files manually



In [None]:
# Downloading the dataset
# All relative paths in this cell are resolved from /content/.
%cd /content/

# Removing unnecessary demo data folder from workspace.
!rm -r sample_data
 
# Fetch the archive from Google Drive by file id, extract it, then delete the zip.
!gdown --id 1CdSorVH5Umf5FbY1FAuIq-UrLxFJlSPz
!unzip Final-Train-Dataset.zip; rm Final-Train-Dataset.zip;

# Renaming raw data folder to remove space. It makes life a lot easier
%mv 'Final Train Dataset' train_data_raw
# Hide the long download/unzip log from the cell output.
clear_output()

***Download data released in [Dhaka-Traffic repository](https://github.com/Morshed-Alam/Dhaka-Traffic.git)***

In [None]:
%cd /content/
!curl -L 'https://codeload.github.com/Morshed-Alam/Dhaka-Traffic/zip/v1.0.0' > source.zip; unzip source.zip; rm source.zip
clear_output()

# Move data to train_data_raw
files = glob.glob(f'/content/Dhaka-Traffic-1.0.0/data/*')
for file in files:
    shutil.move(file, '/content/train_data_raw/')

# Remove remaining folders
!rm -r /content/Dhaka-Traffic-1.0.0/

***Download Test1 data***

In [None]:
%cd /content/
!gdown --id 1AgJ9d7JoNujSucBUlBZPTRga9Yp2G_j_
!unzip test1_labeled.zip; rm test1_labeled.zip;
clear_output()

# Move data to train_data_raw
files = glob.glob(f'/content/test1_labeled/*')
for file in files:
    shutil.move(file, '/content/train_data_raw/')

# Remove remaining folder
!rm -r /content/test1_labeled

## ***Find Images containing lower frequency classes from data***

> Frequency = number of vehicles



***Vehicle class labels***

In [None]:
# Class names in index order; the position of a name is its numeric class id.
_class_names = [
    "ambulance",
    "army vehicle",
    "auto rickshaw",
    "bicycle",
    "bus",
    "car",
    "garbagevan",
    "human hauler",
    "minibus",
    "minivan",
    "motorbike",
    "pickup",
    "policecar",
    "rickshaw",
    "scooter",
    "suv",
    "taxi",
    "three wheelers (CNG)",
    "truck",
    "van",
    "wheelbarrow",
]

# Look-up table: class name -> class index (0..20)
lut = {class_name: class_index for class_index, class_name in enumerate(_class_names)}

***Read number of vehicles in each class***

In [None]:
# Map every known class name to the list of xml files in which it occurs.
# A file is appended once per matching <object>, so len(list) is the number
# of annotated vehicles of that class, not the number of files.
xml_files = glob.glob('/content/train_data_raw/*.xml')
label_stat = {}
for fname in xml_files:
    xmldoc = minidom.parse(fname)
    for item in xmldoc.getElementsByTagName('object'):
        # get class label
        classid = item.getElementsByTagName('name')[0].firstChild.data
        if classid in lut:
            label_stat.setdefault(classid, []).append(fname)
        else:
            # Both values must be passed as one tuple to the % operator
            print("warning: label '%s' not in look-up table for file '%s'" % (classid, fname))

# Order the classes from least to most frequent
label_stat = dict(sorted(label_stat.items(), key=lambda item: len(item[1])))

for key, fnames in label_stat.items():
    print(key, " :", len(fnames))

***List xml files containing lower frequency classes***


In [None]:
num = 11                                 # Number of low frequency classes to which we apply augmentation

# Gather the xml files of the first `num` entries of label_stat
minor_class_list = [
    xml_path
    for xml_paths in list(label_stat.values())[:num]
    for xml_path in xml_paths
]

# A file is listed once per object, so collapse the duplicates
minor_class_set = set(minor_class_list)
minor_class_list = list(minor_class_set)

## ***Apply augmentation to increase the image items***
We only apply the augmentation to low frequency classes of our training set not to validation set

[***Download augmentation repository from github***](https://github.com/Morshed-Alam/DataAugmentation.git)

In [None]:
# Clone the augmentation helper repository next to the data (it ends up in /content/DataAugmentation)
%cd /content/
!git clone https://github.com/Morshed-Alam/DataAugmentation.git

***List of files containing low frequency classes***

In [None]:
# For every minority-class xml file, look for an image with the same stem
# under each supported extension ('.xml' is stripped with [:-4]).
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = [
    image_path
    for xml_path in minor_class_list
    for ext in formats
    for image_path in glob.glob(f'{xml_path[:-4]}.{ext}')
]

# The label files are the minority-class xml files themselves
label_file_list_xml = minor_class_list

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

***Create augment output directory***

> Image and xml file will be saved to these directories after augmentation



In [None]:
# One output folder per augmentation; the order is relied on by index in later cells
augment_output_dirs = ['/content/horizontalflip', '/content/translate', '/content/scale', '/content/rotate', '/content/shear', '/content/randomhsv']

for out_dir in augment_output_dirs:
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
        print(f"Directory {out_dir} is created successfully!")
    else:
        print(f'Directory {out_dir} already exists !')
       

***Create augmentation objects***

> We apply following augmentation

*   Horizontal flip
*   Scale
*   Translation
*   Rotation
*   Random HSV
*   Shear



In [None]:
# Switch into the cloned repository before importing its modules
# (presumably so that `augmentation` and `data_aug` resolve from the working directory - confirm).
%cd /content/DataAugmentation/
import augmentation as ag
import data_aug as da

# One transform object per augmentation listed above.
# NOTE(review): the numeric arguments are passed straight to the DataAugmentation
# constructors; their units and valid ranges are defined by that library - confirm there.
aug_hf = da.HorizontalFlip()
aug_s = da.Scale(0.898)
aug_t = da.Translate(0.245)
aug_r = da.Rotate(37)
aug_hsv = da.RandomHSV(hue=2, saturation=66, brightness=20)
aug_shear = da.Shear(0.602)

***Apply augmentation***

In [None]:
# Transforms listed in the same order as augment_output_dirs:
# horizontal flip, translate, scale, rotation, shear, random HSV
augmenters = [aug_hf, aug_t, aug_s, aug_r, aug_shear, aug_hsv]

# Run every transform over the minority-class images, writing to its own folder
for out_dir, augmenter in zip(augment_output_dirs, augmenters):
    ag.apply_aug(image_file_list, out_dir, augmenter, lut)

clear_output()

***Testing correctness of augmentation***

In [None]:
%cd /content/DataAugmentation/
from bbox_util import draw_rect
from augmentation import xml2array
 
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
for i in range(6):
   path = augment_output_dirs[i]+'/'+os.path.split(augment_output_dirs[i])[1]+'_'
   img_files = []
   for format in formats:
       img_files.extend(glob.glob(f'{path}*.{format}'))
   # change index to see other image
   img_file = img_files[1]
   img = cv2.imread(img_file)
   bbox = xml2array(os.path.splitext(img_file)[0]+'.xml', lut)
   an_img = draw_rect(img, bbox)
   original_img_file = '/content/train_data_raw/'+img_file.replace(path, '')
   original_img = cv2.imread(original_img_file)
   original_bbox = xml2array(os.path.splitext(original_img_file)[0]+'.xml', lut)
   original_an_img = draw_rect(original_img, original_bbox)
 
   plt.figure(i, figsize=(12,6))
   plt.subplot(121)
   plt.imshow(an_img)
   plt.title(os.path.split(img_file)[1])
 
   plt.subplot(122)
   plt.imshow(original_an_img)
   plt.title(os.path.split(original_img_file)[1])

***Move all augmentation output to train_data_raw folder***

In [None]:
# Fold every augmented image/label into the raw training folder,
# then delete the (now empty) augmentation folder.
for out_dir in augment_output_dirs:
    for entry_name in os.listdir(out_dir):
        shutil.move(f'{out_dir}/{entry_name}', f'/content/train_data_raw/{entry_name}')
    os.rmdir(out_dir)

<a id="5"></a>
## ***Train and Validation Split***
To train the model and evaluate it at the same time, we split the whole training dataset into a train set and a validation set. We follow the $80-20$ division rule for the train and validation split; note that the cell below holds out a fixed 1000 images for validation.

In [None]:
# Collect every image in train_data_raw, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = [
    image_path
    for ext in formats
    for image_path in glob.glob(f'/content/train_data_raw/*.{ext}')
]

# Collect every xml label file in the same folder
label_file_list_xml = glob.glob('/content/train_data_raw/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

In [None]:
# Fixed seed so the same validation set is drawn on every run
random.seed(1500)

# Sort first so the sampled indices refer to a deterministic file order
image_file_list = sorted(image_file_list)
label_file_list_xml = sorted(label_file_list_xml)

# Randomly select the indices of the images that go to the validation set
valid_set_index = random.sample(range(len(image_file_list)), 1000)

# sanity check of the image files and labels being in the same order
print('Checking files concurrency')
print(image_file_list[:5])
print(label_file_list_xml[:5])

# code to separate train and validation set
# Take the images at the sampled indices
valid_selected_images = [image_file_list[index] for index in valid_set_index]
# Derive each label path from its image (same stem, '.xml') so a pair can never
# drift apart, even if the two sorted lists differ in length or order.
valid_selected_labels = [os.path.splitext(image_path)[0] + '.xml'
                         for image_path in valid_selected_images]

print('\n\nChecking files concurrency in validation set')
print(valid_selected_images[-5:])
print(valid_selected_labels[-5:])

In [None]:
# Creating required directories
dir = '/content/valid/'

if not os.path.exists(dir):
    os.makedirs(dir)
    print(f"Directory {dir} is created successfully!")
else:
    print(f'Directory {dir} already exists !')

# Move each validation image, then its label, into the validation folder.
# Paths that do not exist are reported instead of raising.
for image_path, label_path in zip(valid_selected_images, valid_selected_labels):
    for mypath in (image_path, label_path):
        if not os.path.exists(mypath):
            print(f'{mypath} not found')
            continue
        shutil.move(mypath, dir + mypath.split('/')[-1])



In [None]:
# Rename train_data_raw to train
# The validation files were moved out in the previous cell, so what remains is the training set.
%mv /content/train_data_raw /content/train

In [None]:
# Images left in /content/train, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = []
for ext in formats:
    image_file_list += glob.glob(f'/content/train/*.{ext}')

# xml label files left in /content/train
label_file_list_xml = glob.glob('/content/train/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

<a id="3"></a>
## ***Convert .xml  to .txt***

***Define function for xml to txt conversion***

In [None]:
# Per-class object counter, filled in while xml files are converted
label_count ={}

# Normalize bounding box
def convert_coordinates(size, box):
    """
    Turn a corner-style pixel box into a normalized centre-style box.

    size: (width, height) of the image in pixels
    box: (xmin, xmax, ymin, ymax) in pixels

    Returns the tuple (x, y, w, h): box centre and box width/height,
    each multiplied by 1/width (x, w) or 1/height (y, h).
    The centre is shifted by one pixel before scaling - presumably to
    account for 1-based pixel coordinates in the annotations; confirm.
    """
    scale_x = 1./(size[0])
    scale_y = 1./(size[1])
    center_x = (box[0] + box[1])/2.0 - 1
    center_y = (box[2] + box[3])/2.0 - 1
    box_w = box[1] - box[0]
    box_h = box[3] - box[2]
    return (center_x*scale_x, center_y*scale_y, box_w*scale_x, box_h*scale_y)
 
# Convert PASCAL VOC xml format to YOLO txt format
def convert_xml2yolo(filelist, lut ):
    """
    Write one YOLO .txt label file next to every given PASCAL VOC .xml file.

    filelist: list of .xml file paths to convert to .txt file
    lut: a dictionary containing class_name to class_index mapping

    Each <object> becomes one line "<class_index> <x> <y> <w> <h>" with the box
    normalized by convert_coordinates. Objects whose name is not in `lut` are
    written with class index -1 and a warning is printed. The module-level
    `label_count` dict is incremented once per object. Returns None.
    """
    for fname in filelist:
        xmldoc = minidom.parse(fname)
        # Same path with the '.xml' suffix replaced by '.txt'
        fname_out = (fname[:-4]+'.txt')

        with open(fname_out, "w") as f:
            print(f'processing{fname}')

            # Image dimensions used to normalize the boxes
            size = xmldoc.getElementsByTagName('size')[0]
            width = int((size.getElementsByTagName('width')[0]).firstChild.data)
            height = int((size.getElementsByTagName('height')[0]).firstChild.data)

            for item in xmldoc.getElementsByTagName('object'):
                # get class label
                classid = (item.getElementsByTagName('name')[0]).firstChild.data
                if classid in lut:
                    label_str = str(lut[classid])
                else:
                    label_str = "-1"
                    # Both values must be passed as one tuple to the % operator
                    print("warning: label '%s' not in look-up table for file '%s'" % (classid, fname))

                # get bbox coordinates
                bndbox = item.getElementsByTagName('bndbox')[0]
                xmin, ymin, xmax, ymax = (
                    float(bndbox.getElementsByTagName(tag)[0].firstChild.data)
                    for tag in ('xmin', 'ymin', 'xmax', 'ymax')
                )
                bb = convert_coordinates((width, height), (xmin, xmax, ymin, ymax))

                # A normalized value above 1 means the box does not fit the declared image size
                for value in bb:
                    if value > 1.0: print('Error')

                label_count[classid] = label_count.get(classid, 0) + 1

                f.write(label_str + " " + " ".join([("%.11f" % a) for a in bb]) + '\n')
        # Keep the cell output short: drop the progress line of the finished file
        clear_output()

***Convert training xml file to txt file***

In [None]:
# Gather the train images, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = sum(
    (glob.glob(f'/content/train/*.{ext}') for ext in formats),
    [],
)

# Gather the train XML label files
label_file_list_xml = glob.glob('/content/train/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

In [None]:
# Converting  train .xml file to .txt file
# Each .txt is written next to its .xml; label_count is filled as a side effect.
convert_xml2yolo(label_file_list_xml, lut)
# Collect the freshly written label files so they can be moved in a later cell
label_file_list_txt = glob.glob('/content/train/*.txt')
print(f'XML --> TXT files: {len(label_file_list_txt)}')

In [None]:
# Print number of vehicles per classes in train
# Keep the counts accumulated by the train conversion under their own name
train_label_count = label_count  # used to visualize
# Rebind the module-level counter to a fresh dict so the validation conversion starts from zero;
# convert_xml2yolo looks `label_count` up by name at call time, so it will use the new dict.
label_count = {}
train_label_count

In [None]:
# Remove xml files
# They have been converted to .txt above and are not needed in the final dataset.
%rm -r /content/train/*.xml

In [None]:
# Convert to YOLOv5 data format
%mkdir /content/train/images /content/train/labels
for file in image_file_list:
    shutil.move(file, '/content/train/images/'+os.path.split(file)[1])
for file in label_file_list_txt:
    shutil.move(file, '/content/train/labels/'+os.path.split(file)[1])

***Convert validation xml file to txt file***

In [None]:
# Gather the validation images, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = []
for ext in formats:
    matches = glob.glob(f'/content/valid/*.{ext}')
    image_file_list.extend(matches)

# Gather the validation XML label files
label_file_list_xml = glob.glob('/content/valid/*.xml')

print(f'Image files found: {len(image_file_list)} \nLabel files found: { len(label_file_list_xml)}')

In [None]:
# Converting valid .xml file to .txt file
# Each .txt is written next to its .xml; label_count is filled as a side effect.
convert_xml2yolo(label_file_list_xml, lut)
# Collect the freshly written label files so they can be moved in a later cell
label_file_list_txt = glob.glob('/content/valid/*.txt')
print(f'XML --> TXT files: {len(label_file_list_txt)}')

In [None]:
# Print number of vehicles per classes in valid
# label_count was rebound to an empty dict after the train conversion, so it now holds
# validation counts only. This is an alias of the same dict object, not a copy.
valid_label_count = label_count # used to visualize
valid_label_count

In [None]:
# Remove xml files
# They have been converted to .txt above and are not needed in the final dataset.
%rm -r /content/valid/*.xml

In [None]:
# Convert to YOLOv5 data format
%mkdir /content/valid/images /content/valid/labels
for file in image_file_list:
    shutil.move(file, '/content/valid/images/'+os.path.split(file)[1])
for file in label_file_list_txt:
    shutil.move(file, '/content/valid/labels/'+os.path.split(file)[1])

## ***Data Visualization***
Let us have a look at how often each label occurs in the dataset.

***Train data visualization***

In [None]:
# One row per class, most frequent class first
df = pd.DataFrame(
    {'labels': train_label_count.keys(), 'count': train_label_count.values()}
).sort_values(['count'], ascending=False)

# Bar chart of the object count per class, coloured by count
fig = px.bar(
    df,
    x="labels",
    y='count',
    color="count",
    orientation='v',
    title='Frequency of the Labels in Train data',
    color_continuous_scale=px.colors.sequential.Viridis_r,
)
fig.update_layout(title_x=0.5, xaxis_title='Labels', yaxis_title='Label Count')
fig.update_xaxes(tickangle=60)
fig.show()

***Valid data visualization***

In [None]:
import pandas as pd
import plotly.express as px

# One row per class, most frequent class first
df = pd.DataFrame(
    {'labels': valid_label_count.keys(), 'count': valid_label_count.values()}
).sort_values(['count'], ascending=False)

# Bar chart of the object count per class, coloured by count
fig = px.bar(
    df,
    x="labels",
    y='count',
    color="count",
    orientation='v',
    title='Frequency of the Labels in valid data',
    color_continuous_scale=px.colors.sequential.Viridis_r,
)
fig.update_layout(title_x=0.5, xaxis_title='Labels', yaxis_title='Label Count')
fig.update_xaxes(tickangle=60)
fig.show()

<a id="4"></a>
## ***Resizing all the Images***

***Define resize function***

In [None]:
def resize_images(file_list, width = 1024, height = 1024, overwrite = True, save_dir = ''):
    """
    Resize every image in `file_list` to exactly (width, height) pixels.

    file_list: list of image file paths
    width, height: target size in pixels (the aspect ratio is not preserved)
    overwrite: if True the resized image replaces the original file,
               otherwise it is saved under `save_dir` with the same file name
    save_dir: output directory, only used when `overwrite` is False

    Prints one progress line per image and clears the cell output at the end.
    Returns None.
    """
    # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter under its current name
    resample = Image.LANCZOS
    total_files = len(file_list)
    for idx, path in enumerate(file_list, start=1):
        filename = os.path.basename(path)
        # Close the source file as soon as the resized copy exists
        with Image.open(path) as img:
            original_size = img.size
            img_resized = img.resize((width, height), resample)
        if overwrite:
            img_resized.save(path)
            print(f"{idx}/{total_files}: {filename} {original_size}--> ({width}x{height})")
        else:
            # os.path.join also works when save_dir has no trailing separator
            img_resized.save(os.path.join(save_dir, filename))
            print(f'{filename} saved to {save_dir}')
    clear_output()

***Resize train images***

In [None]:
# Gather the train images from the YOLOv5 images folder, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = [
    image_path
    for ext in formats
    for image_path in glob.glob(f'/content/train/images/*.{ext}')
]

print(f'Image files found: {len(image_file_list)}')

In [None]:
# Resize the train images in place to the function's default size (1024x1024)
resize_images(image_file_list , overwrite= True)

***Resize valid images***

In [None]:
# Gather the validation images from the YOLOv5 images folder, one glob per supported extension
formats = ['jpg', 'jpeg', 'JPG', 'png', 'PNG']
image_file_list = [
    image_path
    for ext in formats
    for image_path in glob.glob(f'/content/valid/images/*.{ext}')
]

print(f'Image files found: {len(image_file_list)}')

In [None]:
# Resize the validation images in place to the function's default size (1024x1024)
resize_images(image_file_list , overwrite= True)

## ***Adding blur and dark augmented images to train***

> 537 images are selected by random shuffling from train data released by organizer and blur and dark augmentation is applied to them using roboflow platform



In [None]:
%cd /content/
!gdown --id 19b_nmsDZrJAWaL0AArN5_lQ0ftYWOL3m
!unzip roboflow.zip; rm roboflow.zip;
clear_output()

# Move images to trian
files = glob.glob(f'/content/roboflow/images/*')
for file in files:
    shutil.move(file, '/content/train/images/')

# Move labels to train
files = glob.glob(f'/content/roboflow/labels/*')
for file in files:
    shutil.move(file, '/content/train/labels/')

# Remove roboflow folder
!rm -r /content/roboflow

## ***Finally copy valid data to train***

> To increase train data



In [None]:
# Copy (not move) the validation images, then the labels, into the train folders;
# the validation folders stay intact.
for subfolder in ('images', 'labels'):
    for entry in glob.glob(f'/content/valid/{subfolder}/*'):
        shutil.copy(entry, f'/content/train/{subfolder}/')

## ***Dataset statistics***

In [None]:
# Final file lists of both splits
train_imgs = glob.glob('/content/train/images/*')
train_labels = glob.glob('/content/train/labels/*')
valid_imgs = glob.glob('/content/valid/images/*')
valid_labels = glob.glob('/content/valid/labels/*')

# Report how many files each folder holds
for caption, paths in (('Train images: ', train_imgs),
                       ('Train labesl: ', train_labels),
                       ('Valid images: ', valid_imgs),
                       ('Valid labels: ', valid_labels)):
    print(caption + str(len(paths)))

<a id="6"></a>
## ***Creating data.yaml file***

In [None]:
#Customize IPython write file so we can write variables
from IPython.core.magic import register_line_cell_magic

@register_line_cell_magic
def writetemplate(line, cell):
  """
  Magic that writes the cell body to a file after filling in notebook variables.

  line: text after the magic name, used as the output file path
  cell: cell body, used as a str.format template

  Every {name} placeholder in the body is replaced with the value of the
  global variable of that name. The target file is overwritten.
  """
  with open(line, 'w') as f:
    # str.format treats every brace pair as a placeholder, so literal braces in the body must be doubled
    f.write(cell.format(**globals()))

In [None]:
# Class names in look-up-table order (dicts keep insertion order)
names = list(lut.keys())
names

In [None]:
%%writetemplate /content/data.yaml

train: ../train/images
val: ../valid/images

nc: 21
names: {names}

<a id="7"></a>
## ***Saving the Processed Dataset***

In [None]:
# Zip all to dataset.zip
# Run from /content/ so the archive stores the relative paths train/, valid/ and data.yaml
%cd /content/
!zip -r dataset.zip train valid data.yaml
# Hide the long per-file zip log from the cell output.
clear_output()

In [None]:
# Mount google drive
# Mounted at /content/drive, which the copy in the next cell writes to.
drive.mount('/content/drive')

In [None]:
# Copy dataset.zip to drive
!cp dataset.zip '/content/drive/My Drive/Colab Notebooks/dataset/'