# The purpose of this script is to generate the train and test datasets (with augmentation), as well as their labels in CSV files

In [9]:
# -*- coding: utf-8 -*-
"""
Created on Mon Nov  4 23:20:16 2019

@author: Tomas Latysovic
"""
from data_aug.data_aug import *
from data_aug.bbox_util import *
import numpy as np 
import matplotlib.pyplot as plt 
import pickle as pkl
import glob
from PIL import Image
import os
import pandas as pd
import cv2
import random
from sklearn.model_selection import GroupShuffleSplit
# Turn off pandas' chained-assignment warning for the whole notebook (pandas default is 'warn')
pd.set_option("mode.chained_assignment", None)

# model_code: name of the output sub-folder under dataset/
# img_ext:    file extension used for every exported image
model_code, img_ext = "model", ".jpg"
%matplotlib inline

# Numeric class id -> human-readable sign name (model1).
# The position of each name in the tuple below IS its class id (0..42).
d = dict(enumerate((
    "speed limit 20",
    "speed limit 30",
    "speed limit 50",
    "speed limit 60",
    "speed limit 70",
    "speed limit 80",
    "restriction ends 80",
    "speed limit 100",
    "speed limit 120",
    "no overtaking",
    "no overtaking trucks",
    "priority at next intersection",
    "priority road",
    "give way",
    "stop",
    "no traffic both ways",
    "no trucks",
    "no entry",
    "danger",
    "bend left",
    "bend right",
    "bend",
    "uneven road",
    "slippery road",
    "road narrows",
    "construction",
    "traffic signal",
    "pedestrian crossing",
    "school crossing",
    "cycles crossing",
    "snow",
    "animals",
    "restriction ends",
    "go right",
    "go left",
    "go straight",
    "go right or straight",
    "go left or straight",
    "keep right",
    "keep left",
    "roundabout",
    "restriction ends overtaking",
    "restriction ends overtaking trucks",
)))


# Numeric class id -> one of the four sign super-categories.
# Written as "category -> member ids" so each group can be read at a glance,
# then flattened into an id-keyed dict in ascending id order.
_ids_by_category = {
    "prohibitory": (0, 1, 2, 3, 4, 5, 7, 8, 9, 10, 15, 16),
    "mandatory": tuple(range(33, 41)),
    "danger": (11,) + tuple(range(18, 32)),
    "other": (6, 12, 13, 14, 17, 32, 41, 42),
}
class_cat = dict(sorted(
    (class_id, category)
    for category, class_ids in _ids_by_category.items()
    for class_id in class_ids
))

# Super-category name -> integer id of that category
cat_class = dict(prohibitory=0, mandatory=1, danger=2, other=3)


# Reverse lookups (model1):
#   inv_d                    sign name   -> numeric class id
#   class_to_four_categories category id -> category name
inv_d = dict(zip(d.values(), d.keys()))
class_to_four_categories = dict(zip(cat_class.values(), cat_class.keys()))

In [21]:
# Make sure the per-model output folder exists. makedirs(exist_ok=True) also
# creates missing parents and has no check-then-create race.
os.makedirs("dataset/" + model_code, exist_ok=True)

# Read the ground-truth annotations: one sign per line, fields separated by ';'.
mainFolder = "dataset/FullIJCNN2013/"
with open(mainFolder + "gt.txt", 'r') as f:
    # Skip blank lines (e.g. a trailing empty line); otherwise they become rows
    # of None values that break the later integer cast of the class column.
    file = [line.strip().split(';') for line in f if line.strip()]
df = pd.DataFrame(file)

# Build the label table: add the image size, reorder, and give columns names.
df_copy = df.copy()
# Image size is hard-coded here — assumes every source image is 1360x800; TODO confirm
df_copy["height"] = 800
df_copy["width"] = 1360
cols = df_copy.columns.tolist()
print(cols)
df_copy = df_copy[[0, "width", 'height', 5, 1, 2, 3, 4]]
# Column 0 is the image file, 1-4 the box corners and 5 the class id.
# All annotation values are still strings at this point.
df_copy = df_copy.rename({0: 'filename',
                          5: 'class', 1: 'xmin', 2: 'ymin', 3: 'xmax', 4: 'ymax'}, axis=1)
print(df_copy.head(10))







[0, 1, 2, 3, 4, 5, 'height', 'width']
    filename  width  height class xmin ymin  xmax ymax
0  00000.ppm   1360     800    11  774  411   815  446
1  00001.ppm   1360     800    40  983  388  1024  432
2  00001.ppm   1360     800    38  386  494   442  552
3  00001.ppm   1360     800    13  973  335  1031  390
4  00002.ppm   1360     800    39  892  476  1006  592
5  00003.ppm   1360     800     4  742  443   765  466
6  00003.ppm   1360     800     9  742  466   764  489
7  00003.ppm   1360     800    21  737  412   769  443
8  00004.ppm   1360     800    21  898  342   967  409
9  00004.ppm   1360     800     2  906  407   955  459


In [68]:
# Total number of annotated signs, i.e. rows that carry a class label.
# (This is a single count, not the per-class distribution.)
df_copy["class"].count()

1213

In [41]:
# Create every output folder the later cells write into.
# makedirs(exist_ok=True) replaces the four copy-pasted "isdir / mkdir" guards:
# it is a no-op when the folder exists and also creates missing parents, so this
# cell no longer depends on dataset/<model_code> having been created earlier.
for subfolder in ("train", "test", "train_aug", "negative_images"):
    os.makedirs("dataset/" + model_code + "/" + subfolder, exist_ok=True)

In [45]:
# Export "negative" images: source images that have no row in the annotation
# table, i.e. images without any labelled sign.
u = df[0].unique().tolist()
file_list = sorted(glob.glob("dataset/FullIJCNN2013/*.ppm"))
# os.path.basename works with the host's path separator; the previous
# split("\\")[1] raised IndexError wherever glob returns '/'-separated paths.
filenames = [os.path.basename(path) for path in file_list]
negative_images = set(filenames) - set(u)

# Sorted so the export order is the same on every run (set order is arbitrary).
for file_name in sorted(negative_images):
    name = file_name.split(".")[0]
    src_path = "dataset/" + "FullIJCNN2013/" + file_name
    dst_path = "dataset/" + model_code + "/negative_images/" + name + img_ext

    img = cv2.imread(src_path)
    if img is None:
        # cv2.imread returns None instead of raising when it cannot read a file
        raise FileNotFoundError("Could not read image: " + src_path)

    # The image is written as loaded: converting BGR->RGB and straight back
    # to BGR (as done before) is an identity operation.
    if not cv2.imwrite(dst_path, img):
        raise OSError("Could not write image: " + dst_path)
    

In [71]:
# Search for a random seed whose 70/30 split, grouped by image file, puts every
# class into BOTH the train and the test set.

# Guard: a class that occurs in a single image can only ever land on one side
# of a file-grouped split, so the search below would never terminate.
files_per_class = df_copy.groupby("class")["filename"].nunique()
single_file_classes = files_per_class[files_per_class < 2].index.tolist()
if single_file_classes:
    raise ValueError(
        "Classes present in fewer than two images cannot appear in both "
        "train and test: " + str(single_file_classes)
    )

not_empty = True
seed = 0
while not_empty:
    # All annotations of one image stay together on the same side of the split.
    splitter = GroupShuffleSplit(test_size=0.3, n_splits=1, random_state=seed)
    train_inds, test_inds = next(splitter.split(df_copy, groups=df_copy['filename']))
    train = df_copy.iloc[train_inds]
    test = df_copy.iloc[test_inds]

    # Image files belonging to each side
    train_list_names = train["filename"].unique().tolist()
    test_list_names = test["filename"].unique().tolist()

    # Explicit copies, so the class cast below writes to frames we own rather
    # than to slices of df_copy (no reliance on the silenced pandas warning).
    df_test = df_copy.loc[df_copy["filename"].isin(test_list_names)].copy()
    df_train = df_copy.loc[df_copy["filename"].isin(train_list_names)].copy()
    df_test["class"] = df_test["class"].astype(int)
    df_train["class"] = df_train["class"].astype(int)

    # Classes that exist on one side only
    dataset_dif = set(df_train["class"]) - set(df_test["class"])
    dataset_dif_from_test = set(df_test["class"]) - set(df_train["class"])
    print("train-test ", dataset_dif, len(dataset_dif))
    print("test-train ", dataset_dif_from_test, len(dataset_dif_from_test))

    if (len(dataset_dif) == 0) and (len(dataset_dif_from_test) == 0):
        # Both sides cover the same classes: keep this split
        not_empty = False
    else:
        seed += 1

sum os signs:  1213
train-test  {19, 31} 2
test-train  set() 0
sum os signs:  1213
train-test  {41, 27, 39, 31} 4
test-train  {19} 1
sum os signs:  1213
train-test  {0, 31} 2
test-train  set() 0
sum os signs:  1213
train-test  {27, 37, 31} 3
test-train  {19} 1
sum os signs:  1213
train-test  {0, 27, 37} 3
test-train  set() 0
sum os signs:  1213
train-test  {0, 32, 39, 19, 29} 5
test-train  set() 0
sum os signs:  1213
train-test  {16, 20, 37, 31} 4
test-train  set() 0
sum os signs:  1213
train-test  {32, 19, 37, 31} 4
test-train  set() 0
sum os signs:  1213
train-test  {16, 36, 39, 31} 4
test-train  set() 0
sum os signs:  1213
train-test  {19, 37, 31} 3
test-train  set() 0
sum os signs:  1213
train-test  {0, 19, 20, 37} 4
test-train  set() 0
sum os signs:  1213
train-test  {31} 1
test-train  set() 0
sum os signs:  1213
train-test  {19, 37} 2
test-train  set() 0
sum os signs:  1213
train-test  {32, 37, 19, 29} 4
test-train  set() 0
sum os signs:  1213
train-test  {29, 21} 2
test-train  {

sum os signs:  1213
train-test  {24, 19, 37} 3
test-train  set() 0
sum os signs:  1213
train-test  {24, 37, 31} 3
test-train  {19} 1
sum os signs:  1213
train-test  {40, 19, 20, 37} 4
test-train  {31} 1
sum os signs:  1213
train-test  {37, 29, 15} 3
test-train  {19} 1
sum os signs:  1213
train-test  {37, 39, 19, 21, 27} 5
test-train  set() 0
sum os signs:  1213
train-test  {40, 41, 19} 3
test-train  {37} 1
sum os signs:  1213
train-test  set() 0
test-train  {19} 1
sum os signs:  1213
train-test  {0, 19, 39, 31} 4
test-train  set() 0
sum os signs:  1213
train-test  {42, 19, 37, 31} 4
test-train  set() 0
sum os signs:  1213
train-test  {16, 41, 32, 22} 4
test-train  {37} 1
sum os signs:  1213
train-test  {37, 15} 2
test-train  {31} 1
sum os signs:  1213
train-test  {32, 42, 19, 21, 31} 5
test-train  {37} 1
sum os signs:  1213
train-test  {37, 22, 39} 3
test-train  set() 0
sum os signs:  1213
train-test  {0, 27, 37, 39} 4
test-train  set() 0
sum os signs:  1213
train-test  {39, 19, 21, 31

In [101]:
# Column dtypes of the test split, a blank separator line, then the train split
print(f"{df_test.dtypes} \n")
print(f"{df_train.dtypes}")

filename    object
width        int64
height       int64
class        int32
xmin        object
ymin        object
xmax        object
ymax        object
dtype: object 

filename    object
width        int64
height       int64
class        int32
xmin        object
ymin        object
xmax        object
ymax        object
dtype: object


0       00000.ppm
1       00001.ppm
2       00001.ppm
3       00001.ppm
5       00003.ppm
          ...    
1207    00895.ppm
1208    00896.ppm
1209    00897.ppm
1210    00898.ppm
1211    00898.ppm
Name: filename, Length: 856, dtype: object

In [78]:
import imgaug as ia
import imgaug.augmenters as iaa
from imgaug.augmentables.bbs import BoundingBox, BoundingBoxesOnImage
# Fix imgaug's random state
ia.seed(1)

# Colour triples
GREEN, ORANGE, RED = [0, 255, 0], [255, 140, 0], [255, 0, 0]

# Empty label tables sharing the column layout of the annotation table:
# one for the plain train images, one for test, one for train + augmented images.
cols = list(df_copy.columns)
df_train_labels, df_test_labels, df_train_labels_aug = (
    pd.DataFrame(columns=cols) for _ in range(3)
)
print(cols, model_code, sep="\n")

['filename', 'width', 'height', 'class', 'xmin', 'ymin', 'xmax', 'ymax']
model3


In [89]:
# Export the test/train images and their label rows, plus one augmented copy
# ("<name>a") of every train image.
#
# Changes compared with the previous version of this cell:
#   * label rows are collected in lists and turned into DataFrames once;
#     DataFrame.append no longer exists in pandas >= 2.0 and was quadratic;
#   * re-running the cell rebuilds the label tables instead of appending
#     duplicate rows to them;
#   * box coordinates and class ids are always written as integers (before,
#     the non-resized rows were floats and the augmented rows were ints);
#   * augmentation retries are bounded and only catch Exception, so the loop
#     cannot spin forever or swallow a keyboard interrupt;
#   * unreadable / unwritable images raise immediately.
split = True
isresize = False

# Upper bound on augmentation attempts per image before giving up
_MAX_AUG_ATTEMPTS = 1000
_SRC_DIR = "dataset/" + "FullIJCNN2013/"
_OUT_DIR = "dataset/" + model_code + "/"


def _read_rgb(file_name):
    """Load one source image and return it with channels in RGB order."""
    img = cv2.imread(_SRC_DIR + file_name)
    if img is None:
        # cv2.imread returns None instead of raising when it cannot read a file
        raise FileNotFoundError("Could not read image: " + _SRC_DIR + file_name)
    return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)


def _write_rgb(path, img_rgb):
    """Write an RGB image to `path`, converting back to OpenCV's BGR order."""
    if not cv2.imwrite(path, cv2.cvtColor(img_rgb, cv2.COLOR_RGB2BGR)):
        raise OSError("Could not write image: " + path)


def _boxes_for(labels, file_name):
    """Return the rows of `labels` for one image as a float array
    with columns (xmin, ymin, xmax, ymax, class)."""
    box_columns = ["xmin", "ymin", "xmax", "ymax", "class"]
    return labels.loc[labels["filename"] == file_name, box_columns].astype(float).values


def _prepare_original(img_rgb, boxes, resize):
    """Return copies of the un-augmented image and boxes (resized to 300 when
    `resize` is true) with the boxes rounded to integers."""
    if resize:
        img_out, boxes_out = Resize(300)(img_rgb.copy(), boxes.copy())
    else:
        img_out, boxes_out = img_rgb.copy(), boxes.copy()
    return img_out, np.round(boxes_out, 0).astype(int)


def _label_rows(out_name, img, boxes):
    """Build one label record per box for the image saved as `out_name`."""
    height, width = img.shape[:2]
    return [{'filename': out_name, 'width': width, 'height': height,
             'class': int(bb[-1]), 'xmin': int(bb[0]), 'ymin': int(bb[1]),
             'xmax': int(bb[2]), 'ymax': int(bb[3])} for bb in boxes]


def _geometric_sequence():
    """Sequence 1: colour jitter, scale, translate, rotate and shear
    (followed by a resize to 300 when `isresize` is set)."""
    steps = [RandomHSV(hue = 30, saturation = 30, brightness = 20),
             RandomScale(0.15, diff = False), RandomTranslate(0.15, diff = False),
             RandomRotate(angle = 5), RandomShear(shear_factor = 0.15)]
    if isresize:
        steps.append(Resize(300))
    return Sequence(steps)


def _crop_sequence():
    """Sequence 2: colour jitter, expand, crop, then resize to 300."""
    return Sequence([RandomHSV(hue=30, saturation=30, brightness=25),
                     RandomExpand(0.5), RandomCrop(1), Resize(300)])


def _augment(build_sequence, img_rgb, boxes):
    """Apply a freshly built augmentation sequence, retrying on failure.

    The transforms are rebuilt on every attempt, as before. Raises
    RuntimeError (chained to the last failure) after _MAX_AUG_ATTEMPTS tries.
    """
    last_error = None
    for _ in range(_MAX_AUG_ATTEMPTS):
        try:
            img_out, boxes_out = build_sequence()(img_rgb.copy(), boxes.copy())
        except Exception as exc:
            last_error = exc
            continue
        return img_out, np.round(boxes_out, 0).astype(int)
    raise RuntimeError(
        "Augmentation failed %d times in a row" % _MAX_AUG_ATTEMPTS
    ) from last_error


if split:
    print("Splitting actual images...")

    # Seed the stdlib and NumPy generators so re-running the cell reproduces
    # the same augmented images.
    # NOTE(review): presumably the data_aug transforms draw from these — confirm.
    random.seed(1)
    np.random.seed(1)

    test_rows, train_rows, train_aug_rows = [], [], []

    for file_name in test_list_names:
        name = file_name.split(".")[0]
        print("Name in test set", name)

        img_rgb = _read_rgb(file_name)
        bboxes = _boxes_for(df_test, file_name)

        img_, bboxes_ = _prepare_original(img_rgb, bboxes, isresize)
        test_rows.extend(_label_rows(name + img_ext, img_, bboxes_))
        _write_rgb(_OUT_DIR + "test/" + name + img_ext, img_)

    for file_name in train_list_names:
        name = file_name.split(".")[0]
        print("Name in train set", name)

        img_rgb = _read_rgb(file_name)
        bboxes = _boxes_for(df_train, file_name)

        # The original image goes into both train/ and train_aug/
        img_, bboxes_ = _prepare_original(img_rgb, bboxes, isresize)
        train_rows.extend(_label_rows(name + img_ext, img_, bboxes_))
        train_aug_rows.extend(_label_rows(name + img_ext, img_, bboxes_))
        _write_rgb(_OUT_DIR + "train/" + name + img_ext, img_)
        _write_rgb(_OUT_DIR + "train_aug/" + name + img_ext, img_)

        # Sequence 1 -> "<name>a" in train_aug/
        img_, bboxes_ = _augment(_geometric_sequence, img_rgb, bboxes)
        train_aug_rows.extend(_label_rows(name + "a" + img_ext, img_, bboxes_))
        _write_rgb(_OUT_DIR + "train_aug/" + name + "a" + img_ext, img_)

        if isresize:
            # Sequence 2 -> "<name>b"
            # NOTE(review): this augmented variant is stored in train/ (not
            # train_aug/) and labelled in df_train_labels, exactly as before —
            # confirm that this is intended.
            img_, bboxes_ = _augment(_crop_sequence, img_rgb, bboxes)
            train_rows.extend(_label_rows(name + "b" + img_ext, img_, bboxes_))
            _write_rgb(_OUT_DIR + "train/" + name + "b" + img_ext, img_)

    # Build each label table in one go, keeping its existing column order
    df_test_labels = pd.DataFrame(test_rows, columns=df_test_labels.columns)
    df_train_labels = pd.DataFrame(train_rows, columns=df_train_labels.columns)
    df_train_labels_aug = pd.DataFrame(train_aug_rows, columns=df_train_labels_aug.columns)

        
        


        


    
        


Splitting actual images...
Name in test set 00002
Name in test set 00010
Name in test set 00016
Name in test set 00017
Name in test set 00018
Name in test set 00019
Name in test set 00020
Name in test set 00021
Name in test set 00025
Name in test set 00026
Name in test set 00027
Name in test set 00031
Name in test set 00034
Name in test set 00043
Name in test set 00049
Name in test set 00057
Name in test set 00059
Name in test set 00063
Name in test set 00067
Name in test set 00069
Name in test set 00075
Name in test set 00076
Name in test set 00077
Name in test set 00079
Name in test set 00080
Name in test set 00085
Name in test set 00090
Name in test set 00092
Name in test set 00096
Name in test set 00102
Name in test set 00103
Name in test set 00104
Name in test set 00105
Name in test set 00106
Name in test set 00111
Name in test set 00116
Name in test set 00117
Name in test set 00125
Name in test set 00129
Name in test set 00130
Name in test set 00133
Name in test set 00137
Name in

Name in train set 00191
Name in train set 00192
Name in train set 00194
Name in train set 00195
Name in train set 00198
Name in train set 00199
Name in train set 00200
Name in train set 00201
Name in train set 00203
Name in train set 00204
Name in train set 00205
Name in train set 00206
Name in train set 00207
Name in train set 00208
Name in train set 00209
Name in train set 00210
Name in train set 00211
Name in train set 00212
Name in train set 00214
Name in train set 00216
Name in train set 00218
Name in train set 00219
Name in train set 00220
Name in train set 00222
Name in train set 00223
Name in train set 00225
Name in train set 00226
Name in train set 00227
Name in train set 00228
Name in train set 00229
Name in train set 00230
Name in train set 00232
Name in train set 00238
Name in train set 00240
Name in train set 00241
Name in train set 00242
Name in train set 00243
Name in train set 00245
Name in train set 00246
Name in train set 00247
Name in train set 00248
Name in train se

Name in train set 00801
Name in train set 00805
Name in train set 00806
Name in train set 00808
Name in train set 00809
Name in train set 00810
Name in train set 00811
Name in train set 00813
Name in train set 00816
Name in train set 00818
Name in train set 00820
Name in train set 00821
Name in train set 00822
Name in train set 00823
Name in train set 00828
Name in train set 00829
Name in train set 00833
Name in train set 00834
Name in train set 00836
Name in train set 00837
Name in train set 00841
Name in train set 00844
Name in train set 00845
Name in train set 00850
Name in train set 00852
Name in train set 00853
Name in train set 00854
Name in train set 00858
Name in train set 00860
Name in train set 00862
Name in train set 00863
Name in train set 00864
Name in train set 00865
Name in train set 00866
Name in train set 00867
Name in train set 00870
Name in train set 00872
Name in train set 00879
Name in train set 00881
Name in train set 00884
Name in train set 00886
Name in train se

In [153]:
# Replace the numeric class ids by their super-category name in all three
# label tables. (Mapping with `d` instead of `class_cat` would give the
# individual sign names.)
df_test_labels, df_train_labels, df_train_labels_aug = (
    labels.replace({"class": class_cat})
    for labels in (df_test_labels, df_train_labels, df_train_labels_aug)
)

# Write one CSV per table next to the exported image folders
for labels, csv_name in (
    (df_test_labels, "test_labels.csv"),
    (df_train_labels, "train_labels.csv"),
    (df_train_labels_aug, "train_labels_aug.csv"),
):
    labels.to_csv("dataset/" + model_code + "/" + csv_name, sep=',', encoding='utf-8', index=False)
print("Successfully created csv label files")

# Check if coordinates right

Successfully created csv label files


In [158]:
# Number of label rows per sign category in the augmented training set
df_train_labels_aug.loc[:, "class"].value_counts()

prohibitory    790
other          370
danger         292
mandatory      240
Name: class, dtype: int64