# Notebook Configuration

In [1]:
import os
import sys
import shutil
import re
import pandas as pd
import numpy as np

from collections import Counter
from sklearn.model_selection import train_test_split

# NOTE(review): hardcoded absolute path tied to a single machine; presumably the
# project root that contains the `libs` package imported below. Consider deriving
# it from a configurable location so the notebook runs elsewhere.
sys.path.append('/home/rodari78/tesis/CNN_bike_lanes/')
# NOTE(review): wildcard import hides where helper names come from. The helpers
# called later in this notebook (e.g. get_group_name, copy_img_files) are
# presumably defined in this module; TODO confirm and import them explicitly.
from libs.YOLO.preprocessing import *

Declare paths and constants

In [7]:
# data version tag, left-padded with zeros to three characters (11 -> '011')
PRM_VERSION = str(11).zfill(3)

# CSV file that receives the image-to-subset assignment table
PRM_DATASET_NAME = f'datasets_track/subsets_v{PRM_VERSION}.csv'

# input locations
PRM_PATH_IN_ROOT = 'labelstudio_output'
PRM_PATH_IN_IMG = 'datasets_raw/lima_bike_lanes'

# output roots: the complete dataset and the train/val/test partitioned copy
PRM_PATH_OUT_ROOT = 'datasets/lima_bike_lanes'
PRM_PATH_OUT_ROOT_SUBSET = 'datasets/lima_bike_lanes_subsets'

# image / label folders below each output root
PRM_PATH_OUT_IMG_SUB = f'{PRM_PATH_OUT_ROOT_SUBSET}/images'
PRM_PATH_OUT_LBL_SUB = f'{PRM_PATH_OUT_ROOT_SUBSET}/labels'
PRM_PATH_OUT_IMG = f'{PRM_PATH_OUT_ROOT}/images'
PRM_PATH_OUT_LBL = f'{PRM_PATH_OUT_ROOT}/labels'

# partition names
subsets = ['train', 'val', 'test']

# the partitioned copy gets one folder per subset for images and for labels
for subset_root in (PRM_PATH_OUT_IMG_SUB, PRM_PATH_OUT_LBL_SUB):
    for subset in subsets:
        os.makedirs(os.path.join(subset_root, subset), exist_ok = True)

# the complete dataset only uses a 'test' folder; from here on the two output
# paths point directly at it
PRM_PATH_OUT_IMG = os.path.join(PRM_PATH_OUT_IMG, 'test')
PRM_PATH_OUT_LBL = os.path.join(PRM_PATH_OUT_LBL, 'test')
os.makedirs(PRM_PATH_OUT_IMG, exist_ok = True)
os.makedirs(PRM_PATH_OUT_LBL, exist_ok = True)

# select groups
# PRM_GROUPS = ['0-499',
#               '500-999',
#               '2000-2999',              
#               '4000-4499',
#               '4500-4999',
#               '5000-5499',
#               '5500-5999',
#               '6000-6499',
#               '6500-6999',
#               '7000-7499',
#               '7500-7999',
#               '8000-8499',                     
#               '10000-10499',
#               '11000-11499',
#               '11500-11999',
#               '12000-12499',
#               '12500-12999',
#               '13000-13499',
#               '13500-13999',
#               '14000-14499',
#               '15500-15999',
#               '26500-26999']

def _group_range(dir_name):
    """Return the 'start-end' id range encoded in a group directory name.

    The first two runs of digits found in ``dir_name`` are taken as the start
    and end ids; leading zeros are dropped by the ``int`` conversion.

    Raises:
        ValueError: if ``dir_name`` contains fewer than two numbers, so a stray
            directory is reported by name instead of failing with a bare
            IndexError.
    """
    numbers = re.findall('[0-9]+', dir_name)
    if len(numbers) < 2:
        raise ValueError(f"cannot read an id range from group directory '{dir_name}'")
    return f"{int(numbers[0])}-{int(numbers[1])}"


# group directories: entries of the input root whose name contains 'grupo'
# (case-insensitive), in lexicographic order
image_groups = sorted(col for col in os.listdir(PRM_PATH_IN_ROOT) if 'grupo' in col.lower())
# one 'start-end' label per group directory, in the same order as image_groups
PRM_GROUPS = [_group_range(dir_name) for dir_name in image_groups]
# zfill parameter (presumably the zero-padding width used by the preprocessing
# helpers; not referenced in the visible cells, TODO confirm)
PRM_ZFILL = 6

# name fragments handed to the clean-up helpers to identify files to remove
PRM_FILES_TO_REMOVE = ['Zone.Identifier', '.cache']

Delete unwanted files (e.g. `Zone.Identifier`, `.cache`) and copy each group's images and labels to the collector folders

In [5]:
# For every group label: run the clean-up helpers, then copy that group's
# images and label files into the output image / label folders.
for group in PRM_GROUPS:
    # clean-up scoped to this group under the input root; receives the list of
    # name fragments in PRM_FILES_TO_REMOVE
    clean_nasty_files_from_group(PRM_PATH_IN_ROOT, group, PRM_FILES_TO_REMOVE)
    # NOTE(review): none of these arguments depend on `group`, so this call is
    # repeated unchanged on every iteration; it could presumably run once
    # outside the loop. Confirm against the helper before moving it.
    delete_nasty_files(PRM_PATH_IN_IMG, PRM_FILES_TO_REMOVE)
    # copy from the raw image folder to the output image folder
    copy_img_files(group, PRM_PATH_IN_IMG, PRM_PATH_OUT_IMG)
    # copy from the input root to the output label folder
    copy_txt_files(PRM_PATH_IN_ROOT, group, PRM_PATH_OUT_LBL)

# Create subsets

Get labels per image

In [8]:
# Collect the label table of every group and stack them into one dataframe.
# To process only some groups, replace PRM_GROUPS below with an explicit list
# of labels, e.g. ['0-499', '500-999', '4000-4499'].
img_ids = PRM_GROUPS

# Gather the per-group frames first and concatenate once at the end: growing a
# dataframe with pd.concat inside the loop re-copies every accumulated row on
# each iteration.
group_frames = []
for group in img_ids:
    print(group)
    # only the third value returned by the helper is needed: the group's
    # directory name under the input root
    _, _, group_name = get_group_name(group)
    group_path = os.path.join(PRM_PATH_IN_ROOT, group_name)
    image = retrieve_labels_from_group(group_path)
    group_frames.append(image)

# one row per image across all groups, re-indexed from 0; stays an empty
# dataframe when no group was processed (pd.concat rejects an empty list)
images = (pd.concat(group_frames, axis = 0, ignore_index = True)
          if group_frames else pd.DataFrame())

0-499
500-999
1000-1999
2000-2999
3000-3999
4000-4499
4500-4999
5000-5499
5500-5999
6000-6499
6500-6999
7000-7499
7500-7999
8000-8499
8500-8999
9000-9499
9500-9999
10000-10499
10500-10999
11000-11499
11500-11999
12000-12499
12500-12999
13000-13499
13500-13999
14000-14499
14500-14999
15000-15499
15500-15999
26500-26999


Stratify partitions

In [9]:
# Split parameters, named so the resulting proportions are easy to audit:
# 70% train+val / 30% test, then 80% / 20% of train+val -> 56% / 14% / 30%.
class_columns = ['D00', 'D10', 'D20', 'D40']
train_val_fraction = 0.70
train_fraction = 0.80
split_seed = 123

# create column to stratify: 'S' followed by one presence flag (0/1) per class
# column, e.g. 'S1101' for an image with D00, D10 and D40 labels but no D20
strat_key = 'S'
for class_column in class_columns:
    strat_key = strat_key + (images[class_column] > 0).astype(int).astype(str)
images['strat'] = strat_key

# split in subsets, keeping the mix of label combinations equal across them
train_val, test = train_test_split(images,
                                   train_size = train_val_fraction,
                                   stratify = images['strat'],
                                   shuffle = True,
                                   random_state = split_seed)

train, val = train_test_split(train_val,
                              train_size = train_fraction,
                              stratify = train_val['strat'],
                              shuffle = True,
                              random_state = split_seed)

# concat images with subset
images_set = pd.concat([train.assign(subset = 'train'),
                        val.assign(subset = 'val'),
                        test.assign(subset = 'test')], axis = 0, ignore_index = True)

# sanity check: the three subsets together must cover every image exactly once
assert len(images_set) == len(images), 'subset sizes do not add up to the number of images'

# flag images without any label: 1 when every class flag is 0, otherwise 0
images_set.insert(loc = 1,
                  column = 'DNL',
                  value = np.where(images_set['strat'] == 'S' + '0' * len(class_columns), 1, 0))

# save images set; to_csv does not create missing parent folders, so make sure
# the target directory exists first
dataset_dir = os.path.dirname(PRM_DATASET_NAME)
if dataset_dir:
    os.makedirs(dataset_dir, exist_ok = True)
images_set.to_csv(PRM_DATASET_NAME, index = False)

In [10]:
# display the assembled partition table (one row per image with its label
# columns, stratification key and assigned subset)
images_set

Unnamed: 0,image,DNL,D00,D10,D20,D40,strat,subset
0,15603,1,0,0,0,0,S0000,train
1,10360,1,0,0,0,0,S0000,train
2,7416,0,1,0,0,0,S1000,train
3,12031,1,0,0,0,0,S0000,train
4,1026,0,2,2,0,6,S1101,train
...,...,...,...,...,...,...,...,...
15186,15318,0,3,0,0,0,S1000,test
15187,14803,0,1,2,0,0,S1100,test
15188,3081,0,1,1,0,0,S1100,test
15189,13503,0,0,1,0,0,S0100,test
