In [55]:
# The point of this program is to take the "merged_dataset" data and labels,
# split it into 80% training/10% validation/10% test 
# and create ground truth files for each of these 
# and then to output these into the "split_dataset" directory
#
from random import shuffle
import imageio as io
import os
import re


In [56]:
# First, we are going to load up the names of all of the files into an array
# We are also going to read in the entire groundtruth file to an array of arrays representing
# each line
#
merged_dir = './merged_dataset/data/'

# List the image files and order them numerically by the first run of digits
# in each file name, so that position i lines up with row i of the ground truth.
in_files = sorted(
    os.listdir(merged_dir),
    key=lambda name: int(re.findall(r'\d+', name)[0]),
)

# Read the ground truth: one list of tokens per line. The split is on single
# spaces only, so the line break stays attached to (or forms) the last token;
# the export step relies on that when it joins the tokens back together.
# The `with` block closes the file handle as soon as reading is done.
with open('./merged_dataset/gt16490.txt', 'r') as in_gt:
    original_gt = [line.split(' ') for line in in_gt]

print('SHOULD BE THE SAME: ', len(in_files), len(original_gt))

SHOULD BE THE SAME:  16490 16490


In [57]:
# Pair every file name with the ground-truth row found at the same position,
# so each record carries both the image name and its labels.
total_data = [
    {'file': name, 'gt': original_gt[pos]}
    for pos, name in enumerate(in_files)
]

# Consistency check -- SHOULD OUTPUT NOTHING!!!
# The index stored in the first ground-truth token has to equal the first
# number found in the file name.
for entry in total_data:
    gt_index = int(entry['gt'][0])
    file_index = int(re.findall(r'\d+', entry['file'])[0])
    if gt_index != file_index:
        print('FAILURE')

In [86]:
# Now we find the quantities to be included in the training/validation/test split.
# Then we completely stochastically select MUTUALLY EXCLUSIVE SUBSETS of items from this list for each of
# the sets
#    - Shuffle the list completely randomly
#    - First 80% go to training, the next 10% to validation, everything left over to test
#
# NOTE(review): no random seed is set, so every run produces a different split.
train_split = .8
val_split = .1
test_split = .1  # nominal share only: the test set receives whatever remains (see below)

total_images = len(total_data)
train_split_count = round(total_images * train_split)
validation_split_count = round(total_images * val_split)
# Rounding the three shares independently does not guarantee that they add up
# to total_images (round() also rounds halves to the nearest even number), which
# could silently drop items. The test set therefore takes the remainder.
test_split_count = max(0, total_images - train_split_count - validation_split_count)

shuffle(total_data)
train_end = train_split_count
validation_end = train_end + validation_split_count
train_data = total_data[:train_end]
validation_data = total_data[train_end:validation_end]
test_data = total_data[validation_end:]

# Every item must land in exactly one of the three subsets.
assert len(train_data) + len(validation_data) + len(test_data) == total_images


# Quick visual check of the subset sizes and of the shuffling
print('Just some rudimentary tests that the datasets are the correct size and sufficiently scrambled')
print('============================================================================================= \n')
print(len(train_data))
for item in train_data[:3]:
    print(item)

print('\n' + str(len(validation_data)))
for item in validation_data[:3]:
    print(item)

print('\n' + str(len(test_data)))
for item in test_data[:3]:
    print(item)

Just some rudimentary tests that the datasets are the correct size and sufficiently scrambled

13192
{'gt': ['14988', '0', '\n'], 'file': 'UNR_14988.jpg'}
{'gt': ['6568', '5', '709', '199', '54', '47', '812', '200', '92', '77', '18', '271', '209', '174', '1201', '248', '80', '95', '504', '186', '93', '89', '\n'], 'file': 'BG_UNR_6568.jpg'}
{'gt': ['9062', '3', '1008', '367', '193', '148', '1163', '395', '117', '131', '363', '344', '523', '222', '\n'], 'file': 'UNR_9062.jpg'}

1649
{'gt': ['9875', '1', '163', '452', '202', '76', '\n'], 'file': 'UNR_9875.jpg'}
{'gt': ['11470', '0', '\n'], 'file': 'UNR_11470.jpg'}
{'gt': ['8101', '1', '864', '341', '329', '203', '\n'], 'file': 'UNR_8101.jpg'}

1649
{'gt': ['15829', '1', '967', '251', '225', '128', '\n'], 'file': 'UNR_15829.jpg'}
{'gt': ['5720', '1', '344', '229', '261', '181', '\n'], 'file': 'BG_UNR_5720.jpg'}
{'gt': ['1350', '4', '334', '186', '33', '25', '300', '185', '27', '19', '196', '166', '57', '40', '251', '174', '26', '29\n'], 'f

In [91]:
# Now let's sort the training/validation/test arrays by their file indices (the same way we did in the second cell)
def sort_by_file_name(unsorted):
    """Return the items of ``unsorted`` ordered by the number in their file name.

    Each item is a dict with a 'file' entry. The first run of digits in that
    name is parsed as an integer and used as the sort key. A new list is
    returned; the input itself is left untouched.
    """
    def first_number(entry):
        # First group of digits found in the file name, as an int
        return int(re.findall(r'\d+', entry['file'])[0])

    return sorted(unsorted, key=first_number)

# Order each of the three subsets by file index.
sorted_train, sorted_validation, sorted_test = [
    sort_by_file_name(subset)
    for subset in (train_data, validation_data, test_data)
]


{'gt': ['3', '2', '295', '151', '37', '25', '237', '147', '48', '29\n'], 'file': 'BG_SYSU_3.jpg'}
{'gt': ['7', '1', '289', '147', '57', '33\n'], 'file': 'BG_SYSU_7.jpg'}
{'gt': ['19', '3', '253', '190', '49', '36', '304', '191', '30', '26', '332', '193', '19', '19\n'], 'file': 'BG_SYSU_19.jpg'}
{'gt': ['36', '1', '284', '195', '44', '33\n'], 'file': 'BG_SYSU_36.jpg'}
{'gt': ['41', '1', '288', '190', '48', '39\n'], 'file': 'BG_SYSU_41.jpg'}
{'gt': ['56', '6', '242', '188', '79', '67', '325', '191', '43', '42', '368', '196', '95', '66', '142', '183', '47', '41', '183', '180', '36', '50', '212', '176', '44', '50\n'], 'file': 'BG_SYSU_56.jpg'}
{'gt': ['67', '3', '84', '114', '123', '74', '204', '118', '60', '50', '313', '123', '99', '69\n'], 'file': 'BG_SYSU_67.jpg'}
{'gt': ['74', '2', '33', '111', '110', '69', '187', '101', '120', '102\n'], 'file': 'BG_SYSU_74.jpg'}
{'gt': ['79', '3', '461', '109', '179', '162', '352', '124', '43', '36', '294', '121', '34', '26\n'], 'file': 'BG_SYSU_79.jp

In [103]:
# Now we have 3 arrays of file names and their ground truths  
# 
# Write a function that will write all of the files contained in each one to their respective directories
# and write their ground truths to the file in the same order
# Output directory of each split (note the trailing slash on every path).
test_dir, train_dir, validation_dir = (
    './split_dataset/test/',
    './split_dataset/train/',
    './split_dataset/validation/',
)


def mv_img_and_write_gt(data_list, out_dir, in_dir):
    """Write the images of one split to ``out_dir`` together with its ground truth.

    Every image named in ``data_list`` is read from ``in_dir`` and written to
    ``<out_dir>/data/`` under the same file name. The ground-truth tokens of
    all items are then written, in list order, to ``<out_dir>/gt.txt``.

    Args:
        data_list: iterable of dicts with a 'file' entry (image file name) and
            a 'gt' entry (list of string tokens). No line break is added
            between items, so each item's tokens must already end with one.
        out_dir: destination directory of the split; it and its ``data``
            sub-directory are created when missing.
        in_dir: directory that contains the source images.
    """
    data_out_dir = os.path.join(out_dir, 'data')
    # Create the output tree up front so that neither the image writes nor the
    # ground-truth write fail on a directory that does not exist yet.
    os.makedirs(data_out_dir, exist_ok=True)

    # Collect the lines and join once instead of growing a string per item.
    gt_lines = []
    for item in data_list:
        # NOTE(review): the image is decoded and re-encoded rather than copied
        # byte-for-byte -- confirm that this is intended.
        img = io.imread(os.path.join(in_dir, item['file']))
        io.imwrite(os.path.join(data_out_dir, item['file']), img)

        # Deal with gt stuff
        gt_lines.append(' '.join(item['gt']))

    # The context manager guarantees the file is flushed and closed.
    with open(os.path.join(out_dir, 'gt.txt'), 'w') as gt_out:
        gt_out.write(''.join(gt_lines))

    

In [104]:
# Export the splits one after another: test, then validation, then train.
for split_items, split_dir in (
    (sorted_test, test_dir),
    (sorted_validation, validation_dir),
    (sorted_train, train_dir),
):
    mv_img_and_write_gt(split_items, split_dir, merged_dir)
