In [1]:
# High level overview:
#   Split the SYSU and the UNR data into an 80/10/10 training/validation/test split
#   and put them into the "merged_dataset/data" directory. 
#   There will also be a text file in the merged_dataset directory with the vehicular coordinates concatenated 
#
# Medium level overview:
#   Each of the filenames will have a prefix "SYSU" or "UNR" to describe which dataset it came from
#
#   If the file is good to go for background extraction, it will be given the "BG" prefix.
#      This applies to the first 2007 images of the UNR dataset, and to all images of the SYSU dataset.
# 
#
import imageio as io
import glob
import os
import matplotlib.pyplot as plt
import re

# Output locations: the merged ground-truth text file goes in `merged_directory`,
# the renamed image files go in `merged_data_directory`.
merged_directory = './merged_dataset/'
merged_data_directory = merged_directory + 'data/'

# Create the output tree up front. The later cells write images straight into
# this directory and would fail on a fresh checkout where it does not exist yet.
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(merged_data_directory, exist_ok=True)

# Running count of images placed in the merged dataset; the dataset cells below
# overwrite it as they go.
total_file_count = 0

In [8]:
# Read the SYSU ground truth file, then copy every SYSU image into the merged
# data directory under a "BG_SYSU_" prefix (all SYSU images are marked as usable
# for background extraction).
#
# Leaves behind:
#   SYSU_GT          - one list of space-separated fields per ground-truth line
#   SYSU_files       - the directory listing of SYSU_data_directory
#   total_file_count - the number of SYSU ground-truth lines
SYSU_GT = []
SYSU_data_directory = './SYSU_data/data/'

with open('./SYSU_data/GT5576.txt') as f:
    for line in f:
        fields = line.split(' ')
        # Shift the leading image index down by one.
        # NOTE(review): presumably the SYSU file is 1-based and the merged
        # numbering is 0-based -- confirm against the dataset.
        fields[0] = str(int(fields[0]) - 1)
        SYSU_GT.append(fields)

# Derive the count from the list instead of incrementing the previous value, so
# re-running this cell in the same kernel does not keep inflating it.
total_file_count = len(SYSU_GT)

SYSU_files = os.listdir(SYSU_data_directory)

# Copy one image at a time: read it, write it under its new name, move on.
# Nothing is accumulated, so peak memory stays at a single decoded image rather
# than the whole dataset.
for file in SYSU_files:
    img = io.imread(SYSU_data_directory + file)
    io.imwrite(merged_data_directory + 'BG_SYSU_' + file, img)

# Kept (empty) so the name is still defined for anything that refers to it; the
# images themselves are never held in this list.
SYSU_data = []

In [11]:
# Read the UNR ground truth file and copy the UNR images into the merged data
# directory, continuing the numbering where the SYSU dataset left off.
#
# !!! total_file_count is reset from SYSU_files below, so the SYSU cell must have
#     been run in this kernel session for the numbering to be right.
#
# Each UNR image is visited in numeric filename order and rewritten as
# "<prefix><n>.jpg", where n starts at len(SYSU_files). The first
# UNR_BG_IMAGE_COUNT images get the "BG_UNR_" prefix, the rest get "UNR_".
# Ground-truth line k gets n = len(SYSU_files) + k as its first field, so labels
# and images line up only if both are in the same order and of the same length.
#
# NOTE(review): the notes in this notebook say the first 2007 images get the BG
# prefix, but the threshold `i < 2008` tags 2008 images (positions 0..2007).
# The value is kept exactly as written -- confirm which one is intended.
UNR_GT = []
UNR_data_directory = './ARL_data/data/'
UNR_BG_IMAGE_COUNT = 2008

# !!! Hardcoded for simplicity, be cautious!!!
total_file_count = len(SYSU_files)
first_unr_index = total_file_count

with open('./ARL_data/gt10913.txt') as f:
    for i, line in enumerate(f):
        fields = line.split(' ')
        # Replace the original image index with its position in the merged set.
        fields[0] = str(first_unr_index + i)
        UNR_GT.append(fields)

# os.listdir returns entries in arbitrary order, so sort by the first run of
# digits in each filename. Entries with no digits at all (e.g. ".DS_Store") are
# skipped instead of crashing the cell.
new_unr_files = []
for file in os.listdir(UNR_data_directory):
    digits = re.search(r'\d+', file)
    if digits is None:
        continue
    new_unr_files.append({'file': file, 'i': int(digits.group())})

sorted_list = sorted(new_unr_files, key=lambda k: k['i'])
UNR_files = [item['file'] for item in sorted_list]

print(total_file_count)
print(UNR_GT[0])

# A length mismatch means image numbers and ground-truth numbers have drifted.
if len(UNR_files) != len(UNR_GT):
    print('WARNING: ' + str(len(UNR_files)) + ' UNR images but '
          + str(len(UNR_GT)) + ' UNR ground truth lines')

for i, file in enumerate(UNR_files):
    prefix = 'BG_UNR_' if i < UNR_BG_IMAGE_COUNT else 'UNR_'
    new_file_name = prefix + str(first_unr_index + i) + '.jpg'
    img = io.imread(UNR_data_directory + file)
    io.imwrite(merged_data_directory + new_file_name, img)

# Leave the running total at the size of the merged dataset.
total_file_count = first_unr_index + len(UNR_files)


5576
['5576', '4', '139', '248', '163', '80', '272', '230', '132', '49', '382', '219', '66', '36', '425', '208', '49', '39', '\n']


In [23]:
# Write the merged ground truth: the SYSU records followed by the UNR records,
# into "gt<total record count>.txt" inside the merged dataset directory.
#
# The images themselves have already been written by the two cells above; this
# cell only produces the text file.
length = len(SYSU_GT) + len(UNR_GT)
gt_file = merged_directory + 'gt' + str(length) + '.txt'

gt_lines = []
for fields in SYSU_GT + UNR_GT:
    text = ' '.join(fields)
    # A record read from the last line of a file may lack its newline; add one
    # so it cannot fuse with the record that follows it.
    if not text.endswith('\n'):
        text += '\n'
    gt_lines.append(text)

# Join once at the end instead of growing a string record by record.
full_gt_string = ''.join(gt_lines)

# The with-block closes (and therefore flushes) the file even if the write fails.
with open(gt_file, 'w') as out_file:
    chars_written = out_file.write(full_gt_string)

# Last expression of the cell: shows how many characters were written.
chars_written

548665