In [1]:
# import some common libraries
import os, json, cv2, random, shutil, time
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# notebook-wide configuration shared by the cells below
RNG_SEED = 117      # seed used as random_state when sampling the train set
TRAIN_SPLIT = 0.7   # default fraction of each label that goes to the train set
datasets_dir = os.path.join('..', 'datasets')  # root folder holding all dataset files

In [3]:
def create_empty_dir(directory: str):
    '''
    Make sure the given path is an existing, empty directory
    @param  string      The path to the directory that should be created.
                        When it already exists it is deleted first, so
                        NOTE: everything currently inside it is lost
    '''
    # wipe a previous version of the directory, contents included
    already_present = os.path.isdir(directory)
    if already_present:
        shutil.rmtree(directory)

    # (re)create it, together with any missing parent directories
    os.makedirs(directory)

In [4]:
def copy_file(filename: str, destination_dir: str):
    '''
    Copy one file out of the 'complete' dataset folder into another directory
    @param  string      The name of the file inside the 'complete' folder,
                        the copy keeps this same name
    @param  string      The path to the directory that receives the copy
    '''
    shutil.copyfile(
        # where the file lives: the folder holding the complete dataset
        os.path.join(datasets_dir, 'complete', filename),
        # where it goes: same file name, inside the destination directory
        os.path.join(destination_dir, filename),
    )

In [5]:
def split_data(train_split: float=TRAIN_SPLIT, force_new_split: bool=False, extended: bool=False):
    '''
    Split the dataset into a train and test set, the split is made per label
    @param  float      The fraction of the images of every label that should
                       go to the train set, the rest goes to the test set
    @param  bool       Should we force a new split of the data? NOTE: this
                       removes the current content of the train and test
                       directories
    @param  bool       Should we create the extended split? The extended
                       split is written to the 'train_extended' and
                       'test_extended' directories and also contains the
                       'no_annotation' images
    '''

    # the extended split lives in its own pair of directories
    suffix = '_extended' if extended else ''
    train_dir = os.path.join(datasets_dir, 'train' + suffix)
    test_dir = os.path.join(datasets_dir, 'test' + suffix)

    # if both directories already exist and we don't want to
    # force a new split, there is nothing left to do
    if not force_new_split and os.path.isdir(train_dir) and os.path.isdir(test_dir):
        return

    # start from empty directories
    create_empty_dir(train_dir)
    create_empty_dir(test_dir)

    # load the csv data of the images with the redaction types
    data_csv = pd.read_csv(os.path.join(datasets_dir, 'data_complete.csv'))

    # group the data per label and create a split per label
    for label, label_df in data_csv.groupby('type'):

        # the 'no_annotation' images are only part of the extended set
        if not extended and label == 'no_annotation':
            continue

        # sample the train part with the requested fraction (not the global
        # default), the fixed seed keeps the split reproducible
        train_df = label_df.sample(frac=train_split, random_state=RNG_SEED)

        # everything that was not sampled for training becomes test data
        test_df = label_df.drop(train_df.index)

        # copy the training files into the train directory
        for filename in train_df['File']:
            copy_file(filename, train_dir)

        # copy the test files into the test directory
        for filename in test_df['File']:
            copy_file(filename, test_dir)