In [1]:
import glob, os
import shutil
import json
import random

import numpy as np
import pandas as pd

In [2]:
# --- Notebook configuration -------------------------------------------------
# How many worker data folders to produce.
num_workers = 2
# Dataset name; used as a path component and in the progress messages.
dataset = 'kermany'
# CSV file describing the dataset's images.
dataset_info = 'kermany_metadata.csv'
# Directory under which the per-worker data folders are created.
worker_path = '..'

# Content written verbatim into every metadata.json file.
# Key order is kept stable because the dict is serialized as-is.
metadata = dict(
    datatype="image",
    operations=dict(
        use_grayscale=True,
        use_alpha=False,
        use_deepaugment=True,
    ),
)

In [3]:
# Build the folder layout <worker_path>/<dataset>/data<k>/{train,evaluate,predict}
# for every worker (k is 1-based) and drop a metadata.json into each folder.
for i in range(num_workers):
    data_path = '/{1}/data{0}'.format(i+1, dataset)
    for split in ('train', 'evaluate', 'predict'):
        split_dir = worker_path + data_path + '/' + split
        # exist_ok=True keeps the cell re-runnable: without it a second run
        # raises FileExistsError because the folders are already there.
        os.makedirs(split_dir, exist_ok=True)
        # create metadata.json (overwritten on re-run with the current config)
        with open(split_dir + '/metadata.json', 'w') as f:
            json.dump(metadata, f)

In [4]:
# Load the image metadata table from the path set in the configuration cell
# (dataset_info) instead of repeating the file name here, so that changing
# the config actually changes which file is read.
# NOTE(review): the file carries a saved index column that is displayed as
# "Unnamed: 0"; it is left in place here and is not used further down.
df = pd.read_csv(dataset_info)
df.head()

Unnamed: 0,path,target,subject,serial,image,source
0,../train/DRUSEN/DRUSEN-3086272-13.jpeg,1,3086272,13,DRUSEN-3086272-13.jpeg,train
1,../train/DRUSEN/DRUSEN-5743416-32.jpeg,1,5743416,32,DRUSEN-5743416-32.jpeg,train
2,../train/DRUSEN/DRUSEN-5434336-6.jpeg,1,5434336,6,DRUSEN-5434336-6.jpeg,train
3,../train/DRUSEN/DRUSEN-9547888-18.jpeg,1,9547888,18,DRUSEN-9547888-18.jpeg,train
4,../train/DRUSEN/DRUSEN-9059831-2.jpeg,1,9059831,2,DRUSEN-9059831-2.jpeg,train


In [16]:
# Distinct subject ids found under each source partition of the metadata table.
unique_train_subjects = df.loc[df['source'] == 'train', 'subject'].unique()
unique_test_subjects = df.loc[df['source'] == 'test', 'subject'].unique()

# Shuffle both id arrays in place, each with its own generator seeded with 42,
# so the resulting order is the same on every run.
for subject_ids in (unique_train_subjects, unique_test_subjects):
    random.Random(42).shuffle(subject_ids)

# The first 600 shuffled train-source subjects form the validation pool;
# the remaining ones stay in the training pool.
unique_val_subjects = unique_train_subjects[:600]
unique_train_subjects = unique_train_subjects[600:]

# Cut every pool into num_workers nearly equal parts (one part per worker).
train_workers, val_workers, test_workers = (
    np.array_split(subject_pool, num_workers)
    for subject_pool in (unique_train_subjects, unique_val_subjects, unique_test_subjects)
)

# Report how many subjects each worker received: train, val, test.
for i in range(num_workers):
    part_sizes = [len(parts[i]) for parts in (train_workers, val_workers, test_workers)]
    print("worker_{}".format(i), *part_sizes)

worker_0 2093 300 318
worker_1 2092 300 317


In [17]:
# For every worker: write one mapping.csv per split and copy the split's
# images next to it.
#
# Each entry: (target folder, label used in the progress message,
#              source partition the subjects were drawn from,
#              per-worker subject id arrays).
split_plan = (
    ('train', 'train', 'train', train_workers),
    ('evaluate', 'val', 'train', val_workers),
    ('predict', 'test', 'test', test_workers),
)

for i in range(num_workers):
    for folder, label, source, subjects_per_worker in split_plan:
        split_dir = os.path.join(worker_path, dataset, 'data{0}'.format(i+1), folder)

        # Select by subject AND by source. The subject pools were built per
        # source, so matching on subject id alone would pull a subject's
        # images from the other source into this split whenever the same id
        # occurs under both sources (the same images would then land in more
        # than one split). When no id is shared, the extra condition changes
        # nothing.
        # NOTE(review): rows whose source is neither 'train' nor 'test' are
        # never selected -- confirm the CSV has no such rows that are wanted.
        rows = df[(df['source'] == source) & df['subject'].isin(subjects_per_worker[i])]

        # create mapping.csv
        rows[['image', 'target']].to_csv(os.path.join(split_dir, 'mapping.csv'), index=False)

        # distribute the images (paths are used exactly as stored in the CSV)
        for file in rows['path'].values:
            shutil.copy2(file, split_dir)
        print('{1} {2} data for worker_{0} is transferred'.format(i, dataset, label))

kermany train data for worker_0 is transferred
kermany val data for worker_0 is transferred
kermany test data for worker_0 is transferred
kermany train data for worker_1 is transferred
kermany val data for worker_1 is transferred
kermany test data for worker_1 is transferred
