In [1]:
import numpy as np
import ndjson
import json
import os

In [2]:
numpy_path = 'data/numpy/'
datapath = 'data/images_{0}.npy'
sequencepath = 'data/sequences_{0}.npy'
labelpath = 'data/labels_{0}.json'
testsize = 10000
valsize = 5000

# Get all the categories

In [3]:
def file_name(file_dir):
    """Return the extension-less names of the files directly inside ``file_dir``.

    Only the top level of ``file_dir`` is inspected; files in sub-directories
    are ignored. The order is whatever ``os.walk`` reports and is therefore
    filesystem dependent.

    Parameters
    ----------
    file_dir : str
        Directory to list.

    Returns
    -------
    list of str
        One entry per file, with the final extension removed
        (``'The Mona Lisa.npy'`` -> ``'The Mona Lisa'``). An empty list is
        returned when the directory does not exist or cannot be read.
    """
    # Only the first os.walk entry (the directory itself) is needed. os.walk
    # yields nothing for a missing/unreadable directory, hence the default.
    _, _, files = next(os.walk(file_dir), (file_dir, [], []))
    # splitext strips just the last extension, so a name that itself contains
    # a dot is not cut at its first dot.
    return [os.path.splitext(file)[0] for file in files]

In [4]:
# Category names are the file names found in numpy_path; the position of a
# name in the list is used as its integer category id.
names = file_name(numpy_path)
name_dict = dict(enumerate(names))

In [5]:
# Persist the category list and the id -> name mapping as JSON.
for out_path, payload in (('categories.json', names),
                          ('categories_dict.json', name_dict)):
    with open(out_path, 'w') as fd:
        json.dump(payload, fd)

# Combine all the numpy files, generate the category labels, and split into train/val/test

In [None]:
# generate category, data
category_train = []
category_test = []
category_val = []
# Per-category slices are collected in lists and joined once after the loop:
# calling np.concatenate inside the loop re-copies everything accumulated so
# far on every iteration.
train_parts = []
val_parts = []
test_parts = []
for i in range(len(names)):
    data = np.load(numpy_path+names[i]+'.npy')
    instance_num = data.shape[0]

    # The last `testsize` rows form the test split, the `valsize` rows before
    # them the validation split, and everything earlier the training split.
    # The counts are clamped to the rows actually available so that the label
    # lists below always have exactly one entry per image row, even for a
    # category with fewer than testsize + valsize instances.
    test_num = min(testsize, instance_num)
    val_num = min(valsize, instance_num - test_num)
    train_num = instance_num - test_num - val_num

    train_parts.append(data[:train_num])
    val_parts.append(data[train_num:train_num+val_num])
    test_parts.append(data[train_num+val_num:])
    # set category
    category_test.extend([i] * test_num)
    category_val.extend([i] * val_num)
    category_train.extend([i] * train_num)

# With no categories at all the arrays stay None.
image_train = np.concatenate(train_parts, axis=0) if train_parts else None
image_val = np.concatenate(val_parts, axis=0) if val_parts else None
image_test = np.concatenate(test_parts, axis=0) if test_parts else None

In [None]:
# save data
splits = (
    ('train', category_train, image_train),
    ('val', category_val, image_val),
    ('test', category_test, image_test),
)

# All label files (JSON) are written before any image array (.npy).
for split_name, split_labels, _ in splits:
    with open(labelpath.format(split_name), 'w') as fd:
        json.dump(split_labels, fd)

for split_name, _, split_images in splits:
    np.save(datapath.format(split_name), split_images)

In [None]:
# Drop the references to the combined image arrays so their memory can be reclaimed.
del image_train, image_val, image_test
# Last expression of the cell: displays the number of entries in name_dict.
len(name_dict)

# Preprocessing NDJSON file

In [21]:
# Index order of data[i]['drawing'] as read by the preprocessing loop:
#   first:  stroke
#   second: x/y (index 0 and index 1; presumably x and y coordinates)
#   third:  point

# Encoded values: coordinates are clamped to [1, 254], 255 is written after each stroke, 0 is the end marker / default padding
# Each drawing occupies MAX_LENGTH slots. NOTE(review): this comment used to say 300, but the code uses 200.
MAX_LENGTH = 200
# Directory holding the '<category>.ndjson' inputs and receiving the encoded '.npy' outputs.
simplify_path = 'data/simplify/'

In [33]:
def check_outrange(np_data, i, marker, max_length=None):
    """Terminate sequence ``i`` if ``marker`` has reached its last usable slot.

    Parameters
    ----------
    np_data : numpy.ndarray
        Buffer indexed as ``np_data[sequence, position, channel]`` with at
        least two channels.
    i : int
        Index of the sequence being written.
    marker : int
        Position at which the caller wants to write next.
    max_length : int, optional
        Number of positions available per sequence. Defaults to
        ``np_data.shape[1]`` so the bound always matches the buffer that is
        actually written, instead of relying on a global defined elsewhere.

    Returns
    -------
    bool
        ``True`` when ``marker`` is the last position (or beyond): both
        channels at ``marker`` are set to 0 (the end marker) and the caller
        must not write any further value. ``False`` otherwise, in which case
        ``np_data`` is left untouched.
    """
    if max_length is None:
        max_length = np_data.shape[1]
    # The final slot is reserved for the 0 end marker.
    if marker + 1 >= max_length:
        np_data[i, marker, 0] = 0
        np_data[i, marker, 1] = 0
        return True
    return False

In [24]:
def clamp(n, minn, maxn):
    """Limit ``n`` to the closed interval ``[minn, maxn]``.

    Values above ``maxn`` become ``maxn``, values below ``minn`` become
    ``minn``; anything in between is returned unchanged.
    """
    # Apply the upper bound first, then the lower bound.
    capped = n if n < maxn else maxn
    return minn if minn > capped else capped

In [34]:
# preprocessing
# For every category, encode each drawing as a fixed-length uint8 sequence:
#   np_data      - points first, then the 0 end marker, zero padded on the right
#   reverse_data - the same used prefix (including the end marker) shifted to
#                  the end of the row, zero padded on the left
for k in range(len(names)):
    # The context manager closes the file as soon as it is parsed, so handles
    # do not accumulate across categories.
    with open(simplify_path+names[k]+'.ndjson') as op:
        data = ndjson.load(op)
    data_size = len(data)
    np_data = np.zeros((data_size,MAX_LENGTH,2), dtype=np.uint8)
    reverse_data = np.zeros((data_size,MAX_LENGTH,2), dtype=np.uint8)
    print('saving '+names[k])
    for i in range(data_size):
        instance = data[i]['drawing']
        marker = 0
        full = False
        for stroke in instance:
            for point in range(len(stroke[0])):
                # out of range, manually end; nothing more can be stored, so
                # stop scanning the remaining points
                if check_outrange(np_data, i, marker):
                    full = True
                    break
                np_data[i,marker,0] = clamp(stroke[0][point], 1, 254)
                np_data[i,marker,1] = clamp(stroke[1][point], 1, 254)
                marker += 1
            # End of stroke
            if full or check_outrange(np_data, i, marker):
                full = True
                break
            np_data[i,marker,0] = 255
            np_data[i,marker,1] = 255
            marker += 1
        # End marker. marker never exceeds MAX_LENGTH - 1 because it is only
        # advanced after check_outrange returned False.
        np_data[i,marker,0] = 0
        np_data[i,marker,1] = 0
        # Coordinates are clamped to >= 1 and stroke separators are 255, so the
        # first 0 in the row is the end marker at index `marker`. The shifted
        # copy is filled for every drawing, including truncated ones, which
        # would otherwise be saved as all-zero rows.
        used = marker + 1
        reverse_data[i, -used:] = np_data[i, :used]
    np.save(simplify_path+names[k]+'.npy', np_data)
    np.save(simplify_path+names[k]+'_new.npy', reverse_data)

saving squiggle
saving bread
saving violin
saving bush
saving eyeglasses
saving soccer ball
saving string bean
saving shovel
saving zebra
saving kangaroo
saving spoon
saving submarine
saving underwear
saving hot air balloon
saving pickup truck
saving snowman
saving chair
saving cloud
saving giraffe
saving axe
saving matches
saving aircraft carrier
saving camel
saving saxophone
saving streetlight
saving drums
saving camouflage
saving grass
saving snorkel
saving laptop
saving hot tub
saving car
saving passport
saving flying saucer
saving lobster
saving cactus
saving apple
saving helicopter
saving compass
saving pear
saving cannon
saving spider
saving fan
saving bandage
saving cruise ship
saving blueberry
saving crab
saving elbow
saving cooler
saving circle
saving sleeping bag
saving yoga
saving owl
saving cup
saving backpack
saving castle
saving couch
saving sword
saving bottlecap
saving flashlight
saving suitcase
saving key
saving jacket
saving The Mona Lisa
saving tooth
saving goatee
s

In [35]:
# generate category, data
# Split the left-padded sequences ('<category>_new.npy') of every category
# into train/val/test: the last `testsize` rows are test, the `valsize` rows
# before them validation, the rest training.
# Slices are collected in lists and concatenated once after the loop, because
# calling np.concatenate inside the loop re-copies the accumulated array on
# every iteration.
train_parts = []
val_parts = []
test_parts = []

for i in range(len(names)):
    np_data = np.load(simplify_path+names[i]+'_new.npy')
    instance_num = np_data.shape[0]
    # Clamp the split sizes to the rows actually available.
    test_num = min(testsize, instance_num)
    val_num = min(valsize, instance_num - test_num)
    train_num = instance_num - test_num - val_num

    train_parts.append(np_data[:train_num])
    val_parts.append(np_data[train_num:train_num+val_num])
    test_parts.append(np_data[train_num+val_num:])

# With no categories at all the arrays stay None.
sequence_train = np.concatenate(train_parts, axis=0) if train_parts else None
sequence_val = np.concatenate(val_parts, axis=0) if val_parts else None
sequence_test = np.concatenate(test_parts, axis=0) if test_parts else None

np.save(sequencepath.format('train'), sequence_train)
np.save(sequencepath.format('val'), sequence_val)
np.save(sequencepath.format('test'), sequence_test)

In [6]:
# Needs sequence_train from the sequence split cell to exist in the current
# kernel; on a fresh kernel this cell raises NameError.
len(sequence_train)
# sequence 45251266 (presumably the length seen in an earlier run; TODO confirm)

NameError: name 'sequence_train' is not defined

In [9]:
# Number of category names returned by file_name(numpy_path).
len(names)

345