In [1]:
import numpy as np
import ndjson
import json
import os

In [7]:
# --- Input directories -------------------------------------------------------
# Per-category bitmap arrays are read from here as <category>.npy
numpy_path = 'data/numpy/'
# Per-category stroke files are read from here as <category>.ndjson;
# the derived *_length.npy, *_mask.npy and *_3d.npy files are written here too
simplify_path = 'data/simplify/'

# --- Output templates: {0} is replaced by 'train', 'val' or 'test' -----------
labelpath = 'data/labels_{0}.json'
datapath = 'data/images_{0}.npy'
sequencepath = 'data/sequences3d_{0}.npy'

# --- Split sizes: rows taken from the tail of every category ------------------
valsize = 5000
testsize = 10000

# Raw drawing layout: drawing[stroke][0 = x, 1 = y][point]
#
# Encoded layout: each drawing becomes MAX_LENGTH rows of (x, y, flag) uint8.
#   flag == 255 -> end of a stroke
#   all-zero row -> padding / end of the drawing
# Number of rows per encoded drawing; longer drawings are truncated.
MAX_LENGTH = 200

# Collect the category names and keep the first 100

In [8]:
def file_name(file_dir):
    """Return the extension-less names of the files directly inside `file_dir`.

    Only the top level of `file_dir` is inspected; files in sub-directories
    are ignored. Names are returned in the order `os.walk` lists them (no
    sorting is applied here).

    Parameters
    ----------
    file_dir : str
        Directory to list.

    Returns
    -------
    list of str
        One entry per file, with the last extension removed
        (e.g. 'hot air balloon.npy' -> 'hot air balloon'). Empty when the
        directory has no files or cannot be listed.
    """
    # The first tuple produced by os.walk describes file_dir itself, so there
    # is no need to traverse the rest of the tree. The default covers the case
    # where os.walk yields nothing (e.g. the directory does not exist), which
    # previously surfaced as an IndexError.
    _, _, files = next(os.walk(file_dir), (file_dir, [], []))
    # splitext drops only the final extension, so a stem that itself contains
    # a dot still matches `stem + '.npy'` when callers rebuild the path.
    return [os.path.splitext(file)[0] for file in files]

In [9]:
# Keep at most the first 100 category names found in the bitmap directory.
names = file_name(numpy_path)[:100]
# Label index -> category name, following the order of `names`.
name_dict = dict(enumerate(names))

In [10]:
# Persist the category list and the index -> name mapping next to the notebook.
# NOTE: JSON object keys are strings, so the integer indices of `name_dict`
# are stored as strings in categories_dict.json.
for json_path, payload in (('categories.json', names),
                           ('categories_dict.json', name_dict)):
    with open(json_path, 'w') as fd:
        json.dump(payload, fd)

In [12]:
# Spot-check: display the category name stored at index 1.
names[1]

'key'

# Filter drawings by total point count (within ±50 points of the category mean and shorter than MAX_LENGTH)

In [25]:
# For every category, count the points of each drawing and build a boolean
# mask that keeps drawings whose point count is
#   * strictly within 50 points of the category mean, and
#   * strictly below MAX_LENGTH.
# The per-drawing counts and the mask are also saved next to the NDJSON file.
select_mask = {}

for k in range(len(names)):
    # The context manager closes the file as soon as it has been parsed;
    # the handle was previously left open for every category.
    with open(simplify_path+names[k]+'.ndjson') as op:
        data = ndjson.load(op)
    data_size = len(data)
    select_length = np.zeros((data_size), dtype=np.int16)

    for i in range(data_size):
        # stroke[0] is the list of x coordinates, so its length is the number
        # of points in that stroke.
        select_length[i] = sum(len(stroke[0]) for stroke in data[i]['drawing'])
    mean_length = int(select_length.mean())
    # `&` is the element-wise AND for boolean arrays (same result as the
    # former `*`, but states the intent).
    mask = (
        (select_length > (mean_length - 50))
        & (select_length < (mean_length + 50))
        & (select_length < MAX_LENGTH)
    )
    print('processing '+names[k]+' '+str(mask.astype(int).sum()))
    np.save(simplify_path+names[k]+'_length.npy', select_length)
    print(select_length.shape)
    select_mask[names[k]] = mask
    np.save(simplify_path+names[k]+'_mask.npy', mask)

processing squiggle 111792
(118441,)
processing bread 119123
(120570,)
processing violin 213951
(217260,)
processing bush 107552
(120520,)
processing eyeglasses 222123
(225762,)
processing soccer ball 112554
(125349,)
processing string bean 111364
(119083,)
processing shovel 116184
(117194,)
processing zebra 140548
(144608,)
processing kangaroo 171430
(174470,)
processing spoon 123767
(125028,)
processing submarine 121899
(124362,)
processing underwear 123378
(124548,)
processing hot air balloon 125357
(126350,)
processing pickup truck 126779
(130740,)
processing snowman 337081
(340029,)
processing chair 219567
(222706,)
processing cloud 118882
(120265,)
processing giraffe 125214
(127182,)
processing axe 122798
(124122,)
processing matches 136000
(143969,)
processing aircraft carrier 111974
(116504,)
processing camel 120620
(121399,)
processing saxophone 115938
(118107,)
processing streetlight 121459
(123280,)
processing drums 133960
(137299,)
processing camouflage 134987
(172710,)
pro

# Combine all the numpy files, generate the category labels, and split into train/val/test

In [35]:
# generate category, data
category_train = []
category_test = []
category_val = []
image_train = None
image_test = None
image_val = None
for i in range(len(names)):
    data = np.load(numpy_path+names[i]+'.npy')
    data = data[select_mask[names[i]]]
    print(names[i])
    print(data.shape)
    instance_num = data.shape[0]
    
    image_test = np.concatenate((image_test, data[-testsize:]), axis=0) if image_test is not None else data[-testsize:]
    image_val = np.concatenate((image_val, data[-testsize-valsize:-testsize]), axis=0) if image_val is not None else data[-testsize-valsize:-testsize]
    image_train = np.concatenate((image_train, data[:-testsize-valsize]), axis=0) if image_train is not None else data[:-testsize-valsize]
    # set category
    category_test.extend([i] * (testsize))
    category_val.extend([i] * (valsize))
    category_train.extend([i] * (instance_num-testsize-valsize))
    

squiggle
(111792, 784)
bread
(119123, 784)
violin
(213951, 784)
bush
(107552, 784)
eyeglasses
(222123, 784)
soccer ball
(112554, 784)
string bean
(111364, 784)
shovel
(116184, 784)
zebra
(140548, 784)
kangaroo
(171430, 784)
spoon
(123767, 784)
submarine
(121899, 784)
underwear
(123378, 784)
hot air balloon
(125357, 784)
pickup truck
(126779, 784)
snowman
(337081, 784)
chair
(219567, 784)
cloud
(118882, 784)
giraffe
(125214, 784)
axe
(122798, 784)
matches
(136000, 784)
aircraft carrier
(111974, 784)
camel
(120620, 784)
saxophone
(115938, 784)
streetlight
(121459, 784)
drums
(133960, 784)
camouflage
(134987, 784)
grass
(120876, 784)
snorkel
(150871, 784)
laptop
(252804, 784)
hot tub
(113760, 784)
car
(179553, 784)
passport
(146016, 784)
flying saucer
(146286, 784)
lobster
(136723, 784)
cactus
(129078, 784)
apple
(143992, 784)
helicopter
(156378, 784)
compass
(126530, 784)
pear
(116156, 784)
cannon
(136894, 784)
spider
(200796, 784)
fan
(132048, 784)
bandage
(142039, 784)
cruise ship
(119

In [36]:
# save data
with open(labelpath.format('train'), 'w') as fd:
    json.dump(category_train, fd)
with open(labelpath.format('val'), 'w') as fd:
    json.dump(category_val, fd)
with open(labelpath.format('test'), 'w') as fd:
    json.dump(category_test, fd)
    
np.save(datapath.format('train'), image_train)
np.save(datapath.format('val'), image_val)
np.save(datapath.format('test'), image_test)

In [37]:
# Report the number of rows in each bitmap split, then release the arrays.
for label, count in (('image_train', len(image_train)),
                     ('image_val', len(image_val)),
                     ('image_test', len(image_test))):
    print('{0}: {1}'.format(label, count))
del image_train, image_val, image_test
# Displayed as the cell result: number of categories in the index mapping.
len(name_dict)

image_train: 12871701
image_val: 500000
image_test: 1000000


100

# Preprocessing NDJSON file

In [26]:
def check_outrange(np_data, i, marker):
    """Tell whether row `marker` is the last usable row of drawing `i`.

    When `marker + 1` has reached MAX_LENGTH, the three channels of
    `np_data[i, marker]` are reset to 0 and True is returned; otherwise the
    array is left untouched and False is returned.

    Parameters
    ----------
    np_data : array indexed as [drawing, row, channel]
    i : int
        Drawing index.
    marker : int
        Row about to be written.
    """
    if marker + 1 < MAX_LENGTH:
        return False
    # Buffer exhausted: blank the current row so it reads as a terminator.
    for channel in range(3):
        np_data[i, marker, channel] = 0
    return True

In [27]:
def clamp(n, minn, maxn):
    """Limit `n` to the range [minn, maxn].

    The upper bound is applied first and the lower bound second, so `minn`
    wins if the two bounds are given in the wrong order.
    """
    capped = n if n < maxn else maxn
    return minn if minn > capped else capped

In [28]:
# preprocessing
for k in range(len(names)):
    op = open(simplify_path+names[k]+'.ndjson')
    data = ndjson.load(op)
    data_size = len(data)
    np_data = np.zeros((data_size,MAX_LENGTH,3), dtype=np.uint8)
    #reverse_data = np.zeros((data_size,MAX_LENGTH,2), dtype=np.uint8)
    print('saving '+names[k])
    for i in range(len(data)):
        instance = data[i]['drawing']
        marker = 0
        for stroke in instance:
            for point in range(len(stroke[0])):
                # out of range, mannually end
                if check_outrange(np_data, i, marker):
                    continue
                else:
                    np_data[i,marker,0] = clamp(stroke[0][point], 0, 255)
                    np_data[i,marker,1] = clamp(stroke[1][point], 0, 255)
                    np_data[i,marker,2] = 0
                    marker += 1
            # End of stroke
            if check_outrange(np_data, i, marker):
                continue
            else:
                np_data[i,marker,0] = 0
                np_data[i,marker,1] = 0
                np_data[i,marker,2] = 255
                marker += 1
        #idx = np.where(np_data[i][:,0] == 0)[0][0]
        #reverse_data[i][-(idx+1):, 0] = np_data[i][:idx+1, 0]
        #reverse_data[i][-(idx+1):, 1] = np_data[i][:idx+1, 1]
    np.save(simplify_path+names[k]+'_3d.npy', np_data[select_mask[names[k]]])
    #np.save(simplify_path+names[k]+'_new.npy', reverse_data[select_mask[names[k]]])
    print(np_data[select_mask[names[k]]].shape)

saving squiggle
(111792, 200, 3)
saving bread
(119123, 200, 3)
saving violin
(213951, 200, 3)
saving bush
(107552, 200, 3)
saving eyeglasses
(222123, 200, 3)
saving soccer ball
(112554, 200, 3)
saving string bean
(111364, 200, 3)
saving shovel
(116184, 200, 3)
saving zebra
(140548, 200, 3)
saving kangaroo
(171430, 200, 3)
saving spoon
(123767, 200, 3)
saving submarine
(121899, 200, 3)
saving underwear
(123378, 200, 3)
saving hot air balloon
(125357, 200, 3)
saving pickup truck
(126779, 200, 3)
saving snowman
(337081, 200, 3)
saving chair
(219567, 200, 3)
saving cloud
(118882, 200, 3)
saving giraffe
(125214, 200, 3)
saving axe
(122798, 200, 3)
saving matches
(136000, 200, 3)
saving aircraft carrier
(111974, 200, 3)
saving camel
(120620, 200, 3)
saving saxophone
(115938, 200, 3)
saving streetlight
(121459, 200, 3)
saving drums
(133960, 200, 3)
saving camouflage
(134987, 200, 3)
saving grass
(120876, 200, 3)
saving snorkel
(150871, 200, 3)
saving laptop
(252804, 200, 3)
saving hot tub
(11

In [32]:
# generate category, data
sequence_train = None
sequence_test = None
sequence_val = None

for i in range(len(names)):
    print('processing '+names[i])
    np_data = np.load(simplify_path+names[i]+'_3d.npy')
    sequence_test = np.concatenate((sequence_test, np_data[-testsize:]), axis=0) if sequence_test is not None else np_data[-testsize:]
    sequence_val = np.concatenate((sequence_val, np_data[-testsize-valsize:-testsize]), axis=0) if sequence_val is not None else np_data[-testsize-valsize:-testsize]
    sequence_train = np.concatenate((sequence_train, np_data[:-testsize-valsize]), axis=0) if sequence_train is not None else np_data[:-testsize-valsize]

np.save(sequencepath.format('train'), sequence_train)
np.save(sequencepath.format('val'), sequence_val)
np.save(sequencepath.format('test'), sequence_test)

processing squiggle
processing bread
processing violin
processing bush
processing eyeglasses
processing soccer ball
processing string bean
processing shovel
processing zebra
processing kangaroo
processing spoon
processing submarine
processing underwear
processing hot air balloon
processing pickup truck
processing snowman
processing chair
processing cloud
processing giraffe
processing axe
processing matches
processing aircraft carrier
processing camel
processing saxophone
processing streetlight
processing drums
processing camouflage
processing grass
processing snorkel
processing laptop
processing hot tub
processing car
processing passport
processing flying saucer
processing lobster
processing cactus
processing apple
processing helicopter
processing compass
processing pear
processing cannon
processing spider
processing fan
processing bandage
processing cruise ship
processing blueberry
processing crab
processing elbow
processing cooler
processing circle
processing sleeping bag
processing 

In [34]:
# Report the number of drawings in each stroke-sequence split.
for label, count in (('sequence_train', len(sequence_train)),
                     ('sequence_val', len(sequence_val)),
                     ('sequence_test', len(sequence_test))):
    print('{0}: {1}'.format(label, count))

sequence_train: 12871701
sequence_val: 500000
sequence_test: 1000000


In [36]:
# Inspect one encoded training drawing: rows are (x, y, flag), where
# flag == 255 marks the end of a stroke and all-zero rows are padding.
sequence_train[123]

array([[  0, 102,   0],
       [ 17,  74,   0],
       [ 32,  63,   0],
       [ 44,  65,   0],
       [ 51,  73,   0],
       [ 66, 101,   0],
       [ 84,  75,   0],
       [ 93,  68,   0],
       [108,  70,   0],
       [131,  82,   0],
       [136,  79,   0],
       [149,  53,   0],
       [155,  49,   0],
       [179,  57,   0],
       [188,  54,   0],
       [204,  42,   0],
       [232,  49,   0],
       [238,  47,   0],
       [255,  21,   0],
       [236,  26,   0],
       [223,  25,   0],
       [206,  17,   0],
       [190,   4,   0],
       [179,   0,   0],
       [150,   2,   0],
       [104,  18,   0],
       [119,  33,   0],
       [  0,   0, 255],
       [146,  64,   0],
       [133,  86,   0],
       [121,  99,   0],
       [126,  86,   0],
       [138,  74,   0],
       [153,  70,   0],
       [162,  76,   0],
       [  0,   0, 255],
       [  0,   0,   0],
       [  0,   0,   0],
       [  0,   0,   0],
       [  0,   0,   0],
       [  0,   0,   0],
       [  0,   0

In [None]:
# Drop the names of the three sequence arrays so their memory can be reclaimed.
del sequence_train, sequence_val, sequence_test