In [1]:
!pip install ndjson
!pip install cairocffi

Collecting ndjson
  Downloading https://files.pythonhosted.org/packages/70/c9/04ba0056011ba96a58163ebfd666d8385300bd12da1afe661a5a147758d7/ndjson-0.3.1-py2.py3-none-any.whl
Installing collected packages: ndjson
Successfully installed ndjson-0.3.1
Collecting cairocffi
[?25l  Downloading https://files.pythonhosted.org/packages/84/ca/0bffed5116d21251469df200448667e90acaa5131edea869b44a3fbc73d0/cairocffi-1.2.0.tar.gz (70kB)
[K     |████████████████████████████████| 71kB 2.6MB/s 
Building wheels for collected packages: cairocffi
  Building wheel for cairocffi (setup.py) ... [?25l[?25hdone
  Created wheel for cairocffi: filename=cairocffi-1.2.0-cp37-none-any.whl size=89548 sha256=2aa2d25ca9873e2f2672592cb565e8fba81c2c8ec510e65e476610f6c81fa549
  Stored in directory: /root/.cache/pip/wheels/40/76/48/f1effadceea83b32e7d957dd0f92db4db8b537d7b72b4ef374
Successfully built cairocffi
Installing collected packages: cairocffi
Successfully installed cairocffi-1.2.0


In [33]:
import os
import ndjson
import pandas as pd
import cairocffi as cairo
import numpy as np
import torch.utils.data as data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader

In [41]:
# --- Dataset configuration -------------------------------------------------
category = ['apple', 'face']  # add training classes
image_size = 28               # side length (pixels) of each rasterised drawing
number_per_catogory = 1000    # drawings kept per class
dataset_seed = 42
train_ratio = 0.7
all_samples = len(category) * number_per_catogory

# --- Train / validation split ----------------------------------------------
# Draw the training indices WITHOUT replacement. np.random.choice samples with
# replacement by default, which would put duplicate indices in the training
# split and leave far more than (1 - train_ratio) of the samples in validation.
np.random.seed(dataset_seed)
TRAIN_INDICES = list(
    np.random.choice(all_samples, int(all_samples * train_ratio), replace=False)
)
# Everything not chosen for training is validation; sorted so the order is
# deterministic rather than dependent on set iteration order.
VAL_INDICES = sorted(set(range(all_samples)) - set(TRAIN_INDICES))



In [4]:
# credit to:
#https://github.com/googlecreativelab/quickdraw-dataset/issues/19

def vector_to_raster(vector_images, side=28, line_diameter=16, padding=16, bg_color=(0,0,0), fg_color=(1,1,1)):
    """
    Rasterise a sequence of stroke-based drawings with cairo.

    padding and line_diameter are relative to the original 256x256 image.

    Each drawing is an iterable of strokes, and each stroke unpacks into a pair
    (x values, y values). Every drawing is painted onto the same
    ``side`` x ``side`` ARGB32 surface, which is cleared to ``bg_color`` before
    the strokes are drawn in ``fg_color``.

    Returns a list with one flat numpy array per drawing, made of every fourth
    element of the surface buffer (i.e. one byte per pixel). The caller is
    expected to reshape it to (side, side) -- this presumes the surface stride
    is exactly 4 * side bytes; TODO confirm for unusual ``side`` values.
    """
    source_side = 256.

    canvas = cairo.ImageSurface(cairo.FORMAT_ARGB32, side, side)
    pen = cairo.Context(canvas)
    pen.set_antialias(cairo.ANTIALIAS_BEST)
    pen.set_line_cap(cairo.LINE_CAP_ROUND)
    pen.set_line_join(cairo.LINE_JOIN_ROUND)
    pen.set_line_width(line_diameter)

    # Shrink the (source_side + margin) square down to the target side length.
    # The margin leaves room for the stroke width plus extra padding on both
    # edges so antialiased lines are not clipped.
    margin = padding * 2. + line_diameter
    zoom = float(side) / float(source_side + margin)
    pen.scale(zoom, zoom)
    pen.translate(margin / 2., margin / 2.)

    rendered = []
    for drawing in vector_images:
        # Wipe whatever the previous drawing left on the shared surface.
        pen.set_source_rgb(*bg_color)
        pen.paint()

        # Shift the drawing by half of the free space beyond its largest x / y
        # coordinate (this centres it when its smallest coordinate is 0 --
        # NOTE(review): confirm the input is top-left aligned).
        extent = np.hstack(drawing).max(axis=1)
        shift = (((source_side, source_side) - extent) / 2.).reshape(-1, 1)
        shifted_strokes = [stroke + shift for stroke in drawing]

        # Stroke drawing dominates the run time of this function.
        pen.set_source_rgb(*fg_color)
        for xs, ys in shifted_strokes:
            pen.move_to(xs[0], ys[0])
            for px, py in zip(xs, ys):
                pen.line_to(px, py)
            pen.stroke()

        # Keep one byte out of every four; copy so the result does not alias
        # the surface buffer, which is overwritten by the next drawing.
        pixels = np.asarray(canvas.get_data())[::4]
        rendered.append(np.copy(pixels))

    return rendered

In [5]:
# download simplified data
!mkdir quickDrawData
for item in category:
  path=os.path.join('gs://quickdraw_dataset/full/simplified',item+'.ndjson')
  print(path)
  !gsutil -m cp $path ./quickDrawData/

gs://quickdraw_dataset/full/simplified/apple.ndjson
Copying gs://quickdraw_dataset/full/simplified/apple.ndjson...
\ [1/1 files][ 56.1 MiB/ 56.1 MiB] 100% Done                                    
Operation completed over 1 objects/56.1 MiB.                                     
gs://quickdraw_dataset/full/simplified/face.ndjson
Copying gs://quickdraw_dataset/full/simplified/face.ndjson...
| [1/1 files][ 89.4 MiB/ 89.4 MiB] 100% Done                                    
Operation completed over 1 objects/89.4 MiB.                                     


In [48]:
class DrawDataset(data.Dataset):
    """Rasterised Quick, Draw! drawings with one-hot labels.

    For every name in ``categories`` the file ``quickDrawData/<name>.ndjson`` is
    read, rows are restricted to ``countrycode == 'CA'`` with ``recognized``
    true, and the first ``number_per_catogory`` of them are kept. The drawings
    are rasterised with ``vector_to_raster`` to ``image_size`` x ``image_size``
    and zero-padded by 6 pixels on every side.

    The split is taken from the module-level ``TRAIN_INDICES`` /
    ``VAL_INDICES`` lists, so those must have been built for the same
    ``categories`` and ``number_per_catogory``; numpy raises ``IndexError`` if a
    file yields fewer usable rows than the indices assume.

    Parameters
    ----------
    train : bool
        Truthy selects ``TRAIN_INDICES``, otherwise ``VAL_INDICES``.
    image_size : int
        Side length passed to the rasteriser.
    categories : sequence of str
        Class names; they also define the one-hot columns (sorted order).
    number_per_catogory : int
        Maximum number of drawings kept per class.
    transformation : callable, optional
        Applied to each image of the evaluation split when it is fetched.
        Ignored for the training split.

    Attributes
    ----------
    X : list of 2-D arrays (padded images)
    y : 2-D float array of one-hot rows, one column per category
    labels : array of class names, aligned with ``X``
    keys : array of ``key_id`` values, aligned with ``X``
    """

    def __init__(self, train, image_size, categories, number_per_catogory, transformation=None):
        drawings = []
        labels = []
        keys = []
        for item in categories:
            with open(os.path.join('quickDrawData', item + '.ndjson'), encoding='utf-8') as f:
                print(item)
                # Named `records` so the `data` module alias is not shadowed.
                records = ndjson.load(f)
            df = pd.DataFrame.from_dict(records)
            keep = (df['countrycode'] == 'CA') & (df['recognized'] == True)
            selected = df[keep].iloc[:number_per_catogory]
            drawings += list(selected['drawing'].values)
            labels += list(selected['word'].values)
            keys += list(selected['key_id'].values)

        rasters = vector_to_raster(drawings, side=image_size)
        images = np.array([r.reshape(image_size, image_size) for r in rasters])
        labels = np.array(labels)
        keys = np.array(keys)

        # Always restrict to the requested split and pad. Previously an
        # evaluation set built with a transformation skipped both steps, so it
        # contained the training samples too and its images were left unpadded.
        split = TRAIN_INDICES if train else VAL_INDICES
        labels = labels[split]
        images = images[split]
        keys = keys[split]
        images = [np.pad(x, [(6, 6), (6, 6)], mode='constant', constant_values=0) for x in images]

        self.X = images
        self.y = self._one_hot(labels, categories)
        self.labels = labels
        self.keys = keys
        # The transformation only concerns the evaluation split (see
        # https://www.cs.toronto.edu/~tijmen/affNIST/ for the intended kind).
        self.transformation = None if train else transformation
        # TODO: normalize -- pixel values are stored exactly as rasterised.

    @staticmethod
    def _one_hot(labels, categories):
        """One-hot encode ``labels`` against the sorted unique ``categories``.

        Encoding against the configured categories (instead of the labels that
        happen to be in this split) keeps column count and column order
        identical across the train and evaluation datasets.

        Raises ``ValueError`` if a label is not one of ``categories``.
        """
        classes = np.unique(np.asarray(categories))
        labels = np.asarray(labels)
        if labels.size == 0:
            return np.zeros((0, len(classes)))
        if len(classes) == 0:
            raise ValueError('labels present but no categories were given')
        # classes is sorted, so searchsorted gives each label's column; clip so
        # an out-of-range position can still be checked below.
        idx = np.minimum(np.searchsorted(classes, labels), len(classes) - 1)
        if not np.array_equal(classes[idx], labels):
            unknown = sorted(set(labels.tolist()) - set(classes.tolist()))
            raise ValueError('labels not in categories: %s' % unknown)
        return np.eye(len(classes))[idx]

    def __len__(self):
        return len(self.X)

    def __getitem__(self, index):
        image = self.X[index]
        if self.transformation is not None:
            image = self.transformation(image)
        return image, self.y[index]

In [49]:
# Build the two splits from the same configuration; only the `train` flag differs.
train_dataset = DrawDataset(
    train=True,
    image_size=image_size,
    categories=category,
    number_per_catogory=number_per_catogory,
)
val_dataset = DrawDataset(
    train=False,
    image_size=image_size,
    categories=category,
    number_per_catogory=number_per_catogory,
)

apple
face
apple
face


In [50]:
batch_size = 32

# Training batches are reshuffled every epoch; validation keeps a fixed order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
)