# *Set up*

In [23]:
!pip install ndjson
!pip install cairocffi
!pip install imutils



In [24]:
import os
import ndjson
import pandas as pd
import cairocffi as cairo
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import cv2
import imutils
import torch.utils.data as data
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from torch.utils.data import DataLoader
import random

In [25]:
# ---- Experiment configuration -------------------------------------------
category = ['triangle', 'square', 'mushroom', 'crown', 'envelope'] # add training classes
batch_size = 8
print("batch_size: {}".format(batch_size))
image_size = 28
number_per_catogory = 1000
dataset_seed = 42
train_ratio = 0.6
validation_ratio = 0.2
all_samples = len(category)*number_per_catogory

# Whatever is left after the train and validation shares becomes the test set,
# so the two ratios together must not exceed the whole dataset.
assert 0 < train_ratio + validation_ratio <= 1, "train_ratio + validation_ratio must be in (0, 1]"

np.random.seed(dataset_seed)
random.seed(dataset_seed)

# ---- Disjoint train / validation / test index sets ----------------------
# Indices refer to positions in the concatenated per-category sample list.
TRAIN_INDICES = random.sample(range(all_samples), int(all_samples*train_ratio))
# Sort the hold-out pool so the seeded draw below does not depend on the
# iteration order of a set, which is an implementation detail.
temp_INDICES = sorted(set(range(all_samples)) - set(TRAIN_INDICES))
VAL_INDICES = random.sample(temp_INDICES, int(all_samples*validation_ratio))
TEST_INDICES = sorted(set(temp_INDICES) - set(VAL_INDICES))
print("total training samples:",len(TRAIN_INDICES))
print("total validation samples:",len(VAL_INDICES))
print("total test samples:",len(TEST_INDICES))

batch_size: 8
total training samples: 15000
total validatoin samples: 5000
total test samples: 5000


In [26]:
# Download simplified data
!mkdir quickDrawData
for item in category:
  path=os.path.join('gs://quickdraw_dataset/full/simplified',item+'.ndjson')
  !gsutil -m cp $path ./quickDrawData/

mkdir: cannot create directory ‘quickDrawData’: File exists
Copying gs://quickdraw_dataset/full/simplified/triangle.ndjson...
/ [1/1 files][ 30.5 MiB/ 30.5 MiB] 100% Done                                    
Operation completed over 1 objects/30.5 MiB.                                     
Copying gs://quickdraw_dataset/full/simplified/square.ndjson...
/ [1/1 files][ 32.0 MiB/ 32.0 MiB] 100% Done                                    
Operation completed over 1 objects/32.0 MiB.                                     
Copying gs://quickdraw_dataset/full/simplified/mushroom.ndjson...
- [1/1 files][ 53.5 MiB/ 53.5 MiB] 100% Done                                    
Operation completed over 1 objects/53.5 MiB.                                     
Copying gs://quickdraw_dataset/full/simplified/crown.ndjson...
- [1/1 files][ 47.7 MiB/ 47.7 MiB] 100% Done                                    
Operation completed over 1 objects/47.7 MiB.                                     
Copying gs://quickdraw_datase

# *Helper functions*

In [27]:
# credit to:
#https://github.com/googlecreativelab/quickdraw-dataset/issues/19

def vector_to_raster(vector_images, side=28, line_diameter=16, padding=16, bg_color=(0,0,0), fg_color=(1,1,1)):
    """
    Draw each stroke-based image onto a `side` x `side` cairo surface and
    return the resulting pixel buffers.

    vector_images: iterable of drawings; each drawing is a sequence of strokes
        and each stroke unpacks into an x-coordinate sequence and a
        y-coordinate sequence.
    padding and line_diameter are relative to the original 256x256 image.
    bg_color / fg_color: RGB triples passed to cairo for background and ink.

    Returns a list with one flat numpy array per drawing, holding every fourth
    byte of the ARGB32 surface buffer (presumably one value per pixel, i.e.
    side*side entries -- confirm against the surface stride). Callers reshape
    it to (side, side).
    """
    source_side = 256.

    # A single surface/context pair is reused for every drawing.
    canvas = cairo.ImageSurface(cairo.FORMAT_ARGB32, side, side)
    pen = cairo.Context(canvas)
    pen.set_antialias(cairo.ANTIALIAS_BEST)
    pen.set_line_cap(cairo.LINE_CAP_ROUND)
    pen.set_line_join(cairo.LINE_JOIN_ROUND)
    pen.set_line_width(line_diameter)

    # Shrink the 256-unit drawing space (plus a margin for the stroke width
    # and antialiasing) down to the target size, then shift past the margin.
    margin = padding * 2. + line_diameter
    zoom = float(side) / float(source_side + margin)
    pen.scale(zoom, zoom)
    pen.translate(margin / 2., margin / 2.)

    def _render(drawing):
        # Wipe whatever the previous drawing left on the shared surface.
        pen.set_source_rgb(*bg_color)
        pen.paint()

        # Centre the drawing: shift by half of the unused space on each axis.
        extents = np.hstack(drawing).max(axis=1)
        shift = (((source_side, source_side) - extents) / 2.).reshape(-1, 1)
        shifted_strokes = [stroke + shift for stroke in drawing]

        # Stroke rendering dominates the run time of this function.
        pen.set_source_rgb(*fg_color)
        for xs, ys in shifted_strokes:
            pen.move_to(xs[0], ys[0])
            for px, py in zip(xs, ys):
                pen.line_to(px, py)
            pen.stroke()

        # Keep one byte out of every four; copy because the surface buffer is
        # overwritten by the next drawing.
        pixels = np.asarray(canvas.get_data())
        return np.copy(pixels[::4])

    return [_render(drawing) for drawing in vector_images]

In [28]:
def rotate_img(img, ang=None, crop_size=40):
  """Rotate an image about its centre without clipping, then centre-crop it.

  Args:
    img: image array whose first two dimensions are (height, width).
    ang: rotation angle in degrees, passed to cv2.getRotationMatrix2D
      (OpenCV convention: positive values rotate counter-clockwise). When
      None, an integer angle is drawn uniformly from [20, 160).
    crop_size: side length of the square crop taken around the centre of the
      rotated image. Defaults to 40, the size previously hard-coded.

  Returns:
    The centre crop of the rotated image. It is crop_size x crop_size when the
    rotated image is at least that large, and smaller otherwise.
  """
  # `is None` so that an explicit angle of 0 (or an array-like) is never
  # mistaken for "no angle given".
  if ang is None:
    angle = np.random.randint(20, 160)  # random integer angle in [20, 160)
  else:
    angle = ang
  # Determine the centre
  height, width = img.shape[:2]
  cX, cY = (width // 2, height // 2)
  # Get the rotation matrix
  M = cv2.getRotationMatrix2D((cX, cY), angle, 1.0)
  cos = np.abs(M[0, 0])
  sin = np.abs(M[0, 1])
  # Compute the bounding box that contains the whole rotated image
  nW = int((height * sin) + (width * cos))
  nH = int((height * cos) + (width * sin))
  # Shift the transform so the rotated content is centred in the new canvas
  M[0, 2] += (nW / 2) - cX
  M[1, 2] += (nH / 2) - cY
  # Perform the actual rotation
  img = cv2.warpAffine(img, M, (nW, nH))
  new_height, new_width = img.shape[:2]
  new_cX, new_cY = (new_width // 2, new_height // 2)
  # Clamp the crop origin at 0: a negative slice start would wrap around to
  # the far edge and return a wrong (often empty) region for small images.
  top = max(new_cY - crop_size // 2, 0)
  left = max(new_cX - crop_size // 2, 0)
  crop_img = img[top:top + crop_size, left:left + crop_size]
  return crop_img

In [29]:
def shear_img(img):
  """Apply a random horizontal shear and scale the result back to the input size.

  The shear factor is an integer percentage drawn from [10, 20), i.e. a
  fraction in [0.10, 0.20). The image is warped onto a canvas widened enough
  to hold the sheared content and then resized to the original
  (width, height).
  """
  rows, cols = img.shape[:2]
  factor = np.random.randint(10, 20) / 100
  # 2x3 affine matrix: x' = x + factor * y, y' = y
  shear_matrix = np.array([[1, factor, 0],
                           [0, 1, 0]])
  # The bottom row shifts right by factor * rows pixels, so widen by that much.
  widened_cols = int(cols + factor * rows)
  sheared = cv2.warpAffine(img, shear_matrix, (widened_cols, rows))
  return cv2.resize(sheared, (cols, rows))

# *DrawDataset*

In [30]:
class DrawDataset(data.Dataset):
    """Quick, Draw! sketches rasterised to images, paired with one-hot labels.

    For every category the matching ``quickDrawData/<category>.ndjson`` file is
    read, filtered to recognised drawings from 'CA' or 'US', and truncated to
    ``number_per_catogory`` samples. The drawings are rasterised with
    ``vector_to_raster`` and the requested split is selected with the
    notebook-level ``TRAIN_INDICES`` / ``VAL_INDICES`` / ``TEST_INDICES``.
    Those index lists must have been built for the same number of categories
    and the same ``number_per_catogory``; if a category has fewer matching
    drawings than that, indexing fails with an IndexError.

    Args:
        dataset_type: 'train' or 'test'; any other value (e.g. 'valid')
            selects the validation split.
        image_size: side length, in pixels, of the rasterised drawing before
            padding.
        categories: category names, one ndjson file each.
        number_per_catogory: maximum number of drawings kept per category.
        transformation: only used for the test split. None (pad only),
            'rotate' (pad, rotate by ``angle``, centre-crop) or 'shear'
            (random shear, then pad).
        angle: rotation angle handed to ``rotate_img`` when
            ``transformation == 'rotate'``.

    Raises:
        ValueError: if ``dataset_type == 'test'`` and ``transformation`` is not
            one of None, 'rotate' or 'shear'.

    Attributes:
        X: list of 2-D float arrays scaled by 1/255.
        y: one-hot label matrix, one row per sample and one column per class.
        labels: array of label strings for the selected split.
        keys: array of Quick, Draw! ``key_id`` values for the selected split.
    """

    # Border added on each side of a rasterised drawing (28 -> 40 pixels for
    # the default image size).
    _PAD = 6
    # Larger border used before rotating, so that the centre crop taken by
    # rotate_img still contains the whole drawing.
    _ROTATE_PAD = 26
    _TEST_TRANSFORMATIONS = (None, 'rotate', 'shear')

    def __init__(self, dataset_type, image_size, categories, number_per_catogory, transformation='rotate', angle=0):
        # Fail fast, before any file is parsed. Previously an unknown
        # transformation left the test images unsplit and unnormalised while
        # labels and keys were also left unsplit, without any error.
        if dataset_type == 'test' and transformation not in self._TEST_TRANSFORMATIONS:
            raise ValueError(
                "unknown transformation {!r}; expected None, 'rotate' or 'shear'".format(transformation))

        all_images = []
        all_labels = []
        all_keys = []
        for item in categories:
            # Named `records` so the module alias `data` (torch.utils.data)
            # is not shadowed inside this method.
            with open(os.path.join('quickDrawData', item + '.ndjson')) as f:
                records = ndjson.load(f)
            df = pd.DataFrame.from_dict(records)
            new_df = df[df['countrycode'].isin(['CA', 'US'])]
            new_df = new_df[new_df['recognized'] == True]
            all_images += list(new_df['drawing'].values)[:number_per_catogory]
            all_labels += list(new_df['word'].values)[:number_per_catogory]
            all_keys += list(new_df['key_id'].values)[:number_per_catogory]

        arr = vector_to_raster(all_images, side=image_size)
        images = np.array([x.reshape(image_size, image_size) for x in arr])
        all_labels = np.array(all_labels)
        all_keys = np.array(all_keys)

        # Fit the encoder on the complete label set *before* splitting, so
        # every split maps a class to the same column even if a split happens
        # to contain no sample of some class.
        label_encoder = LabelEncoder()
        label_encoder.fit(all_labels)

        if dataset_type == 'test':
            indices = TEST_INDICES
        elif dataset_type == 'train':
            indices = TRAIN_INDICES
            transformation = None  # transformations apply to the test split only
        else:
            indices = VAL_INDICES
            transformation = None

        all_labels = all_labels[indices]
        all_keys = all_keys[indices]
        images = images[indices]

        # TO DO: ADD TRANSFORMATION FOR TEST SET according to the following link
        # https://www.cs.toronto.edu/~tijmen/affNIST/
        if transformation == 'rotate':
            images = [self._pad(x, self._ROTATE_PAD) for x in images]
            images = [rotate_img(img, angle) for img in images]
        elif transformation == 'shear':
            images = [shear_img(img) for img in images]
            images = [self._pad(x, self._PAD) for x in images]
        else:
            images = [self._pad(x, self._PAD) for x in images]
        # Scale pixel values from [0, 255] to [0, 1].
        images = [x / 255. for x in images]

        # One-hot encode with NumPy. OneHotEncoder(sparse=False) no longer
        # works on current scikit-learn, where the keyword was renamed to
        # `sparse_output`; indexing an identity matrix gives the same float64
        # rows on every version.
        integer_encoded = label_encoder.transform(all_labels)
        onehot_encoded = np.eye(len(label_encoder.classes_))[integer_encoded]

        self.X = images
        self.y = onehot_encoded
        self.labels = all_labels
        self.keys = all_keys

    @staticmethod
    def _pad(image, width):
        """Surround a 2-D image with `width` zero pixels on every side."""
        return np.pad(image, [(width, width), (width, width)], mode='constant', constant_values=0)

    def __len__(self):
        """Number of samples in the selected split."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the (image, one-hot label) pair at `index`."""
        return self.X[index], self.y[index]

In [31]:
# Train and validation splits; no transformation is applied to either of them.
train_dataset = DrawDataset(
    dataset_type='train',
    image_size=image_size,
    categories=category,
    number_per_catogory=number_per_catogory,
)
valid_dataset = DrawDataset(
    dataset_type='valid',
    image_size=image_size,
    categories=category,
    number_per_catogory=number_per_catogory,
)
#test_dataset=DrawDataset('test', image_size, category, number_per_catogory)

In [32]:
# Rotated test datasets: one per angle from -180 to 180 degrees in steps of 10.
# The dictionary stays empty while the construction loop below is disabled.
angles = list(range(-180, 181, 10))
datasets_rotate = dict()
# for ang in angles:
#     datasets_rotate[ang] = DrawDataset('test', image_size, category, number_per_catogory, transformation='rotate', angle=ang)

# *DataLoader*

In [33]:
# Loaders for the untransformed splits; only the training loader shuffles.
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(dataset=valid_dataset, batch_size=batch_size, shuffle=False)
test_loader = None   # Change when testing
#test_loader=DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [34]:
# One unshuffled loader per rotated test dataset, keyed by rotation angle.
loaders_rotate = {
    ang: DataLoader(dataset, batch_size=batch_size, shuffle=False)
    for ang, dataset in datasets_rotate.items()
}

In [35]:
# from google.colab import drive
# drive.mount('mnt')
# %cd "mnt/My Drive"

In [36]:
# Save datasets
# import torch
# %rm -rf datasets
# %mkdir datasets
# %cd datasets
# torch.save(train_dataset, 'train_dataset.pth')
# torch.save(valid_dataset, 'valid_dataset.pth')
# torch.save(test_dataset, 'test_dataset_original.pth')
# for ang, data in datasets_rotate.items():
#     torch.save(data, 'test_dataset_r{}'.format(ang))

# %rm -rf ../data_loaders
# %mkdir ../data_loaders
# %cd ../data_loaders
# torch.save(train_loader, 'train_loader.pth')
# torch.save(valid_loader, 'valid_loader.pth')
# torch.save(test_loader, 'test_loader_original.pth')
# for ang, data in loaders_rotate.items():
#     torch.save(data, 'test_loader_r{}'.format(ang))

In [37]:
#plt.imshow(loaders_rotate[40].dataset[0][0])

KeyError: ignored