In [31]:
# Intellisence (Learning for the dat :-)
%config IPCompleter.greedy=True

In [32]:
#Traditional neural networks can’t do this, and it seems like a major shortcoming. For example, imagine you want to 
#classify what kind of event is happening at every point in a movie. It’s unclear how a traditional neural 
#network could use its reasoning about previous events in the film to inform later ones.

#Recurrent neural networks address this issue. They are networks with loops in them, allowing information to persist.

#LSTMs are explicitly designed to avoid the long-term dependency problem. 
#Remembering information for long periods of time is practically their default behavior, 
#not something they struggle to learn!

# Best Read http://colah.github.io/posts/2015-08-Understanding-LSTMs/

# THE BELOW PROBLEM CAN BE SOLVED USING CNN ALSO. 
# But it is worth to solve using LSTM.



In [33]:
# Decalring Model Parameters
batch_size = 4096
STROKE_COUNT = 196
TRAIN_SAMPLES = 750
VALID_SAMPLES = 75
TEST_SAMPLES = 50 

In [34]:
# Importing standard libraries 
%matplotlib inline
import os
import numpy as np
import matplotlib.pyplot as plt
from keras.utils.np_utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import pandas as pd

In [35]:

#A metric is a function that is used to judge the performance of your model. 
#Metric functions are to be supplied in the metrics parameter when a model is compiled.

from keras.metrics import top_k_categorical_accuracy

#Available metrics
#binary_accuracy , categorical_accuracy, sparse_categorical_accuracy, top_k_categorical_accuracy, 
#sparse_top_k_categorical_accuracy

#AND 

#Custom Metrics

In [36]:
def top_3_accuracy(x,y): return top_k_categorical_accuracy(x,y, 3)

In [37]:
#A callback is a set of functions to be applied at given stages of the training procedure. 
#You can use callbacks to get a view on internal states and statistics of the model during training. 
#You can pass a list of callbacks (as the keyword argument callbacks) to the .fit() method 
#of the Sequential or Model classes. The relevant methods of the callbacks will then be called 
#at each stage of the training.

# ModelCheckpoint - Save the model after every epoch.
# LearningRateScheduler - Learning rate scheduler.
# EarlyStopping - top training when a monitored quantity has stopped improving.
# ReduceLROnPlateau - Reduce learning rate when a metric has stopped improving.

from keras.callbacks import ModelCheckpoint, LearningRateScheduler, EarlyStopping, ReduceLROnPlateau
from glob import glob
import gc
gc.enable()

In [38]:
# Checking for available GPU, but for our learnings lets skip this.
def get_available_gpus():
    from tensorflow.python.client import device_lib
    local_device_protos = device_lib.list_local_devices()
    return [x.name for x in local_device_protos if x.device_type == 'GPU']

'..\\..\\data'

'..\\..\\data\\doodleoutput\test_simplified.csv'

In [41]:
# ast.literal_eval raises an exception if the input isn't a valid Python datatype, so the code won't be executed if it's not.
from ast import literal_eval

In [49]:
# Training data path
ALL_TRAIN_PATHS = glob('..\..\data\doodle-dataset\*.csv')
ALL_TRAIN_PATHS

['..\\..\\data\\doodle-dataset\\airplane.csv',
 '..\\..\\data\\doodle-dataset\\alarm clock.csv',
 '..\\..\\data\\doodle-dataset\\ambulance.csv',
 '..\\..\\data\\doodle-dataset\\animal migration.csv']

In [50]:
COL_NAMES = ['countrycode', 'drawing', 'key_id', 'recognized', 'timestamp', 'word']

In [52]:
train_args = dict(samples=TRAIN_SAMPLES, 
                  start_row=0, 
                  max_rows=int(TRAIN_SAMPLES*1.5))
train_args

{'samples': 750, 'start_row': 0, 'max_rows': 1125}

In [54]:
valid_args = dict(samples=VALID_SAMPLES, 
                  start_row=train_args['max_rows']+1, 
                  max_rows=VALID_SAMPLES+25)
valid_args

{'samples': 75, 'start_row': 1126, 'max_rows': 100}

In [55]:
test_args = dict(samples=TEST_SAMPLES, 
                 start_row=valid_args['max_rows']+train_args['max_rows']+1, 
                 max_rows=TEST_SAMPLES+25)
test_args

{'samples': 50, 'start_row': 1226, 'max_rows': 75}

In [59]:
# process the csv Files 





In [60]:
ALL_TRAIN_PATHS

['..\\..\\data\\doodle-dataset\\airplane.csv',
 '..\\..\\data\\doodle-dataset\\alarm clock.csv',
 '..\\..\\data\\doodle-dataset\\ambulance.csv',
 '..\\..\\data\\doodle-dataset\\animal migration.csv']

In [127]:
#for eachDrawingItem in full_df['drawing']:
def _stack_it(raw_strokes):
    #print(eachDrawingItem)
    stroke_vec = literal_eval(raw_strokes) # string->list
    #print(stroke_vec)
    # unwrap the list - Here they are converting 2 dimension objects to 3 dimesnionsby and for the 3rd y axis 
    # filling with 0 to  3 
    in_strokes = [(xi,yi,i)  
     for i,(x,y) in enumerate(stroke_vec) 
     for xi,yi in zip(x,y)]
    #print(in_strokes)
    # joins the sequence of arrays along a new axis.
    c_strokes = np.stack(in_strokes)
    #print(c_strokes)
    #print(c_strokes[:,2])
    
    #>>> x = np.array([1, 2, 4, 7, 0])
    #>>> np.diff(x)
        #array([ 1,  2,  3, -7])
    #>>> np.diff(x, n=2)
    #array([  1,   1, -10])
    
    #print(np.diff(c_strokes[:,2]))
    #print([1]+np.diff(c_strokes[:,2]).tolist())
    c_strokes[:,2] = [1]+np.diff(c_strokes[:,2]).tolist()
    c_strokes[:,2] += 1 # since 0 is no stroke
    #print(c_strokes[:,2])
    #print(c_strokes.swapaxes(0, 1))
    # pad_sequences is used to ensure that all sequences in a list have the same length. 
    #By default this is done by padding 0 in the beginning of each sequence until each 
    # sequence has the same length as the longest sequence.
    return pad_sequences(c_strokes.swapaxes(0, 1), maxlen=196, padding='post').swapaxes(0, 1)
    

In [130]:
def read_batch(samples=5, 
               start_row=0,
               max_rows = 1000):
    
    # For Ebvery files in the location. ( Training data set)
    out_df_list = []
    for c_path in ALL_TRAIN_PATHS:
            #c_df = pd.read_csv(c_path, nrows=1125, skiprows=0)
            c_df = pd.read_csv(c_path, nrows=max_rows, skiprows=start_row)
            #print(c_df) 
            c_df.columns=COL_NAMES
            #print(c_df)
            # Just taking 750 records in each file (airplane, clock)
            #out_df_list += [c_df.sample(750)[['drawing', 'word']]]
            out_df_list += [c_df.sample(samples)[['drawing', 'word']]]

    full_df = pd.concat(out_df_list)
    #print(full_df)
    full_df['drawing'] = full_df['drawing'].\
            map(_stack_it)

    return full_df

In [131]:
train_df = read_batch(**train_args)

In [132]:
print(train_df)

                                                drawing              word
678   [[91, 89, 2], [94, 100, 1], [114, 120, 1], [12...          airplane
28    [[39, 150, 2], [117, 168, 1], [182, 139, 1], [...          airplane
62    [[16, 39, 2], [32, 38, 1], [49, 25, 1], [47, 2...          airplane
560   [[54, 36, 2], [35, 50, 1], [7, 61, 1], [0, 71,...          airplane
404   [[36, 89, 2], [56, 70, 1], [88, 55, 1], [145, ...          airplane
892   [[147, 43, 2], [47, 73, 1], [16, 73, 1], [4, 7...          airplane
195   [[114, 101, 2], [33, 107, 1], [9, 123, 1], [0,...          airplane
91    [[208, 65, 2], [158, 65, 1], [111, 72, 1], [25...          airplane
396   [[103, 68, 2], [92, 26, 1], [103, 4, 1], [117,...          airplane
1034  [[37, 11, 2], [42, 44, 1], [49, 52, 1], [77, 5...          airplane
123   [[217, 47, 2], [189, 33, 1], [178, 31, 1], [98...          airplane
841   [[214, 35, 2], [184, 34, 1], [89, 49, 1], [90,...          airplane
177   [[114, 40, 2], [119, 28, 1], [14

In [133]:
valid_df = read_batch(**valid_args)
test_df = read_batch(**test_args)

In [134]:
#Encode labels with value between 0 and n_classes-1.
word_encoder = LabelEncoder()

In [136]:
word_encoder.fit(train_df['word'])


LabelEncoder()

In [137]:
print('words', len(word_encoder.classes_), '=>', ', '.join([x for x in word_encoder.classes_]))


words 4 => airplane, alarm clock, ambulance, animal migration
