## Visualizing Deep-Learning in Keras
#### This program was created based on the following links: 
http://fizzylogic.nl/2017/05/08/monitor-progress-of-your-keras-based-neural-network-using-tensorboard/

#### The goals of this notebook are:
1. Understanding how the text / tensors are transformed throughout the whole process, and by which functions.
2. Building an intuition for how RNN / GRU / LSTM / Attention work, and comparing the differences among them.
3. Trying to make online / incremental training work.
4. Starting to understand transfer learning.

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [65]:
import itertools
import os
from time import time

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix


from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding,LSTM
from keras.preprocessing import text, sequence
from keras import utils
from keras.callbacks import TensorBoard
from keras.utils.vis_utils import plot_model
import keras.backend as K
from keras.utils.np_utils import to_categorical

# This code was tested with TensorFlow v1.4; the installed version is printed
# so that any mismatch with that version is visible in the recorded output.
print("You have TensorFlow version", tf.__version__)

You have TensorFlow version 1.12.0


### Load the mock dataset

In [3]:
# Build a three-row mock corpus: one sentence for each sentiment label.
df = pd.DataFrame(
    {
        "sentence": [
            "I like apple",
            "I hate banana",
            "grape is great, but I am not highly interested",
        ],
        "label": ["positive", "negative", "neutral"],
    },
    columns=["sentence", "label"],
)
df.head()

Unnamed: 0,sentence,label
0,I like apple,positive
1,I hate banana,negative
2,"grape is great, but I am not highly interested",neutral


In [4]:
# Count the missing values in each column to confirm the mock data is complete.
df.isnull().sum()

sentence    0
label       0
dtype: int64

In [5]:
# Count how many sentences carry each label (class-balance check).
df['label'].value_counts()

positive    1
negative    1
neutral     1
Name: label, dtype: int64

In [18]:
# Split data into train and test.
# Every row of the mock dataset is used for training, so there is no held-out
# test set; the 80/20 split is left commented out for reference.
#train_size = int(len(df) * .8)
train_size = len(df)
print ("Train size: %d" % train_size)
#print ("Test size: %d" % (len(df) - train_size))

Train size: 3


### Convert the words to integers

In [19]:
# Take the first `train_size` rows of each column: the sentences are the model
# inputs and the labels are the classification targets.
train_narrative = df["sentence"].iloc[:train_size]
train_product = df["label"].iloc[:train_size]

In [30]:
# Why max_words is required: it caps the tokenizer vocabulary (num_words), so
# only the most frequent words of the fitted texts are kept (per the Keras
# docs, the most common num_words - 1 words) and rarer ones are ignored.
# The same value is reused as the width of each encoded sentence: x_train has
# max_words columns and the first Dense layer takes input_shape=(max_words,).
max_words = 10
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [31]:
# Learn the vocabulary from the training sentences, then encode every sentence
# as one fixed-width row of x_train (one column per word index, max_words wide).
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)

In [32]:
# Map each label string to an integer class index with scikit-learn's
# LabelEncoder; the encoder is fitted and applied in a single step.
encoder = LabelEncoder()
y_train = encoder.fit_transform(train_product)

In [33]:
# One-hot encode the integer labels. The class count is taken as the largest
# class index plus one.
num_classes = y_train.max() + 1
y_train = utils.to_categorical(y_train, num_classes=num_classes)

In [34]:
# Inspect the dimensions of the training data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

x_train shape: (3, 10)
y_train shape: (3, 3)


### Build the model

In [56]:
# What these parameters are for (both are passed to model.fit() below):
# - batch_size: number of samples processed per gradient update. The training
#   set here is smaller than 32, so each epoch is a single batch.
# - epochs: number of full passes over the training data.
batch_size = 32
epochs = 5

In [68]:
# Assemble the classifier as one ordered stack of layers:
# a small fully-connected hidden layer with ReLU and dropout, followed by a
# num_classes-wide output layer with softmax.
# (An earlier variant used 512 hidden units instead of 5.)
model = Sequential([
    Dense(5, input_shape=(max_words,)),
    Activation('relu'),
    Dropout(0.5),
    Dense(num_classes),
    Activation('softmax'),
])

### Loss function and optimizer
A model needs a loss function and an optimizer for training. Since this is a categorical classification problem and the model outputs a probability distribution over the classes (a `num_classes`-unit layer with a softmax activation), we'll use the categorical_crossentropy loss function.
This isn't the only choice for a loss function; you could, for instance, choose mean_squared_error. But, generally, categorical_crossentropy is better for dealing with probabilities—it measures the "distance" between probability distributions, or in our case, between the ground-truth distribution and the predictions.
Later, when we are exploring regression problems (say, to predict the price of a house), we will see how to use another loss function called mean squared error.
Now, configure the model to use an optimizer and a loss function:

In [69]:
# Configure training: categorical cross-entropy as the loss (y_train is one-hot
# and the model ends in a softmax), the Adam optimizer, and accuracy reported
# as an extra metric.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Train the model

In [70]:
# Configure the TensorBoard callback. The current timestamp is used as the
# sub-folder name under logs/, so each run writes to its own directory.
tensorboard = TensorBoard(log_dir="logs/{}".format(time()))

# Train the model on the full training set with the TensorBoard callback
# attached; the object returned by fit() is kept in `history`.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    callbacks=[tensorboard])


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [71]:
# model.summary() prints the layer table itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
model.summary()

# Render the model graph to an image file with Keras' plot_model.
# The image is displayed by the Markdown cell that follows.
plot_model(model, to_file='./img/model_plot.png',
           show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_15 (Dense)             (None, 5)                 55        
_________________________________________________________________
activation_15 (Activation)   (None, 5)                 0         
_________________________________________________________________
dropout_8 (Dropout)          (None, 5)                 0         
_________________________________________________________________
dense_16 (Dense)             (None, 3)                 18        
_________________________________________________________________
activation_16 (Activation)   (None, 3)                 0         
Total params: 73
Trainable params: 73
Non-trainable params: 0
_________________________________________________________________
None


![title](./img/model_plot.png)

### Output the configuration, weights, gradients of each trainable layer

In [116]:
# Print, for every layer in the model, its configuration dict followed by its
# current weight arrays. This walks all of model.layers, not only the layers
# that own weights.
for layer in model.layers:
    print(layer.get_config())
    print(layer.get_weights())

{'name': 'dense_15', 'trainable': True, 'batch_input_shape': (None, 10), 'dtype': 'float32', 'units': 5, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}
[array([[-0.17943698,  0.02047008,  0.22457647, -0.07057941,  0.216304  ],
       [ 0.32789394,  0.4145531 , -0.37396854, -0.05724477,  0.6279318 ],
       [-0.2653203 , -0.31804854, -0.23155889,  0.31240806,  0.2095395 ],
       [-0.11158746, -0.27157277,  0.0618481 ,  0.54593146,  0.29985598],
       [-0.32085595, -0.60659975, -0.604598  , -0.3313076 ,  0.08065505],
       [-0.4895798 ,  0.4468087 , -0.21289891,  0.07549638, -0.60003054],
       [-0.60563123, -0.06905765,  0.22890788, -0.53977466,  0.2012727

In [117]:
# Obtain the gradients.
# Keep only the weight tensors whose owning layer is trainable. The layer is
# looked up by the part of the tensor name before the first "/".
weights = [
    tensor
    for tensor in model.trainable_weights
    if model.get_layer(tensor.name.split("/")[0]).trainable
]

# Symbolic gradients of the total loss with respect to each kept weight tensor.
gradients = model.optimizer.get_gradients(model.total_loss, weights)

#print(weights)

In [102]:

# Build a backend function that evaluates the gradient tensors for a batch.
# It depends on `gradients` from the previous cell. The tensors fed are, in
# order: input features, per-sample weights, target labels, learning-phase flag.
# NOTE(review): model.sample_weights / model.targets look like Keras-internal
# attributes that may not exist in other Keras versions — confirm before upgrading.
input_tensors = [model.inputs[0], # input data
                 model.sample_weights[0], # how much to weight each sample by
                 model.targets[0], # labels
                 K.learning_phase(), # train or test mode
]

get_gradients = K.function(inputs=input_tensors, outputs=gradients)

In [118]:


# Evaluate the gradients on the whole training set.
# The sample count is derived from x_train rather than hard-coded, so the
# sample-weight vector always has one entry per training row.
nb_sample = x_train.shape[0]

# Values are listed in the same order as the tensors in `input_tensors`.
inputs = [x_train, # X
          np.ones(nb_sample), # sample weights (every sample weighted equally)
          y_train, # y
          0 # learning phase in TEST mode
]


# Pair each weight tensor with the gradient array computed for it.
print(list(zip(weights, get_gradients(inputs))))

[(<tf.Variable 'dense_15/kernel:0' shape=(10, 5) dtype=float32_ref>, array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [-0.1210086 , -0.01304542,  0.        , -0.09490197, -0.01825884],
       [ 0.        ,  0.        ,  0.        , -0.09490197, -0.02066237],
       [ 0.        ,  0.        ,  0.        , -0.09490197, -0.02066237],
       [ 0.        ,  0.08331225,  0.        ,  0.        , -0.12017602],
       [ 0.        ,  0.08331225,  0.        ,  0.        , -0.12017602],
       [-0.1210086 , -0.09635767,  0.        ,  0.        ,  0.12257954],
       [-0.1210086 , -0.09635767,  0.        ,  0.        ,  0.12257954],
       [-0.1210086 , -0.09635767,  0.        ,  0.        ,  0.12257954],
       [-0.1210086 , -0.09635767,  0.        ,  0.        ,  0.12257954]],
      dtype=float32)), (<tf.Variable 'dense_15/bias:0' shape=(5,) dtype=float32_ref>, array([-0.1210086 , -0.01304542,  0.        , -0.09490197, -0.01825884],
      dtype=float32)), (<tf.Var