## Visualizing Deep-Learning in Keras
#### This program was created based on the following links: 
http://fizzylogic.nl/2017/05/08/monitor-progress-of-your-keras-based-neural-network-using-tensorboard/

####  The target of this coding is:
1. Understanding how the text / tensors are transformed in the whole process by what functions.
2. Intuitively knowing how RNN / GRU / LSTM / Attention are workings, and comparing the difference among them.
3. Trying to make the online / increamental training worked.
4. Starting to understand the transferring learning.

Since this code will be visualized by tensorboard, it is proposed to start tensorboard in the folder where this code is located in the command windows by the following command:
> tensorboard --logdir=logs/

The usage of tensorboard can be found in the following link:
https://www.analyticsvidhya.com/blog/2017/07/debugging-neural-network-with-tensorboard/

In [1]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [2]:
import itertools
import os
from time import time

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

import tensorflow as tf
from tensorflow import keras

from sklearn.preprocessing import LabelBinarizer, LabelEncoder
from sklearn.metrics import confusion_matrix


from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout, Embedding,LSTM
from keras.preprocessing import text, sequence
from keras import utils
from keras.callbacks import TensorBoard
from keras.utils.vis_utils import plot_model
import keras.backend as K
from keras.utils.np_utils import to_categorical

# This code was tested with TensorFlow v1.4
print("You have TensorFlow version", tf.__version__)

  from ._conv import register_converters as _register_converters


You have TensorFlow version 1.12.0


Using TensorFlow backend.


### Load the mock dataset

In [3]:
df=pd.DataFrame(columns=["sentence","label"])
df["sentence"]=["I like apple","I hate banana"]
df["label"]=["positive","negative"]
df.head()

Unnamed: 0,sentence,label
0,I like apple,positive
1,I hate banana,negative


In [4]:
df.isnull().sum()

sentence    0
label       0
dtype: int64

In [5]:
df['label'].value_counts()

negative    1
positive    1
Name: label, dtype: int64

In [6]:
# Split data into train and test
#train_size = int(len(df) * .8)
train_size = len(df)
print ("Train size: %d" % train_size)
#print ("Test size: %d" % (len(df) - train_size))

Train size: 2


### Convert the words to integers

In [7]:
train_narrative = df["sentence"][:train_size]
train_product = df["label"][:train_size]

In [8]:
# Open question: why the max_words is required?
max_words = 5
tokenize = text.Tokenizer(num_words=max_words, char_level=False)

In [9]:
tokenize.fit_on_texts(train_narrative) # only fit on train
x_train = tokenize.texts_to_matrix(train_narrative)

In [10]:
# Use sklearn utility to convert label strings to numbered index
encoder = LabelEncoder()
encoder.fit(train_product)
y_train = encoder.transform(train_product)

In [11]:
# Converts the labels to a one-hot representation
num_classes = np.max(y_train) + 1
y_train = utils.to_categorical(y_train, num_classes)

In [12]:
# Inspect the dimenstions of our training and test data (this is helpful to debug)
print('x_train shape:', x_train.shape)
print('y_train shape:', y_train.shape)

x_train shape: (2, 5)
y_train shape: (2, 2)


In [13]:
print(x_train)
print(y_train)

[[0. 1. 1. 1. 0.]
 [0. 1. 0. 0. 1.]]
[[0. 1.]
 [1. 0.]]


### Build the model

In [14]:
# Open questions: why are those parameters used for?
batch_size = 2
epochs = 1
hidden_units=2

In [15]:
# Build the model
model = Sequential()
#model.add(Dense(512, input_shape=(max_words,)))
model.add(Dense(hidden_units, input_shape=(max_words,)))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes))
model.add(Activation('softmax'))

summary = model.summary()
W_Input_Hidden = model.layers[0].get_weights()[0]
B_Input_Hidden= model.layers[0].get_weights()[1]

W_Output_Hidden = model.layers[3].get_weights()[0]
B_Output_Hidden= model.layers[3].get_weights()[1]

print(summary)
print('INPUT-HIDDEN LAYER WEIGHTS:')
print(W_Input_Hidden)

print('INPUT-HIDDEN LAYER BIASES:')
print(B_Input_Hidden)

print('HIDDEN-OUTPUT LAYER WEIGHTS:')
print(W_Output_Hidden)

print('OUT-HIDDEN LAYER BIASES:')
print(B_Output_Hidden)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 2)                 12        
_________________________________________________________________
activation_1 (Activation)    (None, 2)                 0         
_________________________________________________________________
dropout_1 (Dropout)          (None, 2)                 0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 6         
_________________________________________________________________
activation_2 (Activation)    (None, 2)                 0         
Total params: 18
Trainable params: 18
Non-trainable params: 0
_________________________________________________________________
None
INPUT-HIDDEN LAYER WEIGHTS:
[[ 0.8273283   0.7599194 ]
 [ 0.07757568  0.19754624]
 [ 0.8081137  -0.30484623]
 [ 0.40900648 -0.68801194]
 [ 0.

### Loss function and optimizer
A model needs a loss function and an optimizer for training. Since this is a categorical classification problem and the model outputs a probability (a single-unit layer with a sigmoid activation), we'll use the categorical_crossentropy loss function.
This isn't the only choice for a loss function, you could, for instance, choose mean_squared_error. But, generally, categorical_crossentropy is better for dealing with probabilities—it measures the "distance" between probability distributions, or in our case, between the ground-truth distribution and the predictions.
Later, when we are exploring regression problems (say, to predict the price of a house), we will see how to use another loss function called mean squared error.
Now, configure the model to use an optimizer and a loss function:

In [16]:
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

### Train the model

In [17]:
# Configure the tensorboard with specifying the log folder and timing.
#tensorboard = TensorBoard(log_dir="logs/{}".format(time()), histogram_freq=1, write_graph=True, write_images=True)
tensorboard = TensorBoard(log_dir="logs/{}".format(time()), write_graph=True, write_images=True)

history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    callbacks=[tensorboard])


Epoch 1/1


In [18]:
W_Input_Hidden = model.layers[0].get_weights()[0]
B_Input_Hidden= model.layers[0].get_weights()[1]

W_Output_Hidden = model.layers[3].get_weights()[0]
B_Output_Hidden= model.layers[3].get_weights()[1]

print(summary)
print('INPUT-HIDDEN LAYER WEIGHTS:')
print(W_Input_Hidden)

print('INPUT-HIDDEN LAYER BIASES:')
print(B_Input_Hidden)

print('HIDDEN-OUTPUT LAYER WEIGHTS:')
print(W_Output_Hidden)

print('OUT-HIDDEN LAYER BIASES:')
print(B_Output_Hidden)

None
INPUT-HIDDEN LAYER WEIGHTS:
[[ 0.8273283   0.7599194 ]
 [ 0.07683155  0.19754624]
 [ 0.8081137  -0.30484623]
 [ 0.40900648 -0.68801194]
 [ 0.47826025  0.14262772]]
INPUT-HIDDEN LAYER BIASES:
[-0.00074413  0.        ]
HIDDEN-OUTPUT LAYER WEIGHTS:
[[-0.25202236  0.85256875]
 [-0.89327574 -1.1070108 ]]
OUT-HIDDEN LAYER BIASES:
[ 0.00074412 -0.00074412]


In [19]:
# Output the model into the built-in imge in Keras
# The image can be opened in the next Markdown block
plot_model(model, to_file='./img/model_plot.png', 
           show_shapes=True, show_layer_names=True)

![title](./img/model_plot.png)

### Output the configuration, weights, gradients of each trainable layer

In [20]:
# Output the configuration and weights of each trainable layer
for layer in model.layers:
    g=layer.get_config()
    h=layer.get_weights()
    print (g)
    print (h)

{'name': 'dense_1', 'trainable': True, 'batch_input_shape': (None, 5), 'dtype': 'float32', 'units': 2, 'activation': 'linear', 'use_bias': True, 'kernel_initializer': {'class_name': 'VarianceScaling', 'config': {'scale': 1.0, 'mode': 'fan_avg', 'distribution': 'uniform', 'seed': None}}, 'bias_initializer': {'class_name': 'Zeros', 'config': {}}, 'kernel_regularizer': None, 'bias_regularizer': None, 'activity_regularizer': None, 'kernel_constraint': None, 'bias_constraint': None}
[array([[ 0.8273283 ,  0.7599194 ],
       [ 0.07683155,  0.19754624],
       [ 0.8081137 , -0.30484623],
       [ 0.40900648, -0.68801194],
       [ 0.47826025,  0.14262772]], dtype=float32), array([-0.00074413,  0.        ], dtype=float32)]
{'name': 'activation_1', 'trainable': True, 'activation': 'relu'}
[]
{'name': 'dropout_1', 'trainable': True, 'rate': 0.5, 'noise_shape': None, 'seed': None}
[]
{'name': 'dense_2', 'trainable': True, 'units': 2, 'activation': 'linear', 'use_bias': True, 'kernel_initializer'

In [21]:
# Obtain the gradients

weights = model.trainable_weights # weight tensors
weights = [weight for weight in weights if model.get_layer(weight.name.split("/")[0]).trainable] # filter down weights tensors to only ones which are trainable
gradients = model.optimizer.get_gradients(model.total_loss, weights) # gradient tensors

#print(weights)

In [22]:

input_tensors = [model.inputs[0], # input data
                 model.sample_weights[0], # how much to weight each sample by
                 model.targets[0], # labels
                 K.learning_phase(), # train or test mode
]

get_gradients = K.function(inputs=input_tensors, outputs=gradients)

In [23]:


nb_sample = 2

inputs = [x_train, # X
          np.ones(nb_sample), # sample weights
          y_train, # y
          0 # learning phase in TEST mode
]


print(list(zip(weights, get_gradients(inputs))))

[(<tf.Variable 'dense_1/kernel:0' shape=(5, 2) dtype=float32_ref>, array([[ 0.        ,  0.        ],
       [ 0.24179928, -0.0674738 ],
       [-0.10690789,  0.        ],
       [-0.10690789,  0.        ],
       [ 0.34870717, -0.0674738 ]], dtype=float32)), (<tf.Variable 'dense_1/bias:0' shape=(2,) dtype=float32_ref>, array([ 0.24179928, -0.0674738 ], dtype=float32)), (<tf.Variable 'dense_2/kernel:0' shape=(2, 2) dtype=float32_ref>, array([[-0.04983826,  0.04983826],
       [-0.10738914,  0.10738914]], dtype=float32)), (<tf.Variable 'dense_2/bias:0' shape=(2,) dtype=float32_ref>, array([-0.21890387,  0.21890387], dtype=float32))]


In [24]:
text_labels = encoder.classes_ 
input_sentence = ''
while(1):
    try:
        # Get input sentence
        input_sentence = input('> ')
        # Check if it is quit case
        if input_sentence == 'q' or input_sentence == 'quit': 
            break
        # Normalize sentence
        input_sentence = tokenize.texts_to_matrix([input_sentence])       
        pred_output = model.predict(np.array(input_sentence))
        pred_prob=model.predict_proba(np.array(input_sentence))
        predicted_label = text_labels[np.argmax(pred_output)]
        print('Bot:', ' '.join([predicted_label,'Probality:',str(pred_prob[0,np.argmax(pred_output)])]))
    except KeyError:
        print("Error: Encountered unknown word.")

> q


In [None]:
model.layers[0].trainable=False