# Step 1. Data Processing

In [1]:
#1.a. Importing libraries
import sys
import os
import json
import pandas
import numpy
import optparse

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from keras import backend as K

Using TensorFlow backend.


In [2]:
#1.b Reading the data
# The file has no header row (header=None) and uses '|' as the quote character.
dataframe = pandas.read_csv(
    "dev-access.csv",
    header=None,
    quotechar='|',
    engine='python',
)

In [3]:
#1.c Convert the DataFrame to a numpy.ndarray so columns can be sliced by position
dataset = dataframe.values

In [4]:
#1.d. Check the shape of the data set as (rows, columns)
dataset.shape

(26773, 2)

In [6]:
#1.e. Store all rows of column 0 as the feature data (the raw log strings, parsed as JSON in step 1.g)
X = dataset[:,0]

In [7]:
#1.f. Store all rows of column 1 as the target variable
Y = dataset[:,1]

In [8]:
#1.g. Clean up the predictors: remove the fields that are not valuable
# (timestamp, headers, source, route, responsePayload) from every log entry
# and store the entry back as compact JSON.
FIELDS_TO_DROP = ('timestamp', 'headers', 'source', 'route', 'responsePayload')

for index, item in enumerate(X):
    # OrderedDict keeps the remaining keys in their original order
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    for field in FIELDS_TO_DROP:
        # pop with a default instead of del: X is overwritten in place below,
        # so re-running this cell (or an entry lacking a field) must not
        # raise KeyError
        reqJson.pop(field, None)
    # separators=(',', ':') writes the JSON without any whitespace
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [9]:
#1.h Tokenize the cleaned log strings character by character
# Only tab and newline are filtered out of the text.
tokenizer = Tokenizer(char_level=True, filters='\t\n')
tokenizer.fit_on_texts(X)

# Needed later as the vocabulary size of the Embedding layers
# (one more than the number of distinct tokens the tokenizer indexed)
num_words = 1 + len(tokenizer.word_index)
X = tokenizer.texts_to_sequences(X)

In [10]:
#1.i. Pad the data, as each observation has a different length
# max_log_length is reused below as the input_length of every Embedding layer
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [11]:
#1.j. Hold out 25% of the observations as a test set; the fixed random_state keeps the split repeatable
X_train, X_test, Y_train, Y_test = train_test_split(
    X_processed,
    Y,
    test_size=.25,
    random_state=0,
)

# Step 2. Model 1 - RNN: The first model will be a pretty minimal RNN with only an embedding layer, LSTM layer, and Dense layer.

In [12]:
#Using the relu activation function for the output layer
# NOTE(review): relu is unbounded above, so this output is not a probability;
# it is kept as the baseline that the sigmoid variant below is compared against.
K.clear_session()
model = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    LSTM(64, recurrent_dropout=0.5),
    Dense(1, activation='relu'),
])

model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Train the relu-output model for 3 epochs, holding out 25% of the training data for validation
history = model.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3


In [13]:
# Score the relu-output model on the held-out test split; the result is [loss, accuracy]
model.evaluate(X_test,Y_test, batch_size=128)



[0.566360255444552, 0.6411711979631812]

In [18]:
#running the same model but with a sigmoid activation function in the output layer
# Everything except the output activation matches the relu model above.
K.clear_session()
modelt = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    LSTM(64, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])

modelt.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

modelt.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [19]:
# Train the sigmoid-output model with the same settings as the relu model
history = modelt.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [21]:
# Score the sigmoid-output model on the held-out test split; the result is [loss, accuracy]
modelt.evaluate(X_test,Y_test, batch_size=128)



[0.12534750444179582, 0.9740065731751516]

# Step 3. Model 2 - RNN + Dropout Layers + New Activation Function:

In [22]:
# Model 2: the sigmoid-output RNN with a Dropout(0.5) layer after the
# embedding and another after the LSTM.
K.clear_session()
model2 = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model2.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['acc'],
)

model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train Model 2 for 3 epochs, holding out 25% of the training data for validation
history = model2.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [24]:
# Score Model 2 on the held-out test split; the result is [loss, accuracy]
model2.evaluate(X_test,Y_test, batch_size=128)



[0.19024268955588874, 0.9514490589833381]

#  Step 4. Build Your Own Model

In [25]:
# Own model: two stacked LSTM layers with dropout between them, trained with rmsprop.
K.clear_session()
model3 = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    Dropout(0.5),
    # return_sequences=True emits the full sequence so the second LSTM has a sequence to consume
    LSTM(64, recurrent_dropout=0.5, return_sequences=True),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation='sigmoid'),
])

model3.compile(
    loss='binary_crossentropy',
    optimizer='rmsprop',
    metrics=['acc'],
)

model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
dropout_1 (Dropout)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 1024, 64)          24832     
_________________________________________________________________
dropout_2 (Dropout)          (None, 1024, 64)          0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_3 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total para

In [26]:
# Train the stacked-LSTM model for 3 epochs, holding out 25% of the training data for validation
history = model3.fit(
    X_train,
    Y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Train on 15059 samples, validate on 5020 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [27]:
# Score the stacked-LSTM model on the held-out test split; the result is [loss, accuracy]
model3.evaluate(X_test,Y_test, batch_size=128)



[0.11744119359159741, 0.9687780102830094]

# 5.  Explain the difference between the relu activation function and the sigmoid activation function.

The ReLU function is simply y(x) = max(0, x). It is not differentiable at 0 and, because it is unbounded for positive inputs, may result in exploding gradients. The function and its derivative are both monotonic. For positive inputs its gradient is exactly 1, so ReLU does not suffer from the vanishing gradients problem in the way saturating functions do. The ReLU function is non-linear, which means we can easily backpropagate the errors and have multiple layers of neurons being activated by the ReLU function.
The main advantage of using the ReLU function over other activation functions is that it does not activate all the neurons at the same time. If the input is negative, it is converted to zero and the neuron is not activated. This means that only a few neurons are active at any one time, which makes the network sparse and therefore efficient and easy to compute.

The sigmoid function is of the form y(x) = 1/(1 + e^(-x)) and takes values in the interval (0, 1). Therefore, it is especially used for models where we have to predict a probability as the output. The function is differentiable, which means we can find the slope of the sigmoid curve at any point. The function is monotonic but its derivative is not. The biggest drawback of this function is that it suffers from the vanishing gradients problem: for inputs of large magnitude the curve saturates and its gradient approaches zero.

Sigmoid works better than ReLU in the output layer of a binary classifier: because of the steep gradient in the central portion of the function, it pushes outputs towards the two extreme ends. ReLU works better in the hidden layers, as it is less computationally expensive than sigmoid because it involves simpler mathematical operations, and it also does not lead to vanishing gradients.

# 6. In regards to question 5, which of these activation functions performed the best (they were used in Model 1 & Model 2) ? Why do you think that is?

With regard to question 5, sigmoid performs better. Running the same model from Step 2 with sigmoid and with relu as the activation function in the output layer, we see that sigmoid does a much better job of classifying the output as 1 or 0 (test accuracy of about 0.97 versus about 0.64). This is because it has a very steep gradient in the central part of the function (between x values of -2 and 2), which means that any small change in the value of X in that region causes the value of Y to change significantly. This tends to bring the Y values to either end of the curve and makes it better as a classifier. In addition, the binary cross-entropy loss expects a probability between 0 and 1, which sigmoid always produces, whereas relu can output exactly 0 or values greater than 1. If we were using these activation functions in hidden layers, then relu would have been a better choice, as it does not suffer from the vanishing gradients problem.

# 7. Explain how dropout works (you can look at the keras code) for (a) training, and (b) test data sets.

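(a) For training - During the training phase, dropout is implemented by randomly selecting nodes to be dropped out with a given probability (e.g. 20%; the models above use 50%) at each weight update cycle. In Keras, the activations that are kept are scaled up by 1/(1 - rate) so that their expected sum is unchanged. This helps prevent overfitting. Dropout is applied only during the training phase.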

(b) For testing - Dropout is only used during the training of a model and is not used when evaluating the skill of the model. Keras disables dropout during the test phase, so every node is active and the inputs pass through the dropout layers unchanged.


# 8. Explain why problems such as this are better modeled with RNNs than CNNs.

The problem given has data which is temporally related. The convolution operation in a CNN allows a network to share parameters across time, but is shallow. The output of convolution is a sequence where each member of the output is a function of a small number of neighboring members of the input. The idea of parameter sharing manifests in the application of the same convolution kernel at each time step.

Recurrent networks share parameters in a different way. Each member of the output is a function of the previous members of the output. Each member of the output is produced using the same update rule applied to the previous outputs. This recurrent formulation results in the sharing of parameters through a very deep computational graph.

Parameter sharing makes it possible to extend and apply the model to examples of different forms (different lengths, here) and generalize across them. If we had separate parameters for each value of the time index, we could not generalize to sequence lengths not seen during training, nor share statistical strength across different sequence lengths and across different positions in time.

It is due to this property of parameter sharing through a deep computational graph that problems such as this are better modeled with RNNs than with CNNs.

# 9. Explain what RNN problem is solved using LSTM and briefly describe how.

LSTM solves the vanishing (and exploding) gradient problem of RNNs. In a multi-layer network, gradients for deeper layers are calculated as products of many gradients (of activation functions). When those gradients are small or zero, their product will easily vanish. (On the other hand, when they’re bigger than 1, the product may explode.) So the gradients become very hard to calculate and the weights very hard to update.

In the recurrency of the LSTM the activation function is the identity function with a derivative of 1.0. So, the backpropagated gradient neither vanishes nor explodes when passing through, but remains constant.

The effective weight of the recurrency is equal to the forget gate activation. So, if the forget gate is on (activation close to 1.0), then the gradient does not vanish. Since the forget gate activation is never >1.0, the gradient can't explode either.
So that's why LSTM is so good at learning long range dependencies.