In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

import sys
import os
import json
import pandas
import numpy
import optparse

from keras.callbacks import TensorBoard
from keras.models import Sequential, load_model
from keras.layers import LSTM, Dense, Dropout, SimpleRNN, Masking, Embedding
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from collections import OrderedDict
from tensorflow.keras.optimizers import SGD


import sklearn
from sklearn.model_selection import train_test_split

### 1. Data Processing

In [2]:
# read the code
dataframe = pandas.read_csv("dev-access.csv", engine='python', quotechar='|', header=None)

In [3]:
# convert to a numpy.ndarray type
dataset = dataframe.values

In [4]:
# check the shape of the data se
dataset.shape 

(26773, 2)

In [5]:
# store all rows and the 0th index as the feature data
X = dataset[:,0]

In [6]:
# store all rows and index 1 as the target variable
Y = dataset[:,1]

In [7]:
# clean up the predictors (including removing features that are not valuable, such as timestamp and source)
for index, item in enumerate(X):
    # Quick hack to space out json elements
    reqJson = json.loads(item, object_pairs_hook=OrderedDict)
    del reqJson['timestamp']
    del reqJson['headers']
    del reqJson['source']
    del reqJson['route']
    del reqJson['responsePayload']
    X[index] = json.dumps(reqJson, separators=(',', ':'))

In [8]:
# tokenize our data, which just means vectorizing our text
# given the data we will tokenize every character (thus char_level = True)
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

# we will need this later
num_words = len(tokenizer.word_index)+1
X = tokenizer.texts_to_sequences(X)

In [9]:
# pad our data as each observation has a different length
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [10]:
# create the train set to be 75% of the data and the test set to be 25%
np.random.seed(42)
X_train, X_test, y_train, y_test = train_test_split(X_processed, Y, test_size=0.25, random_state=42)

X_train = X_train.astype('float64')
y_train = y_train.astype('float64')
X_test = X_test.astype('float64')
y_test = y_test.astype('float64')

### 2. Model 1 - RNN

In [11]:
model = Sequential()

Embedding(
    input_dim = num_words,
    output_dim = 32,
    input_length = max_log_length)

SimpleRNN(
    units = 32,
    activation = 'relu')

Dense(
    units = 1,
    activation = 'relu')

model.add(Embedding(input_dim = num_words,output_dim = 32,input_length = max_log_length))
model.add(SimpleRNN(units = 32,activation = 'relu'))
model.add(Dense(units = 1,activation = 'relu'))

model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])   

In [12]:
# print the model summary
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
simple_rnn_1 (SimpleRNN)     (None, 32)                2080      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________


In [13]:
# use the .fit() method to fit the model on the train data
model.fit(X_train, y_train, validation_split = 0.25, epochs = 3, batch_size = 128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1652a4970>

In [15]:
# use the .evaluate() method to get the loss value & the accuracy value on the test data
scores = model.evaluate(X_test, y_test, batch_size = 128)
#print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
scores



[0.6351475715637207, 0.649387538433075]

### 3. Model 2 - LSTM + Dropout Layers

In [16]:
model_2 = Sequential()

model_2.add(Embedding(input_dim = num_words,output_dim = 32,input_length = max_log_length))
model_2.add(LSTM(units = 64,recurrent_dropout = 0.5))
model_2.add(Dropout(0.5))
model_2.add(Dense(units = 1,activation = 'relu'))

model_2.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy']) 

In [17]:
# print the model summary
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [18]:
# use the .fit() method to fit the model on the train data
model_2.fit(X_train, y_train, validation_split = 0.25, epochs = 3, batch_size = 128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1276b01f0>

In [19]:
# use the .evaluate() method to get the loss value & the accuracy value on the test data
scores = model_2.evaluate(X_test, y_test, batch_size = 128)
#print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
scores



[0.29662224650382996, 0.9626531004905701]

### 4. Recurrent Neural Net Model 3: Build Your Own

In [30]:
model_3 = Sequential()

# Embedding layer
model_3.add(Embedding(input_dim=num_words, input_length = max_log_length, output_dim=32))

# Masking layer for pre-trained embeddings
model_3.add(Masking(mask_value=0.0))

# Recurrent layer
model_3.add(LSTM(64, return_sequences=False, recurrent_dropout=0.5))

# Fully connected layer
model_3.add(Dense(64, activation='relu'))

# Dropout for regularization
model_3.add(Dropout(0.5))

# Output layer
model_3.add(Dense(units = 1, activation='sigmoid'))

# Compile the model
model_3.compile(optimizer='SGD', loss='binary_crossentropy', metrics=['accuracy'])

In [31]:
model_3.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
masking_3 (Masking)          (None, 1024, 32)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 64)                24832     
_________________________________________________________________
dense_9 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout_4 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_10 (Dense)             (None, 1)                 65        
Total params: 31,073
Trainable params: 31,073
Non-trainable params: 0
__________________________________________________

In [32]:
model_3.fit(X_train, y_train, validation_split = 0.25, epochs = 3, batch_size = 128)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x15abae490>

In [33]:
# use the .evaluate() method to get the loss value & the accuracy value on the test data
scores = model_3.evaluate(X_test, y_test, batch_size = 128)
#print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
scores



[0.6925635933876038, 0.488646537065506]

### Conceptual Questions

* Explain the difference between the relu activation function and the sigmoid activation function.

    * They are both activation function, which decides the output of a particular node in the neural network. Both function should differentiable, non-linear, and easy to handle.
    * The Sigmoid output is defined by following equation: 
    $y(x) = \frac{1}{1 + e^{-x}}$ 
        * Pros: Sigmoid is differentiable, non-linear, produces non-binary activations and it is bounded between (0,1).
        * Cons: It can cause neural networks to suffer from the vanishing gradient problem since error is backpropagated through the layers and decreases dramatically with each hidden layer. The values are between 0 & 1 and it will be zero when value of the activation reaches 0 or 1 (the horizontal part of the curve). Also, it becomes increasingly more difficult for the neural network to adapt as the layers go up, thus imporve performance. 
        
    * ReLU takes an input and directly outputs the input if positive and outputs 0 if negative. The function of Relu is:
    $𝑦(𝑥)=𝑚𝑎𝑥(0,𝑥)$
        * Pros: Relu doesn’t have a flat curve, it avoids vanishing gradient problem. Unlike sigmoid, reLU is called a piecewise function, because half of the output is linear (the positive output) while the other half is nonlinear. 
        * Cons: Relu is not differentiable at 0 and may result in exploding gradients.

* Describe what one epoch actually is (epoch was a parameter used in the .fit() method).
    * The number of epochs is a hyperparameter of gradient descent that controls the number of complete passes through the training dataset. One epoch means that each sample in the training dataset has had an opportunity to update the internal model parameters. An epoch is comprised of one or more batches.

* Explain how dropout works (you can look at the keras code and/or documentation) for (a) training, and (b) test data sets.
    * Dropout works by randomly setting the outgoing edges of hidden units (neurons that make up hidden layers) to 0 at each update of the training phase. It is a technique used to prevent a model from overfitting.

* Explain why problems such as this homework assignment are better modeled with RNNs than CNNs. What type of problem will CNNs outperform RNNs on?

    * RNN is designed to work with sequence prediction problems, which were best described by the types of inputs and outputs supported. RNNs in general works very well with sequences of words and paragraphs (NLP), such as text data. Moreover, RNN is used in classification prediction problems, regression prediction problems, and generative models. However, RNNs are not appropriate for tabular datasets or image data input.
    * However, CNNs were designed to map image data to an output variable. The benefit of using CNNs is the ability to develop an internal representation of a two-dimensional image. This allows the model to learn position and scale in variant structures in the data, which is important when working with images. More generally, CNNs work better with data that has a spatial relationship.
    * In this case, our homework assignment are better modeled with RNNs than CNNs.

* Explain what RNN problem is solved using LSTM and briefly describe how.
    * The Long Short-Term Memory (LSTM) networks are capable of learning long-term dependencies. They are designed to avoid the long-term dependency problem. All recurrent neural networks have the form of a chain of repeating modules of neural network. In standard RNNs, this repeating module will have a very simple structure, such as a single tanh layer. LSTMs also have this chain like structure, but the repeating module has a different structure. Instead of having a single neural network layer, there are four, interacting in a very special way. LSTM overcomes the problem that RNNs have -- the problems of training a recurrent network, and it in turn has been used on a wide range of applications.