In [1]:
 # a) Import the following libraries:
import pandas as pd
import numpy as np
import os
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
import sys
from keras.layers.embeddings import Embedding
from keras.layers import LSTM,Dense,Dropout
from keras.models import Sequential,load_model
from tensorflow.keras.callbacks import TensorBoard
from collections import OrderedDict
import optparse
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
import json
from keras.layers import SimpleRNN

# 1. Data Processing:

In [2]:
# b) We will read the data in slightly differently than before:
# header=None: the first line of the CSV is treated as data, not column names.
# quotechar='|': fields are quoted with '|' instead of '"' -- presumably because
# each log entry is JSON text full of '"' and ',' characters (TODO confirm
# against the raw file).
df = pd.read_csv("dev-access.csv", engine='python', quotechar='|', header=None)

In [3]:
 # c) We then need to convert to a numpy.ndarray type:
# np.asarray() yields the same object array as DataFrame.values, but it is a
# no-op when `df` is already an ndarray, so re-running this cell no longer
# fails with "'numpy.ndarray' object has no attribute 'values'".
df = np.asarray(df)
# Show the first row: [raw JSON log entry, label].
df[0]

array(['{"timestamp":1502738402847,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237","referer":"http://localhost:8002/enter"},"route":"/login","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","content-type":"application/json","content-length":"36"},"requestPayload":{"username":"Carl2","password":"bo"},"responsePayload":{"statusCode":401,"error":"Unauthorized","message":"Invalid Login"}}',
       0], dtype=object)

In [4]:
 # d) Check the shape of the data set - it should be (26773, 2). Spend some time looking at the data.
df.shape


(26773, 2)

In [5]:
df[0:6]

array([['{"timestamp":1502738402847,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237","referer":"http://localhost:8002/enter"},"route":"/login","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","content-type":"application/json","content-length":"36"},"requestPayload":{"username":"Carl2","password":"bo"},"responsePayload":{"statusCode":401,"error":"Unauthorized","message":"Invalid Login"}}',
        0],
       ['{"timestamp":1502738402849,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-t

In [6]:
# e) Store all rows and the 0th index as the feature data:
# Column 0 holds the raw JSON log entries (one string per row).
# NOTE: basic slicing of an ndarray returns a view, so writing into X also
# writes into df.
X = df[:,0]
X

array(['{"timestamp":1502738402847,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237","referer":"http://localhost:8002/enter"},"route":"/login","headers":{"host":"localhost:8002","accept-language":"en-us","accept-encoding":"gzip, deflate","connection":"keep-alive","accept":"*/*","referer":"http://localhost:8002/enter","cache-control":"no-cache","x-requested-with":"XMLHttpRequest","content-type":"application/json","content-length":"36"},"requestPayload":{"username":"Carl2","password":"bo"},"responsePayload":{"statusCode":401,"error":"Unauthorized","message":"Invalid Login"}}',
       '{"timestamp":1502738402849,"method":"post","query":{},"path":"/login","statusCode":401,"source":{"remoteAddress":"88.141.113.237"},"route":"/login","headers":{"host":"localhost:8002","connection":"keep-alive","cache-control":"no-cache","accept":"*/*","accept-encoding":"gzip, deflate, br","accept-language":"en-US,en;q=0.8,es;q=0.6","content-type":"applicat

In [7]:
# f) Store all rows and index 1 as the target variable:
# Column 1 holds the 0/1 labels. Because df is an object array, Y is
# dtype=object as well and has to be cast to a numeric dtype before training.
Y = df[:,1]
Y

array([0, 0, 0, ..., 1, 1, 1], dtype=object)

In [8]:
# g) In the next step, we will clean up the predictors. This includes removing features that are not valuable,
# such as timestamp and source.

# Top-level JSON keys removed from every log entry before tokenizing.
_DROPPED_KEYS = ('timestamp', 'headers', 'source', 'route', 'responsePayload')


def _strip_log_entry(raw_entry, dropped_keys=_DROPPED_KEYS):
    """Return `raw_entry` re-serialized as compact JSON without `dropped_keys`.

    Args:
        raw_entry: one log entry as a JSON object string.
        dropped_keys: top-level keys to remove; keys that are absent are
            skipped, so an already-cleaned entry passes through unchanged.

    Returns:
        The remaining key/value pairs, in their original order, dumped with
        no whitespace between separators.

    Raises:
        json.JSONDecodeError: if `raw_entry` is not valid JSON.
    """
    # OrderedDict keeps the key order of the original log line.
    entry = json.loads(raw_entry, object_pairs_hook=OrderedDict)
    for key in dropped_keys:
        # pop() with a default instead of `del`: no KeyError when the key is
        # missing, which also makes this cell safe to re-run.
        entry.pop(key, None)
    return json.dumps(entry, separators=(',', ':'))


# Build a new object array instead of assigning into X element by element:
# X is a view of df's first column, so in-place writes would silently
# overwrite the raw data held in df.
X = np.array([_strip_log_entry(item) for item in X], dtype=object)

In [9]:
X[0]

'{"method":"post","query":{},"path":"/login","statusCode":401,"requestPayload":{"username":"Carl2","password":"bo"}}'

In [10]:
# h) We next will tokenize our data, which just means vectorizing our text. Given the data we will tokenize every character
# (thus char_level = True)

# filters='\t\n' strips only tabs and newlines, so JSON punctuation such as
# '{', '"' and ':' is kept and gets its own token.
tokenizer = Tokenizer(filters='\t\n', char_level=True)
tokenizer.fit_on_texts(X)

# we will need this later (as input_dim of the Embedding layers).
# +1 because word_index numbering starts at 1; index 0 is left for padding.
num_words = len(tokenizer.word_index) + 1
# NOTE: X is rebound here from an array of JSON strings to a list of integer
# sequences, so this cell must not be re-run without re-creating X first.
X = tokenizer.texts_to_sequences(X)

In [137]:
dict(list(tokenizer.word_index.items())[0:20])

{'"': 1,
 'e': 2,
 't': 3,
 ':': 4,
 'a': 5,
 's': 6,
 'o': 7,
 'u': 8,
 'r': 9,
 ',': 10,
 'd': 11,
 'l': 12,
 'p': 13,
 'h': 14,
 'y': 15,
 'q': 16,
 'c': 17,
 '{': 18,
 '}': 19,
 'm': 20}

In [138]:
X[0][0:20]

[18, 1, 20, 2, 3, 14, 7, 11, 1, 4, 1, 13, 7, 6, 3, 1, 10, 1, 16, 8]

18 means '{' and 1 means '"', as defined by the word index.
So this is just the first row, but encoded as numbers now.

In [139]:
X[0]

[18,
 1,
 20,
 2,
 3,
 14,
 7,
 11,
 1,
 4,
 1,
 13,
 7,
 6,
 3,
 1,
 10,
 1,
 16,
 8,
 2,
 9,
 15,
 1,
 4,
 18,
 19,
 10,
 1,
 13,
 5,
 3,
 14,
 1,
 4,
 1,
 25,
 12,
 7,
 26,
 24,
 21,
 1,
 10,
 1,
 6,
 3,
 5,
 3,
 8,
 6,
 17,
 7,
 11,
 2,
 1,
 4,
 23,
 22,
 29,
 10,
 1,
 9,
 2,
 16,
 8,
 2,
 6,
 3,
 13,
 5,
 15,
 12,
 7,
 5,
 11,
 1,
 4,
 18,
 1,
 8,
 6,
 2,
 9,
 21,
 5,
 20,
 2,
 1,
 4,
 1,
 17,
 5,
 9,
 12,
 28,
 1,
 10,
 1,
 13,
 5,
 6,
 6,
 32,
 7,
 9,
 11,
 1,
 4,
 1,
 40,
 7,
 1,
 19,
 19]

In [11]:
# i) Need to pad our data as each observation has a different length
# Every sequence is brought to exactly max_log_length entries. With the
# pad_sequences defaults the zeros are added at the front (see the leading 0s
# in the output of the next cell); longer sequences are presumably truncated
# from the front as well -- TODO confirm if any log exceeds 1024 characters.
max_log_length = 1024
X_processed = sequence.pad_sequences(X, maxlen=max_log_length)

In [12]:
X_processed

array([[ 0,  0,  0, ...,  1, 19, 19],
       [ 0,  0,  0, ...,  1, 19, 19],
       [ 0,  0,  0, ...,  1, 19, 19],
       ...,
       [ 0,  0,  0, ...,  1, 19, 19],
       [ 0,  0,  0, ...,  1, 19, 19],
       [ 0,  0,  0, ...,  1, 19, 19]], dtype=int32)

In [21]:
np.unique(X_processed[0], return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
        17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 28, 29, 32, 40],
       dtype=int32),
 array([909,  22,   7,   6,   7,   7,   7,   7,   4,   5,   5,   4,   3,
          4,   2,   2,   2,   2,   3,   3,   2,   2,   1,   1,   1,   1,
          1,   1,   1,   1,   1]))

In [13]:
# j) Hold out 25% of the rows as the test set and train on the remaining 75%.
# random_state is fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X_processed,
    Y,
    test_size=0.25,
    random_state=0,
)

# Model 1: Base RNN

In [14]:
# a) Start by creating an instance of a Sequential model:
# Layers are stacked onto this empty container in the cells that follow.
model_1 = Sequential()


In [15]:
# b) From there, add an Embedding layer: https://keras.io/layers/embeddings/
#
# Maps each character index to a trainable 32-dimensional vector:
#   input_dim    -> num_words (vocabulary size computed after tokenizing)
#   output_dim   -> 32
#   input_length -> max_log_length (the padded sequence length)
# Every other argument keeps its Keras default.
model_1.add(
    Embedding(
        input_dim=num_words,
        output_dim=32,
        input_length=max_log_length,
    )
)


In [16]:
# c) Add a SimpleRNN layer with 32 recurrent units and a ReLU activation.
rnn_layer = SimpleRNN(units=32, activation="relu")
model_1.add(rnn_layer)


In [17]:
# d) Finally, we will add a Dense layer:
# Params:
# units = 1 (this will be our output)
# activation --> you can choose to use either relu or sigmoid.
# sigmoid is used here: it keeps the single output in (0, 1), which is what
# the binary_crossentropy loss in the compile step expects.

model_1.add(Dense(units=1, activation='sigmoid'))


In [18]:
# e) Compile the model: binary cross-entropy loss, Adam optimizer, and
# accuracy as the reported metric.
model_1.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)


In [19]:
model_1.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 1024, 32)          2016      
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 32)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 33        
Total params: 4,129
Trainable params: 4,129
Non-trainable params: 0
_________________________________________________________________


In [33]:
# The labels come out of the split as dtype=object; cast them to int32 for Keras.
y_train = np.array(y_train, dtype=np.int32)

In [38]:
# Same cast for the held-out labels used by evaluate().
y_test = np.array(y_test, dtype=np.int32)

In [34]:
# g) Fit Model 1 on the training split: 3 epochs, mini-batches of 128, with
# 25% of the training rows held out for validation.
model_1.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
    verbose=1,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fefe26e5450>

In [55]:
# Score Model 1 on the held-out test split. The model was compiled with a
# single metric, so evaluate() returns [loss, accuracy].
model_1_metrics = model_1.evaluate(X_test, y_test, batch_size=128)
for caption, score in zip(('Model 1 Test loss:', 'Model 1 Test accuracy:'), model_1_metrics):
    print(caption, score)

Model 1 Test loss: 0.06656002998352051
Model 1 Test accuracy: 0.9840155243873596


# Part III. Model 2 - LSTM + Dropout Layers

a) This RNN needs to have the following layers (add in this order):
Embedding Layer (use same params as before)
LSTM Layer (units = 64, recurrent_dropout = 0.5)
Dropout Layer - use a value of 0.5
Dense Layer - (use same params as before)

In [40]:
# Model 2: start from an empty Sequential container; layers are added below.
model_2 = Sequential()


In [41]:
 # Embedding layer -- same parameters as in Model 1.
model_2.add(
    Embedding(
        input_dim=num_words,
        output_dim=32,
        input_length=max_log_length,
    )
)


In [42]:
# LSTM layer: 64 units, with dropout of 0.5 applied to the recurrent connections.
lstm_layer = LSTM(units=64, recurrent_dropout=0.5)
model_2.add(lstm_layer)


In [43]:
# Regularization: a Dropout layer with rate 0.5 on the LSTM output.
dropout_layer = Dropout(rate=0.5)
model_2.add(dropout_layer)


In [44]:
# Output layer: a single sigmoid unit, same as in Model 1.
output_layer = Dense(units=1, activation="sigmoid")
model_2.add(output_layer)

In [45]:
# b) Compile Model 2 with the same settings as Model 1: binary cross-entropy
# loss, Adam optimizer, accuracy metric.
model_2.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=['accuracy'],
)

In [46]:
model_2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm (LSTM)                  (None, 64)                24832     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 26,913
Trainable params: 26,913
Non-trainable params: 0
_________________________________________________________________


In [47]:
# Train Model 2 with the same schedule as Model 1 (3 epochs, batches of 128,
# 25% of the training rows used for validation).
model_2.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=3,
    validation_split=0.25,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fefc4f73c10>

In [56]:
# e) Score Model 2 on the test data with .evaluate(), again using a batch
# size of 128. The result is [loss, accuracy].
model_2_metrics = model_2.evaluate(X_test, y_test, batch_size=128)
for caption, score in zip(('Model 2 Test loss:', 'Model 2 Test accuracy:'), model_2_metrics):
    print(caption, score)

Model 2 Test loss: 0.10024955868721008
Model 2 Test accuracy: 0.977293074131012


# 4) Recurrent Neural Net Model 3: Build Your Own

You will now create your own RNN based on what you have learned from Model 1 & Model 2:
a) RNN Requirements:
Use 5 or more layers
Add a layer that was not utilized in Model 1 or Model 2 (Note: This could be a new Dense layer or an
additional LSTM)

In [61]:
# Model 3: two stacked LSTM layers, each followed by dropout, on top of the
# same character embedding used by Models 1 and 2.
model_3 = Sequential([
    Embedding(num_words, 32, input_length=max_log_length),
    # return_sequences=True passes the full per-timestep output on to the
    # second LSTM instead of only the final state.
    LSTM(64, recurrent_dropout=0.5, return_sequences=True),
    Dropout(0.5),
    LSTM(64, recurrent_dropout=0.5),
    Dropout(0.5),
    Dense(1, activation="sigmoid"),
])

# Unlike Models 1 and 2, this model is optimized with RMSprop.
model_3.compile(
    loss="binary_crossentropy",
    optimizer="rmsprop",
    metrics=['accuracy'],
)

model_3.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 1024, 32)          2016      
_________________________________________________________________
lstm_5 (LSTM)                (None, 1024, 64)          24832     
_________________________________________________________________
dropout_6 (Dropout)          (None, 1024, 64)          0         
_________________________________________________________________
lstm_6 (LSTM)                (None, 64)                33024     
_________________________________________________________________
dropout_7 (Dropout)          (None, 64)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 59,937
Trainable params: 59,937
Non-trainable params: 0
__________________________________________________

In [62]:
# Train Model 3 with the same schedule as the first two models.
model_3.fit(
    X_train,
    y_train,
    validation_split=0.25,
    batch_size=128,
    epochs=3,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7fef55113950>

In [63]:
# Score Model 3 on the held-out test split; the result is [loss, accuracy].
model_3_metrics = model_3.evaluate(X_test, y_test, batch_size=128)
for caption, score in zip(('Model 3 Test loss:', 'Model 3 Test accuracy:'), model_3_metrics):
    print(caption, score)

Model 3 Test loss: 0.07269810140132904
Model 3 Test accuracy: 0.9796832799911499


# Conceptual Questions:

### 5) Explain the difference between the relu activation function and the sigmoid activation function.

In deep learning, the activation function decides what is passed on to the next neuron within the network, i.e. whether a neuron gets activated or not. This mechanism plays a crucial role in letting the network learn complex patterns in the data that is fed to it. ReLU and sigmoid are two of the most widely used activation functions in deep learning.

The sigmoid function is one of the most popular activation functions; it is widely used in binary classification problems, and its output lies between 0 and 1. It is a smooth curve, which makes it differentiable at every point. The sigmoid function is very sensitive to changes in input values that lie roughly between -2 and 2; for inputs outside this range it saturates, with outputs pinned close to 0 or 1. This bounded nature of the sigmoid function is a limitation and a weakness: in the saturated regions the error gradients drop close to zero, which is called the vanishing gradient problem, where the model stagnates and does not learn or improve in performance. A typical deep neural network uses backpropagation to calculate the error gradients and then perform an update on the weights. If the gradient of the sigmoid function turns out to be very small, this leads to the vanishing gradient problem, where the weights of the lower layers remain virtually unchanged and training never really converges to a good solution. That is the reason ReLU is often preferred to sigmoid in hidden layers.

ReLU (Rectified Linear Unit) is an activation function that does not saturate for positive inputs, because its output values range from 0 to infinity. This is one of the reasons why gradient descent works well with ReLU. The ReLU function outputs 0 if the input is non-positive and outputs the input itself if the input is positive. ReLU is a piecewise linear function: it is the identity on the positive half of its domain and constant (zero) on the other half, and it is the kink at zero that makes the function as a whole non-linear. The ReLU function helps preserve the complex properties of the input data, because the linear part of the function keeps the properties that make linear models easy to optimize with gradient descent, with no vanishing gradient problem for positive inputs. However, the gradient is zero for negative inputs (and undefined exactly at zero), so no learning signal flows through a unit in that region. That is why it is recommended to start with very small but positive initial values. The ReLU function is fast and efficient because it does not activate all of the neurons within the network, and that is one of the main advantages of using ReLU. ReLU is perfectly capable of outputting a true 0 value, in contrast to the sigmoid function, which can only approximate a 0 output (i.e. a value very close to 0 but not an actual true 0). This is a direct consequence of how ReLU operates, and the allowance of such true 0
values means that certain neurons will not be activated, leading to a sparse representation (a sparse activation matrix). This happens when ReLU converts negative values to 0, and it is a desirable property because it accelerates the learning process, simplifies the model and reduces the amount of computation, making ReLU less
computationally expensive than the sigmoid function. Even an activation function like ReLU has a limitation and a drawback: where the gradient is 0, neurons can become stuck (inactive or "dead" neurons),
because such neurons stop responding to variations in the input/error. This particular problem is called the dying ReLU problem; it can be mitigated by using a variant of the ReLU function such as LeakyReLU, which allows a small, positive gradient when the unit is not active, or by using a smaller learning rate.





### 6) Describe what one epoch actually is (epoch was a parameter used in the .fit() method).

If we take a particular neural network into consideration, an epoch is considered to be completed when the entirety of the training dataset has been passed forward and backward through the entire neural network exactly once. The `epochs` argument of `.fit()` is a hyperparameter that indicates how many times all of the training vectors are used to update the weights of the nodes in the neural network (the internal model parameters). In other words, the number of epochs is the number of complete passes through the entire training dataset.

We divide the entire training dataset into multiple smaller batches, because the whole dataset is usually too big to be fed into the model at once. That is why one more parameter comes into consideration: the batch_size. A single epoch is complete when all of the batches within the dataset have been used for training.
Let's take an example where we have data of 5000 datapoints/rows. We can divide the entire data into batches of 1000. In this case 1000 will be the batch_size, and it will take 5 iterations to complete 1 epoch. The weights of the
neurons are updated during each iteration. The performance of our deep learning model can only improve when we pass the entire training dataset through the same neural network multiple times and compute the error gradients through backpropagation, because passing the entire dataset through the network just once (i.e. 1 epoch) is not enough. At the same time, it is also important to make sure that we do not go through too many training epochs, as that would be counter-productive: it would lead to the model learning all the unnecessary noise within the training data and overfitting.

The accuracy of the model's predictions on the training data increases with the number of epochs, which in turn means a decrease in the model error (bias). For the validation set, however, the error reaches a minimum beyond which it starts to increase again. This usually happens when the predictive model starts to overfit and incorporates the noise instead of the general patterns within the training data. Such an overfit model cannot perform accurately when it is exposed to new, unseen data. Early stopping is a very effective technique against this overfitting: training is stopped at the point where the validation error reaches its minimum, so that we don't over-train our model.




### 7) Explain how dropout works (you can look at the keras code and/or documentation) for (a) training, and (b) test data sets.

Dropout is a well-known regularization technique in deep learning which helps prevent overfitting by randomly dropping nodes within the neural network at each training iteration. During each training iteration (i.e. each batch), certain neurons are randomly selected to be ignored, or 'dropped out', and play no role in the calculation of the error gradients, which leads to a better generalization error. This is because the neural network cannot become overly sensitive to the weights of specific neurons or let certain neurons dominate the calculation of the error gradients. Dropout has a hyperparameter 'p' that controls the dropout rate. In this particular assignment we have set it to 50% (0.5) for Models 2 and 3. This means that any particular node in a layer that dropout is applied to has a 50/50 chance of being included in any given training iteration. This technique forces the neural network to cope with missing/inactive nodes, and it helps prevent the model from incorporating unnecessary noise or memorizing the training data.

The dropout regularization process happens only during the training step. At each step of the training process, any neuron has a probability p (also called the dropout rate) of being entirely ignored, i.e. temporarily dropped out during this training step, but it may well be active during the next training step. After the training process has completed, neurons are not dropped anymore. So, from a broader perspective, a unique neural network is generated at every training step. The resulting final neural network behaves like an averaged ensemble of all these smaller neural networks, because we have a different neural network altogether at each training step. This is what makes the dropout mechanism an effective and popular regularization technique.

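To summarize: (a) on the training data, dropout is active, so a random fraction p of the layer's inputs is set to 0 at every training step, and Keras scales the remaining values up by 1/(1 - p) so that their expected sum is unchanged; (b) on the test data (`.evaluate()` / `.predict()`), dropout is switched off, so all neurons are used and no extra rescaling is needed.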




### 8) Explain why problems such as this homework assignment are better modeled with RNNs than CNNs. What type of problem will CNNs outperform RNNs on?

This homework assignment's dataset consists of sequential (time-series-like) data and requires sequence modeling.
The data is in the form of a log file. Recurrent neural networks are typically applied to this kind of problem, where a sequence of multiple steps taken as input is mapped to a class prediction (binary: 0 or 1). In other words, it is a many-to-one sequence prediction.

The main differentiating factor of RNNs is that the output of each recurrent layer becomes an input to the same layer at the next time step. For this particular reason, RNNs are usually preferred for modelling time series or other sequences in which the previous values or events matter a lot for making the prediction (a binary class prediction in this case). For this specific assignment, the order in which the sequence of events has taken place, and not just the metadata, helps to spot a security breach or hacking attempt. This is what makes an RNN well suited to the detection of security breaches.

Convolutional neural networks, like traditional feed-forward neural networks, do
not share information about the features that have been processed across the different positions of the sequence. CNN-like models assume that the inputs (and outputs) are independent of one another, which makes them less capable on sequence prediction tasks. In our assignment, since the previous elements of the input sequence are inherently important in predicting the output, an RNN is a much more suitable fit for sequence modeling, for many reasons. One of the main reasons is that RNNs have memory, in the form of feedback loops. This looping mechanism acts as a highway that allows information to flow from one time step to the next. This information is stored in the hidden state of the neurons within the network, which represents the previous inputs. Besides that, since RNNs are able to maintain state between sequence elements, they are typically used for problems of this sequence modeling nature, where an input sequence is mapped to a fixed-size output vector. RNNs can handle many-to-many, one-to-many and many-to-one types of sequence modeling, including spoken language represented as a time series or just raw sequences of text (NLP). While a CNN typically requires fixed-length input, RNNs can be fed data of varying lengths. Our assignment had sequence data with varying lengths.

Because of CNNs' feature extraction property, they can easily outperform RNNs on image processing problems.
CNNs develop an internal representation of an image. This is what makes them particularly suitable for data with spatial relationships, such as images in general. Once the CNN model learns to recognize a pattern in one location, it can recognize it in any other location. CNNs can learn scale-invariant structures in the input data, and their pattern recognition capability is location invariant.




### 9) Explain what RNN problem is solved using LSTM and briefly describe how.

RNNs require a lot of training, which can take a very long time because it takes hundreds of iterations for the model to learn long-term patterns. The problem of long-term dependencies is one of the inherent problems of recurrent neural networks (RNNs). The idea of connecting previous information to the present task, in other words using sequential information, was one of the main reasons for the creation of RNNs. In order to make a reasonable prediction for the present task, there are obviously situations where more context is needed.
The main reason for poor performance by an RNN is the gap between the relevant information that is needed to make accurate predictions on the sequence data and the present task at hand. The most common technique for working around this issue is to simply cap the input sequence. This can be done either by looking only at recent data (for time series modelling) or by looking at a fixed number of inputs. The problem with this approach is that a lot of information is lost in the process. To counter this problem, a way is needed to include both long-term information and the most recent information at each and every time step. The most popular way of solving this problem of long-term dependencies is to use Long Short-Term Memory networks (LSTMs), which are a special kind of RNN.

The short-term memory of an RNN is caused by the vanishing gradient problem, and this problem can be solved using an LSTM.
As an RNN goes through many time steps to process more information, it has trouble retaining information from previous steps. The nature of back-propagation, the algorithm that is used to train and optimize neural networks, is the main cause of this issue. The main concept behind back-propagation is that it uses the computed error gradients to update the weights of the nodes/neurons within the neural network. The main reason for the short-term memory of RNNs is that the gradient values shrink exponentially as they propagate back through each time step, resulting in only minor adjustments to the weights of the neural network, so the early/initial time steps contribute very little to the learning process. Since the early steps do not learn much, it is very easy for the neural network to forget what it has seen in longer sequences.


The "gate" mechanism that is used to regulate the flow of information is mainly responsible for making LSTMs capable of learning long-term dependencies. In order to learn what information to remove from or add to the hidden state of the neurons,
these gates use different tensor operations. Short-term memory is less of an issue for the LSTM because of this specific ability.


An LSTM cell carries two different memory states, one for the short term (the hidden state) and the other for the long term (the cell state). Using these LSTM cells, the neural network learns to recognize an important part of the input sequence and then store it in the long-term cell state. Over many time steps, it can continue to use that stored information until it is no longer useful, at which point it gets dropped from the memory. This is the main reason that makes the LSTM cell very effective for RNNs trained on long sequences of input data.


The core concepts behind the functioning of LSTM networks are the gates and the state of the LSTM cells. The transfer of relevant information down the sequence chain is handled by the cell state, which acts as a transport highway. Crucial information from earlier time steps can make its way to later time steps, to be retrieved and used there. This works because the cell state is very good at carrying the relevant information throughout the processing of the sequence, thereby reducing the effects of short-term memory. During this process, information gets removed from or added to the cell state via the gates. The gates are small neural networks in their own right, using the sigmoid activation function; during training they learn which information is really relevant for the model, i.e. which information to keep and which to forget, and they determine which information is allowed onto the cell state.

LSTM networks extend the default chain structure of RNNs: instead of a single neural network layer, each repeating module in the chain contains four interacting layers.
The ability to remove or add information with the help of these gates makes these modules complex compared to a simple RNN. Each gate consists of a sigmoid neural network layer and a pointwise multiplication operation. The sigmoid layer outputs a value between 0 and 1 (1 lets everything through; 0 lets nothing through).


