In [103]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

11 Recurrent Neural Networks
Recurrent Neural Networks (RNNs) are a class of artificial neural networks designed to process sequential data, where the order of data points matters. Unlike feedforward neural networks, RNNs have connections that form loops, allowing information to persist and be passed from one step to the next. This capability makes RNNs well-suited for tasks involving time series, natural language processing, speech recognition, video analysis, and more.

By leveraging their recurrent connections and hidden state, RNNs excel at capturing temporal dependencies in sequential data. However, training RNNs effectively remains a challenge, particularly for long sequences. Various advanced RNN variants, such as Long Short-Term Memory (LSTM) and Gated Recurrent Unit (GRU), have been introduced to address the vanishing and exploding gradient problem and improve the modeling capabilities of RNNs for a wider range of applications.

Key Concepts of RNNs:
Recurrent Connections:
The defining feature of RNNs is the presence of recurrent connections, which allow information to be retained and propagated through time. In each step of the sequence, an RNN processes the current input along with the hidden state from the previous step, updating the hidden state accordingly. This feedback mechanism enables RNNs to learn dependencies and patterns across different time steps. 2. Hidden State: The hidden state of an RNN acts as its memory and captures relevant information from the past. It is continuously updated at each time step and serves as an internal representation of the input sequence. 3. Training Challenges: Training RNNs can be challenging due to the vanishing and exploding gradient problem. During backpropagation through time (BPTT), gradients can either become too small (vanish) or too large (explode), leading to poor convergence or training instability. This phenomenon occurs when the network has to propagate information over long sequences, and it becomes difficult for the gradients to accurately propagate back to the initial time steps.

RNN Architectures:
There are different variations of RNN architectures, including Elman RNN, Jordan RNN, and bidirectional RNNs.

Elman RNN: In an Elman RNN, the hidden state is fed back to the network's input at the next time step, creating a simple feedback loop. This architecture is suitable for many sequential tasks but can suffer from vanishing gradients for long sequences.
Jordan RNN: In a Jordan RNN, the hidden state is fed back to the network's output at the current time step. This type of architecture can be useful for specific problems but is less commonly used compared to Elman RNNs and other more advanced RNN variants.
Bidirectional RNNs: Bidirectional RNNs process the input sequence in both forward and backward directions, allowing the model to consider future information as well. This is particularly useful for tasks where context from both past and future elements is essential, such as speech recognition and machine translation.

Exercise
Use the IMDB movie reviews dataset to perform sentiment analysis with a simple RNN.

In [104]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense, Bidirectional
import tensorflow as tf

In [105]:
max_features = 5000  # Number of words to consider as features
max_len_short = 100  # Maximum sequence length for short sequences
max_len_long = 500   # Maximum sequence length for long sequences

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)

Pad sequences to a fixed length for RNN input

In [106]:
x_train_short = tf.keras.utils.pad_sequences(x_train, maxlen=max_len_short)
x_test_short = tf.keras.utils.pad_sequences(x_test, maxlen=max_len_short)

x_train_long = tf.keras.utils.pad_sequences(x_train, maxlen=max_len_long)
x_test_long =  tf.keras.utils.pad_sequences(x_test, maxlen=max_len_long)

Build the RNN model

In [107]:
def build_rnn_model():
    model = Sequential()
    model.add(Embedding(max_features, 32))
    model.add(SimpleRNN(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [108]:
from tensorflow.keras.layers import LSTM
def build_rnn_model_lstm():
    model = Sequential()
    model.add(Embedding(max_features, 16))
    model.add(LSTM(16, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [109]:
def build_rnn_model_bi():
    model = Sequential()
    model.add(Embedding(max_features, 32))
    model.add(Bidirectional(SimpleRNN(32, activation='relu')))
    model.add(Dense(1, activation='sigmoid'))
    return model

Train and evaluate the RNN model

In [110]:
def train_and_evaluate_model(model, x_train, y_train, x_test, y_test):
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    #model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
    #history = model.fit(x_train, y_train, epochs=5, batch_size=128, validation_split=0.2)
    history = model.fit(x_train, y_train, epochs=4, batch_size=512)
    loss, accuracy = model.evaluate(x_test, y_test)
    return loss, accuracy, history

Train and evaluate RNN on short and long sequences

In [111]:
print("\nTraining SimpleRNN model on short sequences:")
rnn_model_short = build_rnn_model_bi()
loss_short, accuracy_short, history_short = train_and_evaluate_model(
    rnn_model_short, x_train_short, y_train, x_test_short, y_test
)

print("\nTraining SimpleRNN model on long sequences:")
rnn_model_long = build_rnn_model_bi()
loss_long, accuracy_long, history_long = train_and_evaluate_model(
    rnn_model_long, x_train_long, y_train, x_test_long, y_test
)


Training SimpleRNN model on short sequences:
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4

Training SimpleRNN model on long sequences:
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


Compare the results

In [112]:
print("\nResults on Short Sequences:")
print(f"Loss: {loss_short:.4f}, Accuracy: {accuracy_short:.4f}")

print("\nResults on Long Sequences:")
print(f"Loss: {loss_long:.4f}, Accuracy: {accuracy_long:.4f}")




Results on Short Sequences:
Loss: 0.3761, Accuracy: 0.8359

Results on Long Sequences:
Loss: 0.3351, Accuracy: 0.8620


## with build_rnn_model_lstm
Results on Short Sequences:
Loss: 0.5788, Accuracy: 0.7310

Results on Long Sequences:
Loss: nan, Accuracy: 0.5000

## with build_rnn_model_lstm and model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])

Results on Short Sequences:
Loss: 0.5542, Accuracy: 0.7568

Results on Long Sequences:
Loss: nan, Accuracy: 0.5000

## with build_rnn_model_bi and model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
Results on Short Sequences:
Loss: 0.8635, Accuracy: 0.6448

Results on Long Sequences:
Loss: 0.4200, Accuracy: 0.8236
## with build_rnn_model_bi and model.compile(optimizer='rmsprop',loss='binary_crossentropy',metrics=['acc'])
Results on Short Sequences:
Loss: 0.3761, Accuracy: 0.8359

Results on Long Sequences:
Loss: 0.3351, Accuracy: 0.8620