#  Part 1:  Recurrent Neural Network 

###  Importing packages

In [3]:
import re
import numpy as np
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM
from keras.datasets import imdb

from keras.utils.np_utils import to_categorical

import warnings
warnings.filterwarnings('ignore')
import ssl
ssl._create_default_https_context = ssl._create_unverified_context

### Preparing Dataset

In [4]:
# Settings shared by the IMDB experiments in this notebook.
max_features = 1000   # vocabulary size: only the most common words are kept
maxlen = 80           # cut texts after this number of words
batch_size = 32

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')

print('Pad sequences (samples x time)')
# Bring both splits to a common length of `maxlen` so each becomes a 2-D array.
x_train, x_test = (
    sequence.pad_sequences(split, maxlen=maxlen) for split in (x_train, x_test)
)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

Loading data...
Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz
(25000, 'train sequences')
(25000, 'test sequences')
Pad sequences (samples x time)
('x_train shape:', (25000, 80))
('x_test shape:', (25000, 80))


In [7]:
# Peek at two padded test reviews (rows 333 and 334) in their integer-encoded form.
print(x_test[333:335, :])

[[155  18   2  29  16  11   2   2   6 291 303   2  29  70  30   2   2  10
   10   2  13  81 386 149  14  20   8 316  11   4 130  75  69  87 253 149
   12   2  13   2  25 103  25 340 296   2  30   2  25  70  81  12  25  80
    2   4 651   7 116  11 101 348   2   2  10  10  42  43 140   2 106   2
   31  53  58 198  51 146   2  81]
 [  4   2  78   2 232   2   6 647 428   2   2   2   9   4 124  12  32 255
    2   5  17  35 445  31  32  26 897 156   5 144  28 573 128  74   8   2
   19  14   2   5   4   2 155   9  15  18 148  37  40  14 512   7 920 924
  969   2   4   2   2   2  46  19  49   7   4   2 748   8 569   4 268  11
    6 196  58   6 215 717   2   2]]


### Visualize the data

In [6]:
INDEX_FROM = 3   # word index offset

# Shift every word's index by INDEX_FROM, which leaves ids 0-2 free for the
# special tokens registered right after.
word_to_id = dict(
    (word, rank + INDEX_FROM) for word, rank in imdb.get_word_index().items()
)
word_to_id.update({"<PAD>": 0, "<START>": 1, "<UNK>": 2})

# Invert the mapping and render the first training review as text.
id_to_word = dict((token_id, word) for word, token_id in word_to_id.items())
print(' '.join(id_to_word[token_id] for token_id in x_train[0]))

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


### Building a Model

In [8]:
print('Build model...')
# Stack: 8-dim word embedding -> 16-unit LSTM (with input and recurrent
# dropout) -> single sigmoid output unit.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(16, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid'),
])
model.summary()

Build model...
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                1600      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 9,617
Trainable params: 9,617
Non-trainable params: 0
_________________________________________________________________


### Model Training

In [9]:
# try using different optimizers and different optimizer configs
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train for one epoch on the training split; the test split is passed as
# validation data so its loss/accuracy are reported alongside.
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

Train on 25000 samples, validate on 25000 samples
Epoch 1/1


<keras.callbacks.History at 0x7ff7081e2950>

### Testing

In [10]:
# evaluate() is unpacked into two values here: the loss ("score") followed by
# the accuracy metric the model was compiled with.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

('Test accuracy:', 0.75544)


### Prediction

In [12]:
# Run the model on one test review (row 333, kept 2-D by slicing) and show
# the prediction next to the stored label for the same row.
prediction = model.predict(x_test[333:334, :])
print('Prediction value:', prediction[0])
print('Test Label:', y_test[333:334])

('Prediction value:', array([0.38003665], dtype=float32))
('Test Label:', array([0]))


### Other RNN Layers

* keras.layers.RNN(cell, return_sequences=False)
* keras.layers.SimpleRNN(units, activation='tanh')
* keras.layers.GRU(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.ConvLSTM2D(filters, kernel_size, strides=(1, 1), padding='valid')
* keras.layers.SimpleRNNCell(units, activation='tanh')
* keras.layers.GRUCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.LSTMCell(units, activation='tanh', recurrent_activation='hard_sigmoid')
* keras.layers.CuDNNGRU(units, kernel_initializer='glorot_uniform')
* keras.layers.CuDNNLSTM(units, kernel_initializer='glorot_uniform')

# Part 2: Recurrent Neural Network with Custom Dataset

In [21]:
# Credits to Peter Nagy

### Load data

In [13]:
# Read the sentiment CSV, then keep only the necessary columns.
data = pd.read_csv('Senti.csv')
data = data.loc[:, ['text', 'sentiment']]

### Visualize data

In [17]:
# Show up to the first 15 rows of the loaded frame.
data.head(n=15)

Unnamed: 0,text,sentiment
0,I love this car,Positive
1,This view is amazing,Positive
2,I feel great this morning,Positive
3,I am so excited about the concert,Positive
4,He is my best friend,Positive
5,I do not like this car,Negative
6,This view is horrible,Negative
7,I feel tired this morning,Negative
8,I am not looking forward to the concert,Negative
9,He is my enemy,Negative


### Format data

In [18]:
# Drop neutral rows. Take an explicit copy so the column assignment below
# writes to an independent frame (no SettingWithCopyWarning, no ambiguity
# about whether the original frame is modified).
data = data[data.sentiment != "Neutral"].copy()

# Normalise the text: lower-case it, then remove every character that is not
# a letter, digit or whitespace. The class must be `A-Z`: the range `A-z`
# also spans `[`, `\`, `]`, `^`, `_` and the backtick, so those characters
# would survive the clean-up.
data['text'] = data['text'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower())
)

# A former `iterrows()` loop replacing the substring 'rt' was removed: rows
# yielded by iterrows() are copies, so it never changed `data`, and a working
# substring replace would have corrupted words such as "concert".

max_fatures = 2000   # vocabulary cap for the tokenizer (name kept: later cells use it)
# `num_words` is the Keras 2 name of this argument (`nb_words` is the legacy spelling).
tokenizer = Tokenizer(num_words=max_fatures, split=' ')
tokenizer.fit_on_texts(data['text'].values)
# Encode each sentence as a sequence of word ids, then pad them all to the
# length of the longest sentence so X is a rectangular 2-D array.
X = tokenizer.texts_to_sequences(data['text'].values)
X = pad_sequences(X)

### Training set

In [19]:
# One indicator column per sentiment label.
Y = pd.get_dummies(data.sentiment).values
# Hold out a third of the samples for testing; fixed seed for a repeatable split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.33, random_state=42
)
print('Shape of training samples:', X_train.shape, Y_train.shape)
print('Shape of testing samples:', X_test.shape, Y_test.shape)

('Shape of training samples:', (6, 8), (6, 2))
('Shape of testing samples:', (4, 8), (4, 2))


### Design a model

In [21]:
# Two-class sentiment model: embedding -> LSTM -> softmax over the two
# one-hot label columns.
model = Sequential()
# `Embedding` has no `dropout` argument in Keras 2 (standalone Keras silently
# discards it with a warning, tf.keras rejects it), so the 0.2 dropout is
# applied to the LSTM inputs instead, as the Part 1 model does.
model.add(Embedding(max_fatures, 128, input_length=X.shape[1]))
model.add(LSTM(128, dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 8, 128)            256000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 2)                 258       
Total params: 387,842
Trainable params: 387,842
Non-trainable params: 0
_________________________________________________________________
None


### Training 

In [22]:
batch_size = 32
# Five passes over the training split; verbose=2 logs one line per epoch.
model.fit(x=X_train, y=Y_train,
          epochs=5,
          batch_size=batch_size,
          verbose=2)

Epoch 1/5
0s - loss: 0.6895 - acc: 0.6667
Epoch 2/5
0s - loss: 0.6817 - acc: 0.6667
Epoch 3/5
0s - loss: 0.6734 - acc: 1.0000
Epoch 4/5
0s - loss: 0.6643 - acc: 1.0000
Epoch 5/5
0s - loss: 0.6539 - acc: 1.0000


<keras.callbacks.History at 0x7ff70fb48910>

### Validation

In [23]:
# Loss ("score") and accuracy on the held-out split.
score, acc = model.evaluate(x=X_test, y=Y_test,
                            batch_size=batch_size,
                            verbose=2)
print("Score: %.2f" % score)
print("Accuracy: %.2f" % acc)

Score: 0.75
Accuracy: 0.00


### Formatting Test Example

In [26]:
text = 'I do not like him'
tester = pd.DataFrame({'text': [text]})

# Clean the sentence as the training text is cleaned: lower-case, then drop
# everything except letters, digits and whitespace. The class is `A-Z`; the
# range `A-z` would also let `[ \ ] ^ _` and the backtick through.
tester['text'] = tester['text'].apply(
    lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x.lower())
)

test = tokenizer.texts_to_sequences(tester['text'].values)
# Let pad_sequences bring the input to the width the model was built with
# (X.shape[1]): shorter sentences are left-padded with zeros and longer ones
# are truncated, and the result is always a 2-D (1, timesteps) array. The
# previous manual np.pad branch only handled the "shorter" case; for any
# other length it wrapped an already 2-D array into a 3-D one.
test = pad_sequences(test, maxlen=X.shape[1])

prediction = model.predict(test)
print('Prediction value:', prediction[0])

('Prediction value:', array([0.49856636, 0.5014336 ], dtype=float32))


# Part 3: RNN Design Choices

## Influence of number of nodes

### LSTM with 8 nodes

In [27]:
# Width experiment, small variant: 8-dim embedding -> 8-unit LSTM, no dropout.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_4 (LSTM)                (None, 8)                 544       
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 9         
Total params: 8,553
Trainable params: 8,553
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
('Test accuracy:', 0.80448)


### LSTM with 64 nodes

In [28]:
# Width experiment, large variant: same layer design as the 8-node model,
# but with a 64-unit LSTM.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(64, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, None, 8)           8000      
_________________________________________________________________
lstm_5 (LSTM)                (None, 64)                18688     
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 65        
Total params: 26,753
Trainable params: 26,753
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
('Test score:', 0.40666526790618895)
('Test accuracy:', 0.8162)


## Influence of Embedding

In [29]:
# Embedding experiment, small variant: 4-dim embedding -> 16-unit LSTM.
model = Sequential([
    Embedding(max_features, 4),
    LSTM(16, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, None, 4)           4000      
_________________________________________________________________
lstm_6 (LSTM)                (None, 16)                1344      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total params: 5,361
Trainable params: 5,361
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
('Test accuracy:', 0.81088)


In [30]:
# Embedding experiment, large variant: 16-dim embedding -> 16-unit LSTM.
model = Sequential([
    Embedding(max_features, 16),
    LSTM(16, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_7 (Embedding)      (None, None, 16)          16000     
_________________________________________________________________
lstm_7 (LSTM)                (None, 16)                2112      
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 17        
Total params: 18,129
Trainable params: 18,129
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
('Test accuracy:', 0.81556)


## Influence of Dropout

### Dropout with probability 0.5

In [31]:
# Dropout experiment: 32-dim embedding -> 8-unit LSTM with input and
# recurrent dropout both set to 0.5.
model = Sequential([
    Embedding(max_features, 32),
    LSTM(8, dropout=0.5, recurrent_dropout=0.5),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_8 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
lstm_8 (LSTM)                (None, 8)                 1312      
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 9         
Total params: 33,321
Trainable params: 33,321
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
('Test accuracy:', 0.7336)


### Dropout with probability 0.9

In [9]:
# Dropout experiment, aggressive variant: same model design as the 0.5 cell
# (32-dim embedding -> 8-unit LSTM), with input and recurrent dropout at 0.9
# to match this section's heading (the cell previously used 0.95).
model = Sequential()
model.add(Embedding(max_features, 32))
model.add(LSTM(8, dropout=0.9, recurrent_dropout=0.9))
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(x_train, y_train, batch_size=batch_size, epochs=1, validation_data=(x_test, y_test))

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x_test, y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

## Multilayered RNNs

### RNN with 2 layer LSTM

In [19]:
# Two stacked 8-unit LSTMs. The first one returns its full output sequence
# (return_sequences=True) so the second LSTM receives one vector per timestep.
# NOTE(review): the summary recorded under this cell lists a 32-dim embedding,
# while this code uses 8 - the stored output looks stale; re-run to refresh it.
model = Sequential([
    Embedding(max_features, 8),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0, return_sequences=True),
    LSTM(8, dropout=0.0, recurrent_dropout=0.0),
    Dense(1, activation='sigmoid'),
])
model.summary()

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.fit(x=x_train, y=y_train,
          validation_data=(x_test, y_test),
          epochs=1,
          batch_size=batch_size)

# Loss and accuracy on the IMDB test split.
score, acc = model.evaluate(x=x_test, y=y_test, batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, None, 32)          32000     
_________________________________________________________________
lstm_9 (LSTM)                (None, None, 8)           1312      
_________________________________________________________________
lstm_10 (LSTM)               (None, 8)                 544       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 9         
Total params: 33,865
Trainable params: 33,865
Non-trainable params: 0
_________________________________________________________________
Train on 25000 samples, validate on 25000 samples
Epoch 1/1
Test accuracy: 0.81376


### RNN with 3 layer LSTM

In [10]:
# Write your code here 

# Use the same node design from the above cell 

### What are your findings?