<a href="https://colab.research.google.com/github/Kabzel55/RNN-text_classifier/blob/master/RNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%bash
rm -rf /content/*

In [27]:
import os
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.layers import Dense, Embedding, Flatten

sns.set()

import warnings
warnings.filterwarnings('ignore')

In [None]:
%%bash
pwd
wget -- output-document=aclImdb_v1.tar.gz --quiet https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
ls

In [4]:
import tarfile
with tarfile.open('aclImdb_v1.tar.gz', 'r:gz') as tar:
    tar.extractall()

In [5]:
base_path = '/content/aclImdb'

labels = {'pos': 1, 'neg':0}
df = pd.DataFrame()
for s in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(base_path, s, l)
        for file in os.listdir(path):
            with open(os.path.join(path, file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
                df = df.append([[txt, labels[l]]], ignore_index=True)
df.columns = ['review', 'sentiment']

In [6]:
np.random.seed(0)
df = df.reindex(np.random.permutation(df.index))
df.to_csv('films_reviews.csv', index=False, encoding='utf-8')

In [7]:
df = pd.read_csv('/content/films_reviews.csv', encoding='utf-8')
df.head(10)

Unnamed: 0,review,sentiment
0,I really liked this movie.<br /><br />Everyone...,1
1,This cheap and rubbish film is about a NASA te...,0
2,The plot is rocky. The acting is somewhere sou...,0
3,Stargate is the best show ever. All the actors...,1
4,This movie starred a totally forgotten star fr...,0
5,I have not seen this movie! At least not in it...,1
6,I caught the last half of this movie on cable ...,1
7,"After The Funeral was absolutely superb, and b...",1
8,A beautiful postcard of New York. The thing I ...,1
9,I've watched the first 17 episodes and this se...,1


In [8]:
max_len = 100
num_words = 10000
embedding_dim = 100

tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df['review'])

In [9]:
list(tokenizer.index_word.items())[:20]

[(1, 'the'),
 (2, 'and'),
 (3, 'a'),
 (4, 'of'),
 (5, 'to'),
 (6, 'is'),
 (7, 'br'),
 (8, 'in'),
 (9, 'it'),
 (10, 'i'),
 (11, 'this'),
 (12, 'that'),
 (13, 'was'),
 (14, 'as'),
 (15, 'for'),
 (16, 'with'),
 (17, 'movie'),
 (18, 'but'),
 (19, 'film'),
 (20, 'on')]

In [10]:
sequence = tokenizer.texts_to_sequences(df['review'])
print(sequence[:3])

[[10, 62, 425, 11, 17, 7, 7, 304, 45, 137, 5, 26, 7017, 15, 304, 6, 144, 54, 1740, 38, 1072, 118, 29, 102, 7, 7, 10, 339, 106, 400, 818, 133, 4863, 42, 1767, 3873, 2, 42, 609, 80, 5, 4233, 24, 112, 15, 40, 467, 40, 4212, 20, 87, 18, 10, 89, 1020, 7, 7, 10, 255, 1767, 3336, 4636, 419, 393, 2, 10, 436, 112, 15, 40, 7, 7, 59, 13, 3, 2681, 6730, 257, 40, 1640, 70, 21, 3922, 281, 59, 654, 5, 1008, 878, 59, 13, 8, 2951, 2, 115, 106, 3923, 18, 34, 44, 8, 144, 114, 50, 22, 3478, 2136, 22, 23, 1279, 5, 77, 3923, 30, 1, 375, 220, 7, 7, 13, 59, 3, 120, 96, 273, 18, 73, 206, 382, 12, 59, 6, 636, 16, 40, 1998, 15, 1, 86, 55, 59, 677, 12, 609, 100, 29, 6, 3, 49, 128, 273, 3, 120, 96, 3589, 599, 2, 158, 4133, 18, 59, 81, 802, 1, 4317, 5, 420, 1, 114, 14, 813, 7, 7, 6, 59, 5, 1755, 51, 71, 87, 10, 101, 54, 7, 7, 18, 1, 476, 1462, 82, 65, 468, 47, 1328, 7, 7, 609, 298, 4, 3006, 24, 325, 139, 41, 4, 24, 114, 14, 79, 818, 4446, 14, 1, 203, 1111, 8697, 1, 3971, 2, 9408, 1400, 15, 1, 126, 28, 537, 5, 385, 

In [11]:
word_index = tokenizer.word_index
print(f'{len(word_index)} unique words.')

124252 unique words.


In [12]:
data = pad_sequences(sequence, maxlen=max_len)
data.shape

(50000, 100)

In [13]:
labels = np.asarray(df['sentiment'])
labels

array([1, 0, 0, ..., 0, 0, 1])

In [14]:
X_train = data[0:30000]
y_train = labels[0:30000]

X_valid = data[30000:40000]
y_valid = labels[30000:40000]

X_test = data[40000:]
y_test = labels[40000:]

In [15]:
filepath ='best_basic_model.hdf5'

my_calbacks = [
ModelCheckpoint(filepath=filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'),
EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2) 
]

basic = Sequential()
basic.add(Embedding(num_words, embedding_dim, input_length=max_len))
basic.add(Flatten())
basic.add(Dense(16, activation='relu'))
basic.add(Dense(1, activation='sigmoid'))
basic.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 100)          1000000   
_________________________________________________________________
flatten (Flatten)            (None, 10000)             0         
_________________________________________________________________
dense (Dense)                (None, 16)                160016    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 1,160,033
Trainable params: 1,160,033
Non-trainable params: 0
_________________________________________________________________


In [16]:
basic.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [17]:
history = basic.fit(X_train, y_train, batch_size=32, epochs=5, validation_data=(X_valid, y_valid), callbacks=my_calbacks)

Epoch 1/5
Epoch 00001: val_accuracy improved from -inf to 0.85450, saving model to best_basic_model.hdf5
Epoch 2/5
Epoch 00002: val_accuracy did not improve from 0.85450
Epoch 3/5
Epoch 00003: val_accuracy did not improve from 0.85450
Epoch 00003: early stopping


In [18]:
def plot_hist(history):
    import pandas as pd
    import plotly.graph_objects as go
    hist = pd.DataFrame(history.history)
    hist['epoch'] = history.epoch

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['accuracy'], name='accuracy', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_accuracy'], name='val_accuracy', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='accuracy vs. val accuracy', xaxis_title='Epoch', yaxis_title='accuracy', yaxis_type='log')
    fig.show()

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['loss'], name='loss', mode='markers+lines'))
    fig.add_trace(go.Scatter(x=hist['epoch'], y=hist['val_loss'], name='val_loss', mode='markers+lines'))
    fig.update_layout(width=1000, height=500, title='loss vs. val loss', xaxis_title='Epoch', yaxis_title='loss', yaxis_type='log')
    fig.show()

plot_hist(history)

In [19]:
from tensorflow.keras.layers import SimpleRNN, LSTM
filepath ='best_SIMPLERNN_model.hdf5'
my_calbacks = [
ModelCheckpoint(filepath=filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'),
EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2) 
]
model_SimpleRNN = Sequential()
model_SimpleRNN.add(Embedding(10000,32))
model_SimpleRNN.add(SimpleRNN(16)) 
model_SimpleRNN.add(Dense(16, activation='relu'))
model_SimpleRNN.add(Dense(1, activation='sigmoid'))
model_SimpleRNN.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 16)                784       
_________________________________________________________________
dense_2 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 17        
Total params: 321,073
Trainable params: 321,073
Non-trainable params: 0
_________________________________________________________________


In [20]:
model_SimpleRNN.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [21]:
history_SimpleRNN = model_SimpleRNN.fit(X_train, y_train, batch_size=32, epochs=10,
                                        validation_data=(X_valid,y_valid), callbacks=my_calbacks)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.84890, saving model to best_SIMPLERNN_model.hdf5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.84890 to 0.85800, saving model to best_SIMPLERNN_model.hdf5
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.85800
Epoch 4/10
Epoch 00004: val_accuracy did not improve from 0.85800
Epoch 5/10
Epoch 00005: val_accuracy did not improve from 0.85800
Epoch 00005: early stopping


In [22]:
plot_hist(history_SimpleRNN)

In [23]:
filepath ='best_LSTM_model.hdf5'
my_calbacks = [
ModelCheckpoint(filepath=filepath, monitor='val_accuracy', verbose=1, save_best_only=True, mode='max'),
EarlyStopping(monitor='val_loss', mode='min', verbose=1, patience=2) 
]
model_LSTM = Sequential()
model_LSTM.add(Embedding(10000,32))
model_LSTM.add(LSTM(16)) 
model_LSTM.add(Dense(16, activation='relu'))
model_LSTM.add(Dense(1, activation='sigmoid'))
model_LSTM.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm (LSTM)                  (None, 16)                3136      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 323,425
Trainable params: 323,425
Non-trainable params: 0
_________________________________________________________________


In [24]:
model_LSTM.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['accuracy'])

In [25]:
history_LSTM = model_LSTM.fit(X_train, y_train, batch_size=32, epochs=10,
                              validation_data=(X_valid,y_valid),callbacks=my_calbacks)

Epoch 1/10
Epoch 00001: val_accuracy improved from -inf to 0.85060, saving model to best_LSTM_model.hdf5
Epoch 2/10
Epoch 00002: val_accuracy improved from 0.85060 to 0.86070, saving model to best_LSTM_model.hdf5
Epoch 3/10
Epoch 00003: val_accuracy did not improve from 0.86070
Epoch 4/10
Epoch 00004: val_accuracy improved from 0.86070 to 0.87130, saving model to best_LSTM_model.hdf5
Epoch 5/10
Epoch 00005: val_accuracy improved from 0.87130 to 0.87280, saving model to best_LSTM_model.hdf5
Epoch 6/10
Epoch 00006: val_accuracy did not improve from 0.87280
Epoch 00006: early stopping


In [26]:
plot_hist(history_LSTM)

In [28]:
new_model = load_model('/content/best_LSTM_model.hdf5')
new_model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 32)          320000    
_________________________________________________________________
lstm (LSTM)                  (None, 16)                3136      
_________________________________________________________________
dense_4 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 17        
Total params: 323,425
Trainable params: 323,425
Non-trainable params: 0
_________________________________________________________________


In [29]:
loss, acc = new_model.evaluate(X_test, y_test, verbose=2)
print(acc)

313/313 - 1s - loss: 0.3180 - accuracy: 0.8667
0.8666999936103821
