In [None]:
# Text Classification / tc-nltk-lstm-rnn.ipynb
# Gourav Siddhad
# 16-Mar-2019

In [None]:
print('Importing Libraries', end='')

import pandas as pd
import numpy as np
from numpy.random import seed
import re
import os
import pandas as pd
import time

import matplotlib.pyplot as plt

import nltk
from nltk import word_tokenize
from nltk.corpus import reuters, stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale, MinMaxScaler
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

import keras
from keras.models import Model, Sequential, load_model, model_from_json
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding, SpatialDropout1D, TimeDistributed, Flatten
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical, plot_model
from keras.callbacks import EarlyStopping
from keras.utils.np_utils import to_categorical


%matplotlib inline

from scipy import interp
from itertools import cycle

print(' - Done')

# Silence every warning category from this point on. The filter is installed
# after the imports above have already run, so warnings emitted while those
# modules were loading are not suppressed by it.
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Load the NLTK Reuters corpus and pool its train/test files into one list of
# raw texts (`all_docs`) plus a parallel list of category lists (`all_labels`).
documents = reuters.fileids()
print('Total Documents -', len(documents))

print('Extracting (Id, Docs and Labels)', end='')
# A file id's prefix tells which split the document belongs to.
train_docs_id = [file_id for file_id in documents if file_id.startswith("train")]
test_docs_id = [file_id for file_id in documents if file_id.startswith("test")]

# Training ids first, then test ids; texts and labels share this ordering.
pooled_ids = train_docs_id + test_docs_id
all_docs = [reuters.raw(file_id) for file_id in pooled_ids]
all_labels = [reuters.categories(file_id) for file_id in pooled_ids]
print(' - Done')

# Only the pooled lists are kept; drop the per-split helpers.
del train_docs_id, test_docs_id, pooled_ids

print('Documents - ', len(all_docs))
print('Labels  - ', len(all_labels))

# List of categories
categories = reuters.categories()
print('Categories - ', len(categories))

In [None]:
# Step 1: map every document to a list of integer word ids.
print('Tokenizing', end='')
tk = Tokenizer(
    num_words=None,
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
    lower=True,
    split=' ',
    char_level=False,
    oov_token=None,
)
tk.fit_on_texts(all_docs)
index_docs = tk.texts_to_sequences(all_docs)
print(' - Done')

# Step 2: encode each document's category list as one binary indicator row.
print('Binarizing MultiLabels', end='')
lb = MultiLabelBinarizer()
index_labels = lb.fit_transform(all_labels)
print(' - Done')

# Step 3: hold out 20% of the pooled documents; the fixed random_state keeps
# the split repeatable between runs.
print('Sorting Train:Test Docs', end='')
X_train, X_test, y_train, y_test = train_test_split(
    index_docs,
    index_labels,
    test_size=0.2,
    random_state=42,
)
print(' - Done')

In [None]:
print('Calculating Vocabulary', end='')
# Gather every distinct word id that occurs in either split.
vocab = {
    word_id
    for split in (X_train, X_test)
    for id_sequence in split
    for word_id in id_sequence
}
print(' - Done')
# `vocabulary` is the number of distinct word ids seen across both splits.
vocabulary = len(vocab)

# Release the intermediates: the id set and the raw texts/labels.
del vocab, all_docs, all_labels

In [None]:
print('Padding Sequences', end='')
X_train = sequence.pad_sequences(X_train, maxlen=vocabulary)
X_test = sequence.pad_sequences(X_test, maxlen=vocabulary)
print(' - Done')

print('Scaling Data to Range', end='')
mm_scaler = MinMaxScaler()
X_train = mm_scaler.fit_transform(X_train)
X_test = mm_scaler.transform(X_test)
print(' - Done')

In [None]:
hidden_size = 512
num_steps = 32

model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=vocabulary))
model.add(LSTM(512, return_sequences=True))
model.add(LSTM(512, return_sequences=True))
model.add(Dropout(0.5))
model.add(Flatten())
model.add(Dense(len(categories)))
model.add(Activation('softmax'))
model.summary()
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['categorical_accuracy'])
# model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])

In [None]:
batch_size = 256
epochs = 50

# Train on the padded training matrix. The split cell names it `X_train`
# (capital X); the lowercase `x_train` used previously is never defined.
# The last 20% of the training rows are held back as a validation set.
history = model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.20)
# Optional early stopping - add this argument to the fit() call above:
#   callbacks=[EarlyStopping(monitor='val_loss', min_delta=0.0001)]

In [None]:
# Persist the trained network in two forms:
#   1. one HDF5 file holding the complete model;
#   2. the layer configuration as JSON plus a separate weights file.

# 1. Complete model
model.save('tc-nltk-lstm-rnn.h5')
#    reload with:  model = load_model('tc-nltk-lstm-rnn.h5')
#    discard with: del model

# 2. Configuration (JSON) and weights
model_json = model.to_json()
with open('tc-nltk-lstm-rnn.json', 'w') as config_file:
    config_file.write(model_json)
model.save_weights('tc-nltk-lstm-rnn-weights.h5')

# To rebuild the model from the JSON configuration and the weights file:
#   with open('tc-nltk-lstm-rnn.json', 'r') as config_file:
#       loaded_model = model_from_json(config_file.read())
#   loaded_model.load_weights('tc-nltk-lstm-rnn-weights.h5')
# The rebuilt model can then be written out and read back as a single file:
#   loaded_model.save('tc-nltk-lstm-rnn-weights.hdf5')
#   loaded_model = load_model('tc-nltk-lstm-rnn-weights.hdf5')

In [None]:
# Score the model on the held-out split. The split cell names it `X_test`
# (capital X); the lowercase `x_test` used previously is never defined.
# The result is indexed as [loss, metric] below.
accr = model.evaluate(X_test, y_test)
print()
print('Loss: {:0.3f}\tAccuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# plot_model(model, to_file='tc-nltk-lstm-2-model.png')

In [None]:
# Plot the training curves recorded by model.fit().
#
# Keras names the history entries after the compiled metric, so the accuracy
# curve is not necessarily stored under 'acc'. Look the key up instead of
# hard-coding it: it is the first entry that is neither the loss nor a
# validation ('val_'-prefixed) series.
metric_keys = [
    key for key in history.history
    if key != 'loss' and not key.startswith('val_')
]
if not metric_keys:
    raise KeyError('history contains no metric besides the loss; nothing to plot as accuracy')
accuracy_key = metric_keys[0]

# (history key, axis/title label, output file) for each figure.
curves = [
    (accuracy_key, 'Accuracy', 'tc-nltk-lstm-rnn-acc.png'),
    ('loss', 'Loss', 'tc-nltk-lstm-rnn-loss.png'),
]

for key, label, filename in curves:
    fig, ax = plt.subplots()
    ax.plot(history.history[key])
    ax.plot(history.history['val_' + key])
    ax.set_title('Training and Validation {} - LSTM RNN'.format(label))
    ax.set_ylabel(label)
    ax.set_xlabel('Epoch')
    # The second curve comes from the validation split of fit(), not from the
    # held-out test set, so it is labelled accordingly.
    ax.legend(['Train', 'Validation'], loc='upper left')
    # Save before show(): the figure may be released once it has been shown.
    fig.savefig(filename, dpi=300, pad_inches=0.1)
    plt.show()