In [1]:
# Text Classification / tc-nltk-lstm-rnn-10-mix.ipynb
# Gourav Siddhad
# 27-Mar-2019

In [21]:
print('Importing Libraries', end='')

import pandas as pd
import numpy as np
from numpy.random import seed
import re
import os
import pandas as pd
import time

import matplotlib.pyplot as plt

import seaborn as sns

import nltk
from nltk import word_tokenize
from nltk.corpus import reuters, stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split

import tensorflow as tf
import keras
from keras.models import Model, load_model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical, plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

%matplotlib inline

from scipy import interp
from itertools import cycle

print(' - Done')

Importing Libraries - Done


In [3]:
documents = reuters.fileids()
print('Total Documents -', len(documents))

print('Extracting (Id, Docs and Labels)', end='')
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
all_docs = train_docs
all_docs += test_docs

train_labels = [reuters.categories(doc_id) for doc_id in train_docs_id]
test_labels  = [reuters.categories(doc_id) for doc_id in test_docs_id]
all_labels = train_labels
all_labels += test_labels
print(' - Done')

del train_docs
del test_docs
del train_labels
del test_labels

print('Documents - ', len(all_docs))
print('Labels  - ', len(all_labels))

# List of categories
categories = reuters.categories()
print('Categories - ', len(categories))

Total Documents - 10788
Extracting (Id, Docs and Labels) - Done
Documents -  10788
Labels  -  10788
Categories -  90


In [4]:
print('Caching Stop Words', end='')
cachedStopWords = stopwords.words("english")
print(' - Done')

def tokenize(text):
    min_length = 3
    words = map(lambda word: word.lower(), word_tokenize(text))
    words = [word for word in words if word not in cachedStopWords]
    tokens =(list(map(lambda token: PorterStemmer().stem(token), words)))
    p = re.compile('[a-zA-Z]+')
    filtered_tokens = list(filter(lambda token: p.match(token) and len(token)>=min_length, tokens))
    return filtered_tokens

Caching Stop Words - Done


In [5]:
print('Total Articles - ', len(all_docs))
allwords = set()
i = 0
for doc in all_docs:
    if i % 100 is 0:
        print(i, end=' ')
    doc = tokenize(doc)
    for word in doc:
        allwords.add(word)
    i += 1

Total Articles -  10788
0 100 200 300 400 500 600 700 800 900 1000 1100 1200 1300 1400 1500 1600 1700 1800 1900 2000 2100 2200 2300 2400 2500 2600 2700 2800 2900 3000 3100 3200 3300 3400 3500 3600 3700 3800 3900 4000 4100 4200 4300 4400 4500 4600 4700 4800 4900 5000 5100 5200 5300 5400 5500 5600 5700 5800 5900 6000 6100 6200 6300 6400 6500 6600 6700 6800 6900 7000 7100 7200 7300 7400 7500 7600 7700 7800 7900 8000 8100 8200 8300 8400 8500 8600 8700 8800 8900 9000 9100 9200 9300 9400 9500 9600 9700 9800 9900 10000 10100 10200 10300 10400 10500 10600 10700 

In [6]:
print('All Words - ', len(allwords))
# print(allwords)

All Words -  25170


In [7]:
print('Sorting Train:Test Docs', end='')
X_train, X_test, y_train, y_test = train_test_split(all_docs, all_labels, test_size=0.2, random_state=42)
print(' - Done')

maxwords = len(allwords)

print('Tokenizing', end='')
tk = Tokenizer(num_words=maxwords)
tk.fit_on_texts(X_train)
tk.fit_on_texts(X_test)
index_list_train = tk.texts_to_sequences(X_train)
index_list_test = tk.texts_to_sequences(X_test)
print(' - Done')

Sorting Train:Test Docs - Done
Tokenizing - Done


In [8]:
# max of index_list_train
# max of index_list_test

maxlen = 200

for i in index_list_train:
    if len(i)>maxlen:
        maxlen = len(i)
print(maxlen)

for i in index_list_test:
    if len(i)>maxlen:
        maxlen = len(i)
print(maxlen)

2379
2379


In [9]:
# maxlen = 1600
print('MaxLen - ', maxlen)
print('Padding Sequences', end='')
x_train = sequence.pad_sequences(index_list_train, maxlen=maxlen)
x_test = sequence.pad_sequences(index_list_test, maxlen=maxlen)
print(' - Done')

print('Binarizing MultiLabels', end='')
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
print(' - Done')

MaxLen -  2379
Padding Sequences - Done
Binarizing MultiLabels - Done


In [10]:
del all_docs
del all_labels

In [13]:
config = tf.ConfigProto(device_count={"CPU":8})
keras.backend.tensorflow_backend.set_session(tf.Session(config=config))

In [14]:
def RNN():
    with tf.device('/gpu:0'):
        inputs = Input(name='inputs', shape=[maxlen])
        layer1 = Embedding(maxwords, 1024)(inputs)
    with tf.device('/cpu:0'):
        layer2 = LSTM(256)(layer1)
        layer3 = Dense(128)(layer2)
    with tf.device('/gpu:0'):
        layer4 = Activation('relu')(layer3)
        layer5 = Dropout(rate = 0.1)(layer4) # rate = 1-keep_prob, keep_prob=0.5
        layer6 = Dense(len(categories))(layer5)
        layer7 = Activation('softmax')(layer6)
    model = Model(inputs=inputs, outputs=layer7)
    return model

In [15]:
model = RNN()
model.summary()
# model.compile(loss='categorical_crossentropy', optimizer=RMSprop(lr=0.001), metrics=['accuracy'])
model.compile(loss='mse', optimizer='adam', metrics=['accuracy'])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 2379)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 2379, 1024)        25774080  
_________________________________________________________________
lstm_1 (LSTM)                (None, 256)               1311744   
_________________________________________________________________
dense_1 (Dense)              (None, 128)               32896     
_________________________________________________________________
activation_1 (Activation)    (None, 128)               0         
_________________________________________________________________
dropout_1 (Dropout)  

In [16]:
checkpoint = ModelCheckpoint('model-topcat-10-mix-{epoch:03d}-{acc:03f}-{val_acc:03f}.h5', 
                             verbose=1, monitor='val_loss', save_best_only=True, mode='auto')
history = model.fit(x_train, y_train, batch_size=128, epochs=50, validation_split=0.3, shuffle=True, callbacks=[checkpoint])

Instructions for updating:
Use tf.cast instead.
Train on 6041 samples, validate on 2589 samples
Epoch 1/50

Epoch 00001: val_loss improved from inf to 0.01142, saving model to model-topcat-10-mix-001-0.313359-0.376593.h5
Epoch 2/50

Epoch 00002: val_loss improved from 0.01142 to 0.00934, saving model to model-topcat-10-mix-002-0.435358-0.520278.h5
Epoch 3/50

Epoch 00003: val_loss improved from 0.00934 to 0.00870, saving model to model-topcat-10-mix-003-0.504056-0.548474.h5
Epoch 4/50

Epoch 00004: val_loss improved from 0.00870 to 0.00819, saving model to model-topcat-10-mix-004-0.534680-0.558517.h5
Epoch 5/50

Epoch 00005: val_loss did not improve from 0.00819
Epoch 6/50

Epoch 00006: val_loss improved from 0.00819 to 0.00798, saving model to model-topcat-10-mix-006-0.577884-0.565083.h5
Epoch 7/50

Epoch 00007: val_loss improved from 0.00798 to 0.00723, saving model to model-topcat-10-mix-007-0.603542-0.621862.h5
Epoch 8/50

Epoch 00008: val_loss improved from 0.00723 to 0.00709, sav

KeyboardInterrupt: 

In [17]:
# Save Complete Model
model.save('tc-nltk-lstm-rnn-10-mix.h5')

# # Save Model Configuration to JSON
# model_json = model.to_json()
# with open('tc-nltk-lstm-rnn-10-mix.json', 'w') as json_file:
#     json_file.write(model_json)

# # Load a Saved Model
# # model = load_model('tc-nltk-lstm-rnn.h5')

In [22]:
# # Load Model Configuration from JSON
# json_file = open('tc-nltk-lstm-rnn-topcat-10-mix.json', 'r')
# loaded_model_json = json_file.read()
# json_file.close()
# loaded_model = model_from_json(loaded_model_json)
# loaded_model.load_weights('model-10-mix-009-0.917161-0.840018.h5') # Change before running
# loaded_model.save('tc-nltk-lstm-rnn-10-mix-weights.hdf5')
# loaded_model=load_model('tc-nltk-lstm-rnn-10-mix-weights.hdf5')

# Load Best Model
loaded_model = load_model('model-topcat-10-mix-020-0.764940-0.693318.h5')

In [23]:
accr = model.evaluate(x_test, y_test, batch_size=256)
print()
print('Loss: {:0.3f}\tAccuracy: {:0.3f}'.format(accr[0], accr[1]))

accr2 = loaded_model.evaluate(x_test, y_test, batch_size=256)
print()
print('Loss: {:0.3f}\tAccuracy: {:0.3f}'.format(accr2[0], accr2[1]))


Loss: 0.006	Accuracy: 0.695

Loss: 0.006	Accuracy: 0.674


In [None]:
# plot_model(model, to_file='tc-nltk-lstm-4-model.png')

In [24]:
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Training and Validation Accuracy - LSTM RNN')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('tc-nltk-lstm-rnn-10-mix-acc.png', dpi=300, pad_inches=0.1)
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Training and Validation Loss - LSTM RNN')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('tc-nltk-lstm-rnn-10-mix-loss.png', dpi=300, pad_inches=0.1)
plt.show()

NameError: name 'history' is not defined