In [1]:
# Text Classification / tc-nltk-ae.ipynb
# Gourav Siddhad
# 26-Mar-2019

In [2]:
print('Importing Libraries', end='')

import nltk
from nltk import word_tokenize
from nltk.corpus import reuters, stopwords
from nltk.stem.porter import PorterStemmer

from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, auc
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer, minmax_scale
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt
from scipy import interp
from itertools import cycle

import re
import os
import numpy as np
import pandas as pd
import time
from numpy.random import seed

from keras.models import Model, model_from_json, load_model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical, plot_model
from keras.callbacks import EarlyStopping, ModelCheckpoint

%matplotlib inline

from scipy import interp
from itertools import cycle

print(' - Done')

Importing Libraries

Using TensorFlow backend.


 - Done


# Preprocessing

In [3]:
documents = reuters.fileids()
print('Total Documents -', len(documents))

print('Extracting (Id, Docs and Labels)', end='')
train_docs_id = list(filter(lambda doc: doc.startswith("train"), documents))
test_docs_id = list(filter(lambda doc: doc.startswith("test"), documents))

train_docs = [reuters.raw(doc_id) for doc_id in train_docs_id]
test_docs = [reuters.raw(doc_id) for doc_id in test_docs_id]
all_docs = train_docs
all_docs += test_docs

train_labels = [reuters.categories(doc_id) for doc_id in train_docs_id]
test_labels  = [reuters.categories(doc_id) for doc_id in test_docs_id]
all_labels = train_labels
all_labels += test_labels
print(' - Done')

del train_docs
del test_docs
del train_labels
del test_labels

print('Documents - ', len(all_docs))
print('Labels  - ', len(all_labels))

# List of categories
categories = reuters.categories()
print('Categories - ', len(categories))

print('Caching Stop Words', end='')
cachedStopWords = stopwords.words("english")
print(' - Done')

Total Documents - 10788
Extracting (Id, Docs and Labels) - Done
Documents -  10788
Labels  -  10788
Categories -  90
Caching Stop Words - Done


In [4]:
print('Sorting Train:Test Docs', end='')
X_train, X_test, y_train, y_test = train_test_split(all_docs, all_labels, test_size=0.2, random_state=42)
print(' - Done')

maxwords = 10000

print('Tokenizing', end='')
tk = Tokenizer(num_words=maxwords)
tk.fit_on_texts(X_train)
tk.fit_on_texts(X_test)
index_list_train = tk.texts_to_sequences(X_train)
index_list_test = tk.texts_to_sequences(X_test)
print(' - Done')

Sorting Train:Test Docs - Done
Tokenizing - Done


In [5]:
# max of index_list_train
# max of index_list_test

maxlen = 200

for i in index_list_train:
    if len(i)>maxlen:
        maxlen = len(i)
print(maxlen)

for i in index_list_test:
    if len(i)>maxlen:
        maxlen = len(i)
print(maxlen)

2354
2354


In [6]:
maxlen = 2000

print('Padding Sequences', end='')
x_train = sequence.pad_sequences(index_list_train, maxlen=maxlen)
x_test = sequence.pad_sequences(index_list_test, maxlen=maxlen)
print(' - Done')

print('Binarizing MultiLabels', end='')
lb = MultiLabelBinarizer()
y_train = lb.fit_transform(y_train)
y_test = lb.transform(y_test)
print(' - Done')

Padding Sequences - Done
Binarizing MultiLabels - Done


# Training and Testing

In [23]:
def ae():
    inputs = Input(name='inputs', shape=[maxlen])
    layer = Embedding(maxwords, 512, input_length=maxlen)(inputs)
    layer = Dense(32)(layer)
    layer = Activation('relu')(layer)
    
    layer = Dense(512)(layer)
    layer = Activation('sigmoid')(layer)
    layer = Dense(maxlen)(layer)
    
    model = Model(inputs=inputs, outputs=layer)
    return model

model = ae()
model.summary()
model.compile(loss='mse', optimizer='adadelta', metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 2000)              0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 2000, 512)         5120000   
_________________________________________________________________
dense_20 (Dense)             (None, 2000, 32)          16416     
_________________________________________________________________
activation_11 (Activation)   (None, 2000, 32)          0         
_________________________________________________________________
dense_21 (Dense)             (None, 2000, 512)         16896     
_________________________________________________________________
activation_12 (Activation)   (None, 2000, 512)         0         
_________________________________________________________________
dense_22 (Dense)             (None, 2000, 2000)        1026000   
Total para

In [26]:
checkpoint = ModelCheckpoint('model-ae-{epoch:03d}.h5', 
                             verbose=1, monitor='val_loss', save_best_only=True, mode='auto')

model.fit(x_train, x_train, epochs=5, batch_size=64, shuffle=True,
                validation_data=(x_test, x_test), callbacks=[checkpoint])

ValueError: Error when checking target: expected dense_22 to have 3 dimensions, but got array with shape (8630, 2000)

In [None]:
# Save Complete Model
model.save('tc-nltk-ae.h5')

# Save Model Configuration to JSON
model_json = model.to_json()
with open('tc-nltk-ae.json', 'w') as json_file:
    json_file.write(model_json)

# Load a Saved Model
# model = load_model('tc-nltk-lstm-rnn.h5')

In [None]:
# Load Model Configuration from JSON
json_file = open('tc-nltk-ae.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
loaded_model.load_weights('model-ae-012-0.994738-0.946830.h5') # Change before running
loaded_model.save('tc-nltk-ae-weights.hdf5')
loaded_model=load_model('tc-nltk-ae-weights.hdf5')

In [None]:
accr = model.evaluate(x_test, y_test, batch_size=256)
print()
print('Loss: {:0.3f}\tAccuracy: {:0.3f}'.format(accr[0], accr[1]))

In [None]:
# Plot training & validation accuracy values
plt.plot(history.history['acc'])
plt.plot(history.history['val_acc'])
plt.title('Training and Validation Accuracy - LSTM RNN')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('tc-nltk-ae-acc.png', dpi=300, pad_inches=0.1)
plt.show()

# Plot training & validation loss values
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Training and Validation Loss - LSTM RNN')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper left')
plt.savefig('tc-nltk-ae-loss.png', dpi=300, pad_inches=0.1)
plt.show()