In [1]:
# This Python file uses the following encoding: utf-8
from keras.models import Sequential
from keras.models import model_from_json
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout, Embedding
from keras.models import model_from_json
import signal
from keras.optimizers import Adam, RMSprop
from keras.preprocessing import sequence
from sklearn.model_selection import train_test_split
from keras.callbacks import EarlyStopping, ModelCheckpoint
from lib.pretty_testing import predict_test, roc_curve_plot
from sklearn.metrics import roc_curve, auc, precision_recall_fscore_support
import numpy as np
import pickle

# Seed NumPy's global random number generator.
# NOTE(review): only NumPy is seeded here; the Keras/TensorFlow backend is not,
# so confirm whether fully repeatable training is expected.
np.random.seed(8)

# --- input / output locations (relative paths) ---
dataset_file = '../datasets/word_embedded_docs.p'
model_file = '../models/keras_model_word_embedding.json'
model_weights_file = '../models/keras_weights_word_embedding.h5'
roc_fig_filename = '../log_figs/roc_curve.png'

# --- run mode: True builds and fits a new model, False restores the saved one ---
training = False

# --- optimisation settings ---
lr = 3e-4
epochs = 100
patience = 3  # early-stopping patience, in epochs

# --- vocabulary / sequence settings ---
top_words = 10000              # input dimension of the Embedding layer
embedding_vector_length = 32   # output dimension of the Embedding layer
maxlen = 500                   # length every sequence is padded/truncated to

# Load the prepared dataset: the pickle holds a (data, labels) pair.
# The file is opened in binary mode because pickle streams are bytes; text
# mode fails on Python 3 and can corrupt the stream on Windows.
# NOTE(security): pickle.load can execute arbitrary code while unpickling,
# so only point dataset_file at files from a trusted source.
with open(dataset_file, 'rb') as f:
    data, labels = pickle.load(f)

print('Data loaded')
    
# Bring every sequence to exactly `maxlen` entries for the RNN: shorter ones
# are padded with 0 at the front, longer ones are truncated at the end.
padded_data = sequence.pad_sequences(
    data,
    maxlen=maxlen,
    dtype='uint32',
    padding='pre',
    truncating='post',
    value=0
)

# Labels as a NumPy array.
labels = np.array(labels)

# The unpadded sequences are not used after this point.
del data

print('Data padded')

# Stratified hold-out split: 90% of the samples are kept for training and
# validation, the remainder becomes the test set.
X_trainval, X_test, y_trainval, y_test = train_test_split(
    padded_data, labels, train_size=0.9, stratify=labels)
del padded_data

# Split the training portion again (stratified): 95% for fitting, the rest
# for validation during training.
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, train_size=0.95, stratify=y_trainval)
del X_trainval, y_trainval

print('Data splitted')


# Either build and fit a fresh model (training=True) or restore the saved
# architecture and weights from disk (training=False).
if training:
    # Embedding -> LSTM -> single sigmoid unit (binary classification).
    # mask_zero=True treats index 0, the padding value, as a masked timestep.
    # NOTE(review): presumably every index in the dataset is below top_words,
    # as the Embedding layer requires; confirm against the dataset preparation.
    model = Sequential()
    model.add(Embedding(top_words, embedding_vector_length, input_length=maxlen, mask_zero=True))
    model.add(LSTM(32, dropout=0.2))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=lr), metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() only added a stray "None" line to the output.
    model.summary()

    # Persist the architecture so the non-training branch can rebuild it.
    with open(model_file, 'w') as f:
        f.write(model.to_json())

    print('Start training')

    # Stop once val_acc has not improved for `patience` epochs, and keep only
    # the best-scoring weights on disk.
    model.fit(X_train, y_train, epochs=epochs, batch_size=64, validation_data=(X_val, y_val),
              callbacks=[EarlyStopping(monitor='val_acc', patience=patience),
                         ModelCheckpoint(model_weights_file, monitor='val_acc', save_best_only=True, save_weights_only=True)])

    # fit() leaves the last-epoch weights in memory, which may be later than
    # the best validation epoch. Reload the best checkpoint so the evaluation
    # uses the same weights that training=False would load.
    model.load_weights(model_weights_file)
else:
    # Rebuild the architecture from JSON; the context manager closes the file
    # handle instead of leaking it.
    with open(model_file) as f:
        model = model_from_json(f.read())
    model.load_weights(model_weights_file)
    model.compile(loss='binary_crossentropy', optimizer=RMSprop(lr=lr), metrics=['accuracy'])
    model.summary()

# Evaluate on the held-out test split.
# predict_test is a project helper from lib.pretty_testing; its return value
# is kept as y_pred.
# NOTE(review): judging by the recorded output it also prints a classification
# report and a confusion matrix; confirm against the helper.
y_pred = predict_test(model, X_test, y_test, ['non_cost', 'cost'])

# Flatten the model outputs into a 1-D score vector for the ROC computation.
scores = np.ravel(model.predict(X_test, verbose=0))

# ROC curve with label 1 as the positive class, then the area under it.
fpr, tpr, _ = roc_curve(y_true=y_test, y_score=scores, pos_label=1)
roc_auc = auc(fpr, tpr)
print('\nArea under ROC curve: {}'.format(roc_auc))

# ROC figure export, currently disabled:
#fig = roc_curve_plot(fpr, tpr, roc_auc)
#fig.savefig(roc_fig_filename)

Using TensorFlow backend.


Data loaded
Data padded
Data splitted




_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 500, 32)           320000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 328,353
Trainable params: 328,353
Non-trainable params: 0
_________________________________________________________________
None
()
Test classification report
Accuracy: 0.988180
             precision    recall  f1-score   support

          0       1.00      0.98      0.99      1903
          1       0.98      1.00      0.99      1904

avg / total       0.99      0.99      0.99      3807

Test confusion Matrix
             non_cost     cost
    non_cost   1866.0     37.0
        cost      8.0   1896.0

Area under ROC 

In [2]:
# Per-class precision, recall, F-score and support on the test split
# (displayed as the cell's output).
precision_recall_fscore_support(y_true=y_test, y_pred=y_pred)

(array([ 0.99573106,  0.98085877]),
 array([ 0.98055702,  0.99579832]),
 array([ 0.98808578,  0.98827209]),
 array([1903, 1904]))