In [1]:
import keras, string, itertools, random, datetime, numpy as np, matplotlib.pyplot as plt, tensorflow as tf
from string import punctuation
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.callbacks import TensorBoard, EarlyStopping
from keras.models import Sequential
from keras.utils import to_categorical
from keras.initializers import glorot_uniform, random_uniform
from keras.activations import relu
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, Flatten, Dense, Dropout, MaxPool1D, LSTM, \
Bidirectional, TimeDistributed, Dropout
# from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


In [2]:
# Define custom functions
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    '''
    Description: Prints and plots the confusion matrix.	Normalization can be applied by setting `normalize=True`

    Args:
    - cm: Confusion Matrix
    - classes: Names of classes
    - normalize: Whether to or to not normal values in Confusion Matrix
    - cmap: Plot color
    '''

    # Check if normalize is true or false
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Format axis and plot Confusion Matrix
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

In [3]:
# Set model parameters
name_1 = 'lstm'
name_2 = 'cnn-lstm'
model_name = name_1
max_seq_len = 128
drop_prob = 0.2
no_filters_1 = 32
no_filters_2 = 64
no_filters_3 = 32
kernel_weight = glorot_uniform(seed=50)
bias = random_uniform(seed=50)
kernel_1 = 3
kernel_2 = 5
kernel_3 = 7
lstm_hidden = 128
embed_dim = 256
adam_lr = 0.001
batch_size = 128
epochs = 10
valid_split = 0.3

In [4]:
# Set misc parameters
current = datetime.datetime.now()
date = current.strftime('%b-%d')
tensor_b = TensorBoard(log_dir='./tf_logs/model_{}_hidden_{}_dropout_{}_embed_dim_{}_lr_{}'.format(model_name, 
                        lstm_hidden, drop_prob,
                        embed_dim, adam_lr), 
                        batch_size=batch_size, 
                        write_graph=True, histogram_freq=0)
early_s = EarlyStopping(monitor='val_loss')
# class_names = ['Pad', 'NA', 'Comma', 'Period']

# Look-up table to remove punctuations from data
table = str.maketrans('', '', punctuation)

In [5]:
# Load and process input/label data
data = open('./data/processed/train.word.txt', 'r', encoding='utf-8').read()
data = data.lower()
data_split = data.split('\n')
data_split = data_split[:-1]

y_labels = open('./data/processed/train.label.txt', 'r', encoding='utf-8').read()
y_labels = y_labels.split('\n')
# **** To CHECK **** #
y_labels = y_labels[:-1]

# Check number of sequences and labels
print('Number of sequences: \t{}'.format(len(data_split)))
print('Number of labels: \t{}'.format(len(y_labels)))

Number of sequences: 	284436
Number of labels: 	284436


In [6]:
# Build words vocab
# Get all words in the dataset
all_data = ' '.join(data_split)
words = all_data.split()
words_in_vocab = Counter(words)
vocab = sorted(words_in_vocab, key=words_in_vocab.get, reverse=True)

# Skip most common word
vocab_to_int = {word: index for index, word in enumerate(vocab, 2)}
vocab_to_int['<pad>'] = 0  # The special value used for padding
vocab_to_int['<oov>'] = 1  # The special value used for OOVs
unique_vocab = len(vocab_to_int)
print('Number of unique words:', unique_vocab)
print('\n')

# Build labels vocab
all_labels = ' '.join(y_labels)
labels_tag = all_labels.split()
labels_in_vocab = Counter(labels_tag)
labels_vocab = sorted(labels_in_vocab, key=labels_in_vocab.get, reverse=True)
label_to_int = {t: i for i, t in enumerate(labels_vocab)}
label_to_int['<pad>'] = 6  # The special value used to padding

# Check labels
no_classes = len(label_to_int)
print('Class distribution:', labels_in_vocab)
print('\n')

print('Number of unique labels:', no_classes)
print('\n')

print(Counter(label_to_int))

Number of unique words: 76055


Class distribution: Counter({'0': 5722741, '1': 582004, '2': 384732, '3': 109757, '4': 71598, '5': 30607})


Number of unique labels: 7


Counter({'<pad>': 6, '5': 5, '4': 4, '3': 3, '2': 2, '1': 1, '0': 0})


In [7]:
# Tokenize input sequences
seq_int = []
for seq in data_split:
    seq_int.append([vocab_to_int[word] for word in seq.split()])

# Pad input sequences
pad_seq = pad_sequences(sequences=seq_int, maxlen=max_seq_len, padding='post', value=0)

# Check sample sequence
print('Sample sequence:', data_split[-1])
print('\n')
print('Sample sequence:', pad_seq[-1])
print('\n')

# Tokenize output labels
lab_int = []
for lab in y_labels:
    lab_int.append([word for word in lab.split()])

# Pad input labels
pad_labels = pad_sequences(sequences=lab_int, maxlen=max_seq_len, padding='post', value=6)
encoded_labels = [to_categorical(i, num_classes=no_classes) for i in pad_labels]

# Check sample label
print('Sample label:', pad_labels[-1])
print('\n')
print('Encoded label', encoded_labels[-1])
print('\n')
# Check max seq length
print("Maximum sequence length: {}".format(max_seq_len))
print('\n')

# Check that all sequences and labels are at max sequence length 
assert len(pad_seq)==len(seq_int)
assert len(pad_seq[0])==max_seq_len

assert len(pad_labels)==len(lab_int)
assert len(pad_labels[0])==max_seq_len
print('Sequence and labels length check passed!')

Sample sequence: what do you miss of your old life


Sample sequence: [ 23  29   6 561   7  42 190 197   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0]


Sample label: [0 0 0 0 0 0 0 3 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6
 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6 6]


Encoded label [[1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 

In [8]:
# Split train and label dataset
train_test_split_frac = 0.8
split_index = int(0.8*len(pad_seq))

# Split data into training, validation, and test data (features and labels, x and y)
train_val_x, test_x = pad_seq[:split_index], pad_seq[split_index:]
train_val_y, test_y = encoded_labels[:split_index], encoded_labels[split_index:]

# print out the shapes of your resultant feature data
print('Training/Validation Dataset: \t{}'.format(train_val_x.shape), len(train_val_y))
print('Testing Dataset: \t\t{}'.format(test_x.shape), len(test_y))

Training/Validation Dataset: 	(227548, 128) 227548
Testing Dataset: 		(56888, 128) 56888


In [9]:
# Model code
model = Sequential()
model.add(Embedding(input_dim=unique_vocab, output_dim=embed_dim, input_length=max_seq_len))
# model.add(Conv1D(filters=no_filters_1, kernel_initializer=kernel_weight, bias_initializer=bias,
#                  kernel_size=kernel_1, activation=relu, padding='SAME', strides=1))
# model.add(Dropout(rate=drop_prob, seed=50))
# model.add(Conv1D(filters=no_filters_1, kernel_initializer=kernel_weight, bias_initializer=bias,
#                  kernel_size=kernel_1, activation=relu, padding='SAME', strides=1))
# model.add(Dropout(rate=drop_prob, seed=50))
# model.add(Conv1D(filters=no_filters_1, kernel_initializer=kernel_weight, bias_initializer=bias,
#                  kernel_size=kernel_1, activation=relu, padding='SAME', strides=1))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Bidirectional(LSTM(lstm_hidden, return_sequences=True)))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Bidirectional(LSTM(lstm_hidden, return_sequences=True)))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(TimeDistributed(Dense(no_classes, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(adam_lr), 
              metrics=['accuracy'])#, ignore_class_accuracy(0)])
model.summary()
model.fit(x=train_val_x, y=np.array(train_val_y), batch_size=batch_size, 
          epochs=epochs, validation_split=valid_split, steps_per_epoch=None, validation_steps=None,
          shuffle=True, verbose=1, callbacks=[tensor_b, early_s])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 256)          19470080  
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 256)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 128, 256)          394240    
_________________________________________________________________
dropout_2 (Dropout)          (None, 128, 256)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 128, 256)          394240    
_________________________________________________________________
dropout_3 (Dropout)          (None, 128, 256)          0         
_________________________________________________________________
time_distributed_1 (TimeDist (None, 128, 7)            1799      
Total para

<keras.callbacks.History at 0x1b9c581f198>

In [10]:
# restore = {'na': 0, ''}
test_data = test_x[585]
pred_x_seq = []
for x in test_data:
    for value, index in vocab_to_int.items():
        if x == index:
            pred_x_seq.append(value)

# Predicted output
pred_expand = model.predict(np.expand_dims(test_data, axis=0))
pred_y = []
for y in pred_expand:
    pred_y.append(np.argmax(y, axis=1))
print('Predictions Index:')
print(pred_y)

pred_y_seq = []
for x in pred_y:
    for y in x:
        for value, index in label_to_int.items():
            if y == index:
                pred_y_seq.append(value)

combined = []
for i in range(len(pred_x_seq)):
    if pred_y_seq[i] == '<comma>':
        combined.append(str(pred_x_seq[i])+',')
    elif pred_y_seq[i] == '<period>':
        combined.append(str(pred_x_seq[i])+'.')
    else:
        combined.append(str(pred_x_seq[i]))

for i in range(len(combined)):
    if '.' in combined[i]:
        combined[i+1] = combined[i+1].capitalize()
    if combined[i] == 'i':
        combined[i] = combined[i].capitalize()
    else:
        continue
        
combined = ' '.join(combined)
combined = combined.replace('<pad>', '')

print('\n')
print('Prediction sequence:')            
print(' '.join(pred_x_seq))
print('\n')
print('Prediction output:')
print(' '.join(pred_y_seq))
print('\n')
print('Combined prediction:')
print(combined)

Predictions Index:
[array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6], dtype=int64)]


Prediction sequence:
would you have considered keeping it a little person who keeps you company and loves you for ever <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad> <pad

In [12]:
# Create confusion matrix and classification report
for_report = model.predict(test_x)
out_pred = [np.argmax(x, axis=1) for x in for_report]
out_pred = np.concatenate(out_pred, axis=0)

y_ = [np.argmax(x, axis=1) for x in test_y]
y_ = np.concatenate(y_, axis=0)

# print('Test dataset distribution:', counts)
cm = confusion_matrix(y_true=y_, y_pred=out_pred)
print(cm)

cr = classification_report(y_true=y_, y_pred=out_pred)
print(cr)

plt.figure()
plot_confusion_matrix(cm, classes=class_names, normalize=True, title='Normalized Confusion Matrix')
plt.show()

[[1050239   11599   11908    1325     458     147       0]
 [  20232   75289    9446    1946    1261     151      10]
 [  27263   14089   30198    1068     605      96       1]
 [   3131    4055     916   13663     134      38       0]
 [   2193    7498    1636     339    2120      37       1]
 [   2258    2255     856     198     132     346       0]
 [      0       0       0       0       0       0 5982527]]
              precision    recall  f1-score   support

           0       0.95      0.98      0.96   1075676
           1       0.66      0.69      0.67    108335
           2       0.55      0.41      0.47     73320
           3       0.74      0.62      0.68     21937
           4       0.45      0.15      0.23     13824
           5       0.42      0.06      0.10      6045
           6       1.00      1.00      1.00   5982527

   micro avg       0.98      0.98      0.98   7281664
   macro avg       0.68      0.56      0.59   7281664
weighted avg       0.98      0.98      0.98 

NameError: name 'class_names' is not defined

<Figure size 432x288 with 0 Axes>