In [3]:
import keras, string, itertools, random, datetime, numpy as np, matplotlib.pyplot as plt, tensorflow as tf
from string import punctuation
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.callbacks import TensorBoard, EarlyStopping
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, Flatten, Dense, Dropout, MaxPool1D, LSTM, \
Bidirectional, TimeDistributed, Dropout

In [4]:
# Define custom functions
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    '''
    Description: Prints and plots the confusion matrix.	Normalization can be applied by setting `normalize=True`

    Args:
    - cm: Confusion Matrix
    - classes: Names of classes
    - normalize: Whether to or to not normal values in Confusion Matrix
    - cmap: Plot color
    '''

    # Check if normalize is true or false
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Format axis and plot Confusion Matrix
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

def chunk_seq(words, chunk_len):
    '''
    Description: Creates sequence of words of length equals to chunk_len

    Args:
    - words: Words in dataset
    - chunk_len: Breaks sequence into provided length
    
    Returns:
        Chunked dataset
    '''
    
    chunked_seq = []
    for i in range(0, len(words), chunk_len):
        chunked_seq.append(words[i:i+chunk_len])
    return chunked_seq

def get_labels(seq):
    labels_seq = []
    seq = seq.split()
    for i in range(len(seq)):
        if ',' in seq[i]:
            labels_seq.append('<comma>')
        elif '.' in seq[i]:
            labels_seq.append('<period>')
        elif '!' in seq[i]:
            labels_seq.append('<exclaim>')
        elif '?' in seq[i]:
            labels_seq.append('<question>')
        elif '...' in seq[i]:
            labels_seq.append('<3_dots>')
        else:
            labels_seq.append('<na>')
    return labels_seq

In [5]:
# Set model parameters
name_1 = 'lstm'
name_2 = 'cnn-lstm'
model_name = name_2
max_seq_len = 128
drop_prob = 0.2
no_filters_1 = 32
no_filters_2 = 64
no_filters_3 = 32
kernel_1 = 3
kernel_2 = 5
kernel_3 = 7
lstm_hidden = 100
embed_dim = 300
adam_lr = 0.001
batch_size = 128
epochs = 10
valid_split = 0.3

In [6]:
# Set misc parameters
current = datetime.datetime.now()
date = current.strftime('%b-%d')
tensor_b = TensorBoard(log_dir='./tf_logs/model_{}_hidden_{}_dropout_{}_embed_dim_{}_lr_{}'.format(model_name, 
                        lstm_hidden, drop_prob,
                        embed_dim, adam_lr), 
                        batch_size=batch_size, 
                        write_graph=True, histogram_freq=0)
early_s = EarlyStopping(monitor='val_loss')
# class_names = ['Pad', 'NA', 'Comma', 'Period']

# Look-up table to remove punctuations from data
table = str.maketrans('', '', punctuation)

In [7]:
# Load and process input/label data
data = open('./data/processed/train.word.txt', 'r', encoding='utf-8').read()
data = data.lower()
data_split = data.split('\n')
all_data = ' '.join(data_split)
longest_sent = max(data_split, key=len)
print(longest_sent)
print('Length of longest sentence', len(longest_sent.split()))
words = all_data.split()

y_labels = open('./data/processed/train.label.txt', 'r', encoding='utf-8').read()
y_labels = y_labels.split('\n')
y_labels = y_labels[:-1]
all_labels = ' '.join(y_labels)
labels_tag = all_labels.split()

# Chunk sequence and labels
x = chunk_seq(words, max_seq_len)
sequences = [' '.join(seq) for seq in x]

y = chunk_seq(labels_tag, max_seq_len)
process_labels = [' '.join(seq) for seq in y]

# Remove punctuations
sequences = [seq.translate(table) for seq in sequences]

# Check number of sequences and labels
print('Number of sequences: \t{}'.format(len(sequences)))
print('Number of labels: \t{}'.format(len(process_labels)))

this is one of the coldest places on earth the high arctic here the temperature drops to fifty degrees below freezing if i didn't have all this specialist clothing on the cold would kill me in minutes and yet there are animals that live here all the time and one of the most remarkable is hunting just over there an arctic fox the only reason that it and i don't freeze solid is that we're both mammals and have the mammal's ability to use our food to heat our bodies we're warm blooded the reason that it is more at home up here than i am is it has more of another mammalian characteristic hair than i have its body is insulated with fur warm bloodedness is one of the key factors that have enabled mammals to conquer the earth and to develop the most complex bodies in the whole animal kingdom in this series we will travel the world to discover just how varied and how astonishing mammals are we go to africa where the mammals are at their most spectacular here the plains are thronged with specia

Number of sequences: 	53918
Number of labels: 	53918


In [9]:
# Build words vocab
all_data = ' '.join(sequences)
words = all_data.split()
words_in_vocab = Counter(words)
vocab = sorted(words_in_vocab, key=words_in_vocab.get, reverse=True)

# Skip most common word
vocab_to_int = {word: index for index, word in enumerate(vocab, 2)}
vocab_to_int['-PAD-'] = 0  # The special value used for padding
vocab_to_int['-OOV-'] = 1  # The special value used for OOVs
unique_vocab = len(vocab_to_int)
print('Number of unique words:', unique_vocab)

# Build labels vocab
labels_in_vocab = Counter(labels_tag)
labels_vocab = sorted(labels_in_vocab, key=labels_in_vocab.get, reverse=True)
label_to_int = {t: i for i, t in enumerate(labels_vocab)}
label_to_int['-PAD-'] = 6  # The special value used to padding

# Check labels
no_classes = len(label_to_int)
print('Class distribution:', labels_in_vocab)
print('Number of unique labels:', no_classes)
print(Counter(label_to_int))

Number of unique words: 73199
Class distribution: Counter({'0': 5722741, '1': 582004, '2': 384732, '3': 109757, '4': 71598, '5': 30607})
Number of unique labels: 7
Counter({'-PAD-': 6, '5': 5, '4': 4, '3': 3, '2': 2, '1': 1, '0': 0})


In [10]:
print(sequences[10])
print(process_labels[10])

in the country its difficult to build a defence around that and youre just gonna let this happen no no its all with my lawyers and everything shell probably end up with no more than sixty or seventy per cent of my earnings and everything i own actually karen can we talk about something else cos i think i might you know cry look what are we doing in a pub when youve got this going on making it better oh hang on look youre right no you are but dont just split us up like that we dont have to do that cos i need you look well finish our drinks and then well sort this shit out but lets do it together please and then i
0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 3 2 2 0 0 0 0 0 0 1 0 0 0 0 0 0 0 5 0 0 0 0 0 0 0 1 0 0 0 1 2 5 0 0 0 0 0 2 0 0 0 0 5 0 5 5 2 0 0 0 0 0 0 0 0 0 0 0 0 3 0 0 1 1 0 2 2 0 1 2 0 4 0 0 0 0 0 0 0 2 0 0 0 0 0 2 0 0 0 1 2 0 0 0 2 0 0 0 0 0 0 1 0 0 0 0 1 3 0 0 0


In [11]:
# Tokenize input sequences
seq_int = []
for seq in sequences:
    seq_int.append([vocab_to_int[word] for word in seq.split()])

# Pad input sequences
pad_seq = pad_sequences(sequences=seq_int, maxlen=max_seq_len, padding='post', value=0)

# Check sample sequence
print('Sample sequence:', sequences[-1])
print('\n')
print('Sample sequence:', pad_seq[-1])
print('\n')

# Tokenize output labels
lab_int = []
for lab in process_labels:
    lab_int.append([word for word in lab.split()])

# Pad input labels
pad_labels = pad_sequences(sequences=lab_int, maxlen=max_seq_len, padding='post', value=6)
encoded_labels = [to_categorical(i, num_classes=no_classes) for i in pad_labels]

# Check sample label
print('Sample label:', pad_labels[-1])
print('\n')
print('Encoded label', encoded_labels[-1])
print('\n')
# Check max seq length
print("Maximum sequence length: {}".format(max_seq_len))
print('\n')

# Check that all sequences and labels are at max sequence length 
assert len(pad_seq)==len(seq_int)
assert len(pad_seq[0])==max_seq_len

assert len(pad_labels)==len(lab_int)
assert len(pad_labels[0])==max_seq_len
print('Sequence and labels length check passed!')

Sample sequence: my soul again but there was nothing i could find im home again but ive been here before old friends and i know ill keep consoled again in the past ive left behind when i was a boy well i heard somebody singing and i heard the guitars ringing and it brought me home again what do you miss of your old life


Sample sequence: [   43  1907   167    24    32    17   252     9    92   166    46   175
   167    24   111    65    57   165   189   432     5     9    60   147
   216 37530   167    11     2   487   111   218   341    71     9    17
     4   444    48     9   482   537  1219     5     9   482     2  8399
  3604     5     8   586    36   175   167    23    29     6   562     7
    42   189   196     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
  

In [12]:
# Split train and label dataset
train_test_split_frac = 0.8
split_index = int(0.8*len(pad_seq))

# Split data into training, validation, and test data (features and labels, x and y)
train_val_x, test_x = pad_seq[:split_index], pad_seq[split_index:]
train_val_y, test_y = encoded_labels[:split_index], encoded_labels[split_index:]

# print out the shapes of your resultant feature data
print('Training/Validation Dataset: \t{}'.format(train_val_x.shape), len(train_val_y))
print('Testing Dataset: \t\t{}'.format(test_x.shape), len(test_y))

Training/Validation Dataset: 	(43134, 128) 43134
Testing Dataset: 		(10784, 128) 10784


In [13]:
# Model code
model = Sequential()
model.add(Embedding(input_dim=unique_vocab, output_dim=embed_dim, input_length=max_seq_len))
model.add(Conv1D(filters=no_filters_1, kernel_size=kernel_1, padding='SAME', strides=1))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Conv1D(filters=no_filters_2, kernel_size=kernel_2, padding='SAME', strides=1))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Conv1D(filters=no_filters_3, kernel_size=kernel_3, padding='SAME', strides=1))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Bidirectional(LSTM(lstm_hidden, return_sequences=True)))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(Bidirectional(LSTM(lstm_hidden, return_sequences=True)))
model.add(Dropout(rate=drop_prob, seed=50))
model.add(TimeDistributed(Dense(no_classes, activation='softmax')))
model.compile(loss='categorical_crossentropy', optimizer=Adam(adam_lr), 
              metrics=['accuracy'])#, ignore_class_accuracy(0)])
model.summary()
model.fit(x=train_val_x, y=np.array(train_val_y), batch_size=batch_size, 
          epochs=epochs, validation_split=valid_split, steps_per_epoch=None, validation_steps=None,
          shuffle=True, verbose=1, callbacks=[tensor_b, early_s])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 128, 300)          21959700  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 128, 32)           28832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 128, 32)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 128, 64)           10304     
_________________________________________________________________
dropout_2 (Dropout)          (None, 128, 64)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 128, 32)           14368     
_________________________________________________________________
dropout_3 (Dropout)          (None, 128, 32)           0         
__________

<keras.callbacks.History at 0x7ffa83e9ed68>

In [14]:
test_data = test_x[56]
pred_x_seq = []
for x in test_data:
    for value, index in vocab_to_int.items():
        if x == index:
            pred_x_seq.append(value)

# Predicted output
pred_expand = model.predict(np.expand_dims(test_data, axis=0))
pred_y = []
for y in pred_expand:
    pred_y.append(np.argmax(y, axis=1))
print('Predictions Index:')
print(pred_y)

# pred_y_seq = []
# for x in pred_y:
#     for y in x:
#         for value, index in label_to_int.items():
#             if y == index:
#                 pred_y_seq.append(value)

# combined = []
# for i in range(len(pred_x_seq)):
#     if pred_y_seq[i] == '<comma>':
#         combined.append(str(pred_x_seq[i])+',')
#     elif pred_y_seq[i] == '<period>':
#         combined.append(str(pred_x_seq[i])+'.')
#     else:
#         combined.append(str(pred_x_seq[i]))

# for i in range(len(combined)):
#     if '.' in combined[i]:
#         combined[i+1] = combined[i+1].capitalize()
#     if combined[i] == 'i':
#         combined[i] = combined[i].capitalize()
#     else:
#         continue
        
# combined = ' '.join(combined)
# combined

# print('\n')
# print('Prediction sequence:')            
# print(' '.join(pred_x_seq))
# print('\n')
# print('Prediction output:')
# print(' '.join(pred_y_seq))
# print('\n')
# print('Combined prediction:')
# print(combined)

Predictions Index:
[array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0,
       0, 3, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 0, 0, 0])]


In [16]:
# Create confusion matrix and classification report
for_report = model.predict(test_x)
out_pred = [np.argmax(x, axis=1) for x in for_report]
out_pred = np.concatenate(out_pred, axis=0)

y_ = [np.argmax(x, axis=1) for x in test_y]
y_ = np.concatenate(y_, axis=0)

# print('Test dataset distribution:', counts)
cm = confusion_matrix(y_true=y_, y_pred=out_pred)
print(cm)

cr = classification_report(y_true=y_, y_pred=out_pred)
print(cr)

plt.figure()
plot_confusion_matrix(cm, classes=class_names, normalize=True, title='Normalized Confusion Matrix')
plt.show()

[[1105979   26756   10579    2628     757       2       0]
 [  31349   71082    8023    2062     963       5       0]
 [  30778   23945   20523    1356     862       9       0]
 [   6440    4991    1015    9674     121       1       0]
 [   3443    6901    1629     370    1777       2       0]
 [   3016    2136     831     176      98       8       0]
 [     38       0      27       0       0       0       0]]


  'precision', 'predicted', average, warn_for)


              precision    recall  f1-score   support

           0       0.94      0.96      0.95   1146701
           1       0.52      0.63      0.57    113484
           2       0.48      0.26      0.34     77473
           3       0.59      0.43      0.50     22242
           4       0.39      0.13      0.19     14122
           5       0.30      0.00      0.00      6265
           6       0.00      0.00      0.00        65

   micro avg       0.88      0.88      0.88   1380352
   macro avg       0.46      0.35      0.37   1380352
weighted avg       0.86      0.88      0.87   1380352



NameError: name 'class_names' is not defined

<Figure size 432x288 with 0 Axes>