### CNN+BiLSTM approach to Part-of-Speech (PoS) tagging to predict and restore punctuation in sentences

The prevalent take on PoS problems has been to use BiLSTM or LSTM models due to their ability to capture and learn dependency information of sentences, which is then used to make predictions.

This project combines a CNN with a BiLSTM, making use of the CNN's ability to capture word-level and morphological features of sentences and forwarding them to the BiLSTM.

The intended outcome is a hybrid model which outperforms a plain BiLSTM model.

In [1]:
# Import required packages and dependencies
import io, json, keras, string, itertools, random, datetime, numpy as np, matplotlib.pyplot as plt, tensorflow as tf
from string import punctuation
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.callbacks import TensorBoard, EarlyStopping
from keras.models import Sequential, Model
from keras.utils import to_categorical
from keras.initializers import glorot_uniform, random_uniform
from keras.layers import Activation
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, Flatten, Dense, Dropout, LSTM, Bidirectional, TimeDistributed, \
Dropout, Input, concatenate, Reshape
from keras import regularizers
from keras.utils import plot_model
from keras.layers.normalization import BatchNormalization

Using TensorFlow backend.


#### Custom functions to help display the Confusion Matrix and process the dataset

In [2]:
# Define custom functions
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion matrix', cmap=plt.cm.Blues):
    '''
    Description:
        - Prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`

    Args:
        - cm: Confusion Matrix (2-D numpy array)
        - classes: Names of classes, used as tick labels on both axes
        - normalize: Whether to or to not normalize values in Confusion Matrix; each row is divided by its own sum
        - title: Plot title
        - cmap: Plot color

    Returns:
        - None, the matrix is printed and drawn with matplotlib
    '''

    # Check if normalize is true or false
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # A row that sums to zero would divide by zero and fill the row (and the
        # color threshold below) with NaN, so such rows are left at 0 instead
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'), where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    # Format axis and plot Confusion Matrix
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # Write every cell value on top of the image; cells darker than half of the
    # maximum get white text so the number stays readable
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                horizontalalignment="center",
                color="white" if cm[i, j] > thresh else "black")

def get_labels(seq):
    '''
    Description:
        - Maps every whitespace-separated token of the input sequence to a punctuation label

    Args:
        - seq: Input sequence (string)

    Returns:
        - List holding one label per token, '<na>' when the token contains no tracked punctuation
    '''

    # Ordered (marker, label) pairs: the first marker found inside a token wins,
    # which is why '...' has to be tested before '.'
    markers = (
        ('...', '<3-dots>'),
        (',', '<comma>'),
        ('.', '<period>'),
        ('?', '<question>'),
        ('!', '<exclaim>'),
    )

    def label_for(token):
        for mark, label in markers:
            if mark in token:
                return label
        return '<na>'

    return [label_for(token) for token in seq.split()]

#### Set model and project parameters/settings

In [3]:
# Set model parameters
# Name of this run, used in the TensorBoard log directory name
model_name = 'ted-glove-cnn-lstm'

# Dimension of the embedding layer, must match that of the word vectors
embed_dim = 300

# Maximum sequence length, every sequence and its labels are padded/truncated to this many tokens
max_seq_len = 128

# Dropout rate shared by all Dropout layers of the model
drop_prob = 0.35

# Number of filters for each CNN layer
filter_sizes = [32,32,32]

# Kernel size for each CNN layer (one entry per entry of filter_sizes)
kernels = [3,5,7]

# Weights and bias initialization for each CNN layer
kernel_weight = glorot_uniform(seed=50)
bias = glorot_uniform(seed=50)

# Regularization for each CNN layer
# NOTE(review): cnn_lstm builds its Conv1D layers with kernel_regularizer=None, so this
# value does not appear to be applied anywhere - confirm whether that is intended
kernel_reg = regularizers.l2(l=0.0001)

# Number of hidden units for Dense layer (also written into the TensorBoard log directory name)
lstm_hidden = 1024

# Number of hidden units for BiLSTM layer
lstm_hidden_2 = 1024

# Learning rate for Adam optimizer
adam_lr = 0.001

# Batch size
batch_size = 64

# Number of epochs to train for (upper bound, early stopping may end training sooner)
epochs = 50

# Portion of training data to be used for validation
valid_split = 0.3

In [4]:
# Set misc parameters
# Get current date and time
# NOTE(review): `date` is not referenced by any other code visible in this notebook
current = datetime.datetime.now()
date = current.strftime('%b-%d')

# Tensorboard settings
# The log directory name encodes the main hyperparameters so runs can be told apart
tensor_b = TensorBoard(log_dir='./tf_logs/model_{}_hidden_{}_dropout_{}_embed_dim_{}_lr_{}'.format(model_name, 
                        lstm_hidden, drop_prob,
                        embed_dim, adam_lr), 
                        batch_size=batch_size, 
                        write_graph=True, histogram_freq=0)

# Set model training early stop criteria
# Monitors the validation loss with a patience of 5 epochs
early_s = EarlyStopping(monitor='val_loss', patience=5, verbose=1)

# Set class names
# The position of each name has to match the integer id of the label (0 = padding).
# NOTE(review): label ids are assigned by label frequency when the label vocabulary is
# built, so this hard-coded order presumably needs re-checking if the dataset changes
class_names = ['Pad', 'NA', 'Comma', 'Period', 'Question', 'Exclaim', '3-Dots']

#### Read, load and process the TED Talks dataset into training and validation sets with their corresponding labels

In [5]:
# Read and load dataset (the context manager closes the file handle once it is read)
with open('./data/processed/ted_data', 'r', encoding='utf-8') as f:
    data = f.read()

# Convert all characters to lowercase
data = data.lower()

# Look-up table to remove punctuations from data
table = str.maketrans('', '', punctuation)

# Define and remove characters and bracketed actions
replace = ['♫', '♪', '–', '…', '(applause)', '(laughter)']
for unwanted in replace:
    data = data.replace(unwanted, ' ')

# Split dataset by sentences
data_split = data.split('\n')
print('Pre number of sentences:', len(data_split))
print('\n')
# Get longest sentence in dataset and its index
# (computed rather than hard-coded so the index cannot drift from the data)
longest_idx, longest_sent = max(enumerate(data_split), key=lambda x: len(x[1]))
print((longest_idx, longest_sent))
print('\n')
print('Length of longest sentence:', len(longest_sent))

# Clean and split the longest sentence into multiple ones based on full-stops
long_sent = longest_sent.replace(',', ', ').replace('.', '.\n').split('\n')

# Check number of sentences from chunking longest sentence
print('Chunked longest sentence:', len(long_sent))

# Remove the longest sentence from the dataset
del data_split[longest_idx]

# Add chunked sentences back to dataset
data_split.extend(long_sent)

# Check length of dataset after addition
print('Post number of sentences:', len(data_split))
print('\n')

# Remove empty rows
# NOTE(review): 238003 is a hard-coded row count - presumably the dataset size once the
# trailing empty chunk(s) are dropped; confirm it still holds if the data file changes
data_split = data_split[:238003]

# Check last sentence of dataset
print('Last Sentence', data_split[-1])
print('\n')

# Get corresponding labels for dataset (labels must be derived before punctuation is stripped)
process_labels = [get_labels(seq) for seq in data_split]
process_labels = [' '.join(seq) for seq in process_labels]

# Remove all punctuations from dataset
sequences = [seq.translate(table) for seq in data_split]

# Combined sentences back into a single piece for Counter
combined_sequences = ' '.join(sequences)

# Check if there are additional characters to remove
print(Counter(combined_sequences))
print('\n')

# Get all words in the dataset
words = combined_sequences.split()

# Save inputs and labels for reference
with open('./data/processed/processed_input', 'w', encoding='utf-8') as f:
    f.writelines(x + '\n' for x in sequences)
with open('./data/processed/processed_labels', 'w', encoding='utf-8') as f:
    f.writelines(x + '\n' for x in process_labels)

# Check number of sequences and labels
print('Number of sequences: \t{}'.format(len(sequences)))
print('Number of labels: \t{}'.format(len(process_labels)))

# Load processed labels (the context manager closes the file handle once it is read)
with open('./data/processed/processed_labels', 'r', encoding='utf-8') as f:
    y_labels = f.read()
# Every line ends with '\n', so the final split element is an empty string and is dropped
y_labels = y_labels.split('\n')
y_labels = y_labels[:-1]
all_labels = ' '.join(y_labels)

# Get all labels in the dataset
labels_tag = all_labels.split()

Pre number of sentences: 237986


(185073, '  and this is more fun.so this last one is called "the sunshine kid."thank you very much for listening.old man sunshine was proud of his sun,and it brightened his day to see his little boy run,not because of what he’d done, nor the problems overcome,but that despite that his disposition remained a sunny one.it hadn’t always been like this.there’d been times when he’d tried to hide his brightness,you see, every star hits periods of hardship,it takes a brighter light to inspire them through the darkness.if we go back to when he was born in a nebula,we know that he never was thought of as regular,because he had a flair about him,to say the midas touch is wrongbut all he went near seemed to turn a little bronze,yes this sun was loved by some more than others,it was a case of joseph and his dreamcoat and his brothersbecause standing out from the crowd had its pros and its cons,and jealousy created enemies in those he outshonesuch as the shadow peo

#### Build words and labels vocabularies and store them as json dictionaries

In [6]:
# Build words vocab, most frequent word first
all_data = ' '.join(sequences)
words = all_data.split()
words_in_vocab = Counter(words)
vocab = sorted(words_in_vocab, key=words_in_vocab.get, reverse=True)

# Word ids start at 2 because ids 0 and 1 are reserved for the special tokens below
vocab_to_int = dict(zip(vocab, itertools.count(2)))
vocab_to_int['<pad>'] = 0  # The special value used for padding
vocab_to_int['<oov>'] = 1  # The special value used for OOVs

# Check number of unique words (special tokens included)
unique_vocab = len(vocab_to_int)
print('Number of unique words:', unique_vocab)
print('\n')

# Build labels vocab, most frequent label first; id 0 is reserved for padding
labels_in_vocab = Counter(labels_tag)
labels_vocab = sorted(labels_in_vocab, key=labels_in_vocab.get, reverse=True)
label_to_int = dict(zip(labels_vocab, itertools.count(1)))
label_to_int['<pad>'] = 0  # The special value used to padding

# Write vocab and label dictionaries to file
for json_path, mapping in (('./vocabs.json', vocab_to_int), ('./labels.json', label_to_int)):
    with open(json_path, 'w', encoding='utf-8') as json_file:
        json.dump(mapping, json_file, indent=4)

# Check label classes distribution
no_classes = len(label_to_int)
print('Class distribution:', labels_in_vocab)
print('\n')

# Check number of unique labels (padding label included)
print('Number of unique labels:', no_classes)
print(label_to_int)

Number of unique words: 104910


Class distribution: Counter({'<na>': 4475054, '<comma>': 360733, '<period>': 294389, '<question>': 26054, '<exclaim>': 2330, '<3-dots>': 1394})


Number of unique labels: 7
{'<3-dots>': 6, '<pad>': 0, '<question>': 4, '<exclaim>': 5, '<na>': 1, '<period>': 3, '<comma>': 2}


#### Tokenize the sequences and their corresponding labels. Pad each sequence and its labels to maximum length

In [7]:
# Tokenize input sequences: every word is replaced by its vocabulary id
seq_int = [[vocab_to_int[word] for word in seq.split()] for seq in sequences]

# Pad input sequences at the end with the padding id 0
pad_seq = pad_sequences(sequences=seq_int, maxlen=max_seq_len, padding='post', value=0)

# Check sample sequence
print('Sample sequence:', sequences[-1])
print('\n')
print('Sample sequence:', pad_seq[-1])
print('\n')

# Tokenize output labels: every label is replaced by its label id
lab_int = [[label_to_int[word] for word in lab.split()] for lab in y_labels]

# Pad the labels the same way as the inputs, then one-hot encode every position
pad_labels = pad_sequences(sequences=lab_int, maxlen=max_seq_len, padding='post', value=0)
encoded_labels = [to_categorical(row, num_classes=no_classes) for row in pad_labels]

# Check sample label
print('Sample label:', pad_labels[-1])
print('\n')
print('Encoded label', encoded_labels[-1])

# Check max seq length
print("Maximum sequence length: {}".format(max_seq_len))

# Check that all sequences and labels are at max sequence length
assert len(pad_seq)==len(seq_int)
assert len(pad_seq[0])==max_seq_len

assert len(pad_labels)==len(lab_int)
assert len(pad_labels[0])==max_seq_len
print('Sequence and labels length check passed!')

Sample sequence:  twentyfive years ago  scientists at cern created the world wide web


Sample sequence: [14536    84   197   648    31  9964   501     2    81  1928   949     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0]


Sample label: [1 1 2 1 1 1 1 1 1 1 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0

#### Split dataset into training and testing sets

In [8]:
# Split train and label dataset
# Fraction of the padded dataset kept for training/validation, the remainder is the test set
train_test_split_frac = 0.8
# Derive the cut-off from the fraction above instead of repeating the literal
split_index = int(train_test_split_frac * len(pad_seq))

# Split data into training/validation and test data (features and labels, x and y)
# The data is cut in file order, no shuffling is done here
train_val_x, test_x = pad_seq[:split_index], pad_seq[split_index:]
train_val_y, test_y = encoded_labels[:split_index], encoded_labels[split_index:]

# print out the shapes of your resultant feature data
print('Training/Validation Dataset: \t{}'.format(train_val_x.shape), len(train_val_y))
print('Testing Dataset: \t\t{}'.format(test_x.shape), len(test_y))

Training/Validation Dataset: 	(190402, 128) 190402
Testing Dataset: 		(47601, 128) 47601


#### Load and process GloVe pretrained word vectors

In [10]:
# Load glove pre-trained vectors
# Each line holds a word followed by its vector components, separated by whitespace
glove_index = dict()
# The context manager closes the file even if a line fails to parse
with open('./data/embeddings/glove.6B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        glove_index[word] = coefs
print('{} word vectors'.format(len(glove_index)))

# Build the embedding matrix: row i holds the vector of the word whose id is i.
# Words without a pre-trained vector keep an all-zero row
embed_matrix = np.zeros((unique_vocab, embed_dim))
for word, i in vocab_to_int.items():
    embedding_vector = glove_index.get(word)
    if embedding_vector is not None:
        embed_matrix[i] = embedding_vector

400000 word vectors


#### Function to create model

In [11]:
def cnn_lstm(max_seq_len, unique_vocab, embed_dim, embed_matrix, filter_sizes, kernels, kernel_weight, bias):
    '''
    Description:
        - Builds and compiles the CNN+BiLSTM tagger: embedding, parallel Conv1D branches,
          a Dense layer, a bidirectional LSTM and a per-token softmax
        - Besides its arguments the function reads the notebook-level settings
          drop_prob, lstm_hidden, lstm_hidden_2, no_classes and adam_lr

    Args(They can be defined earlier at the top of the notebook):
        - max_seq_len: Maximum sequence length
        - unique_vocab: Number of unique words
        - embed_dim: Embedding layer dimension, needs to match with that of Glove pre-trained
        - embed_matrix: Pre-trained weights extracted from Glove based on unique words
        - filter_sizes: Number of filters per CNN layer
        - kernels: Kernel sizes per CNN layer (indexed in step with filter_sizes)
        - kernel_weight: Weights initialization for CNN layers
        - bias: Bias initialization for CNN layers

    Returns:
        - Compiled model
    '''
    token_ids = Input(shape=(max_seq_len,))

    # Embedding layer initialised from the glove weights and fine-tuned during training
    embedding_layer = Embedding(input_dim=unique_vocab, output_dim=embed_dim, weights=[embed_matrix],
                                input_length=max_seq_len, trainable=True)
    features = embedding_layer(token_ids)
    features = Dropout(rate=drop_prob, seed=50)(features)

    # One Conv1D + BatchNormalization branch per filter/kernel pair, all fed by the same
    # embeddings; 'SAME' padding with stride 1 keeps the sequence length unchanged
    branches = []
    for idx, n_filters in enumerate(filter_sizes):
        conv = Conv1D(filters=n_filters, kernel_size=kernels[idx], strides=1, padding='SAME',
                      activation='relu', kernel_initializer=kernel_weight, bias_initializer=bias,
                      kernel_regularizer=None)(features)
        branches.append(BatchNormalization()(conv))

    # Stack the branch outputs along the channel axis
    merged = concatenate(branches, axis=-1)
    merged = Dropout(rate=drop_prob, seed=50)(merged)
    merged = Reshape((-1, np.sum(filter_sizes)))(merged)

    hidden = Dense(lstm_hidden, activation='relu')(merged)
    hidden = Dropout(rate=drop_prob, seed=50)(hidden)

    # return_sequences=True keeps one output per token for the tagging head
    recurrent = Bidirectional(LSTM(lstm_hidden_2, return_sequences=True))(hidden)
    recurrent = Dropout(rate=drop_prob, seed=50)(recurrent)

    tag_probs = TimeDistributed(Dense(no_classes, activation='softmax'))(recurrent)

    model = Model(inputs=[token_ids], outputs=[tag_probs])
    model.compile(loss='categorical_crossentropy', optimizer=Adam(adam_lr),
                  metrics=['accuracy'])

    return model

#### Initialize and train model

In [12]:
# Build the model from the notebook-level hyperparameters
model = cnn_lstm(max_seq_len, unique_vocab, embed_dim, embed_matrix,
                 filter_sizes, kernels, kernel_weight, bias)

# Summarize model
model.summary()

# Training options: part of the training data is held out for validation, progress is
# logged to TensorBoard and training stops early once the monitored loss stops improving
fit_options = dict(batch_size=batch_size, epochs=epochs, validation_split=valid_split,
                   steps_per_epoch=None, validation_steps=None,
                   shuffle=True, verbose=1, callbacks=[tensor_b, early_s])

# Fit, train and evaluate model (the list of one-hot label arrays is stacked into one array)
model.fit(train_val_x, np.array(train_val_y), **fit_options)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 128)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 128, 300)     31473000    input_1[0][0]                    
__________________________________________________________________________________________________
dropout_1 (Dropout)             (None, 128, 300)     0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 128, 32)      28832       dropout_1[0][0]                  
__________________________________________________________________________________________________
conv1d_2 (

KeyboardInterrupt: 

#### Save model architecture and model

In [None]:
# Draw the model architecture, including layer shapes, to model.png
plot_model(model, to_file='model.png', show_shapes=True)
# Save the trained model to cnn_lstm_model.h5
model.save('cnn_lstm_model.h5')

#### Load a sample test data, make a prediction from it and print the results

In [None]:
# Load a sample of test data
test_data = test_x[11111]

# Invert both vocabularies once so every id is decoded with a single dict lookup
# (ids are unique, so this matches scanning all dictionary items for each token)
int_to_vocab = {index: word for word, index in vocab_to_int.items()}
int_to_label = {index: label for label, index in label_to_int.items()}

# Restore tokenized test data back to normal sentence
pred_x_seq = [int_to_vocab[int(x)] for x in test_data]

# Get predicted output of test data (Make predictions)
pred_expand = model.predict(np.expand_dims(test_data, axis=0))

# Retrieve position of highest probability from predictions
pred_y = [np.argmax(y, axis=1) for y in pred_expand]
print('Predictions Index:')
print(pred_y)

# Restore tokenized labels
pred_y_seq = [int_to_label[int(y)] for x in pred_y for y in x]

# Punctuation mark appended for each predicted label ('<na>' and '<pad>' add nothing)
label_to_punct = {
    '<comma>': ',',
    '<period>': '.',
    '<question>': '?',
    '<exclaim>': '!',
    '<3-dots>': '...',
}
# Marks after which the next word starts a new sentence
sentence_enders = {'.', '?', '!', '...'}

# Restore punctuations and capitalization
# Capitalization is decided per word while the output is built: the whole string is never
# passed through str.capitalize(), which would lower-case every later character again,
# and there is no look-ahead to the next element, so the final word cannot raise an IndexError
combined = []
capitalize_next = True  # The first word of the output starts a sentence
for word, label in zip(pred_x_seq, pred_y_seq):
    # Padding positions carry no text and are left out of the restored sentence
    if word == '<pad>':
        continue
    word = str(word)
    if word == 'i':
        word = 'I'
    elif word == 'ive':
        # Only the standalone token is rewritten, words such as 'give' stay untouched
        word = "I've"
    elif capitalize_next:
        word = word[:1].upper() + word[1:]
    mark = label_to_punct.get(label, '')
    combined.append(word + mark)
    capitalize_next = mark in sentence_enders

# Join predicted words back into a sequence
combined = ' '.join(combined)

print('\n')
print('Prediction sequence:')
print(' '.join(pred_x_seq))
print('\n')
print('Prediction output:')
print(' '.join(pred_y_seq))
print('\n')
print('Combined prediction:')
print(combined)

#### Create Confusion Matrix and Classification Report to check model performance

In [None]:
# Create confusion matrix and classification report
def _flat_argmax(batch):
    # Class id with the highest score for every token, flattened over all sequences
    return np.concatenate([np.argmax(item, axis=1) for item in batch], axis=0)

# Predicted class ids for every token position of the test set
for_report = model.predict(test_x)
out_pred = _flat_argmax(for_report)

# True class ids, recovered from the one-hot encoded test labels
y_ = _flat_argmax(test_y)

cm = confusion_matrix(y_true=y_, y_pred=out_pred)
print(cm)

# Text version of the report for reading
cr = classification_report(y_true=y_, y_pred=out_pred)
print(cr)

# Dictionary version of the same report, kept for computing the averages
overall = classification_report(y_true=y_, y_pred=out_pred, output_dict=True)

plt.figure()
plot_confusion_matrix(cm, classes=class_names, normalize=True, title='Normalized Confusion Matrix')
plt.show()

#### Check the average Precision, Recall, and F1-Score of different classes

In [None]:
# Report keys of the punctuation classes, in the order
# comma, period, question, exclaim, 3-dots (label ids 2 to 6)
punct_keys = [str(i) for i in range(2, 7)]

precision = [overall[key]['precision'] for key in punct_keys]
recall = [overall[key]['recall'] for key in punct_keys]
f1 = [overall[key]['f1-score'] for key in punct_keys]

# Average over the first three classes only (comma, period, question)
for template, scores in (
    ('Comma, Period, Question Precision:\t{:.3f}', precision),
    ('Comma, Period, Question Recall:\t\t{:.3f}', recall),
    ('Comma, Period, Question F1-Score:\t{:.3f}', f1),
):
    print(template.format(np.average(scores[:3])))
print('\n')

# Average over all five punctuation classes
for template, scores in (
    ('Overall Precision:\t{:.3f}', precision),
    ('Overall Recall:\t\t{:.3f}', recall),
    ('Overall F1-Score:\t{:.3f}', f1),
):
    print(template.format(np.average(scores)))