In [1]:
import keras, string, random, datetime, numpy as np, matplotlib.pyplot as plt
import tensorflow as tf
from string import punctuation
from collections import Counter
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from keras import backend as K
K.tensorflow_backend._get_available_gpus()
from keras.callbacks import TensorBoard
from keras.models import Sequential
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.optimizers import Adam
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Dropout, LSTM, InputLayer, Bidirectional, TimeDistributed, Activation

Using TensorFlow backend.


In [2]:
def chunk_seq(seq, chunk_len):
    """Cut a sliceable sequence into consecutive pieces of ``chunk_len`` items.

    Args:
        seq: Any object supporting ``len()`` and slicing (list, tuple, str...).
        chunk_len: Number of items per piece; also used as the ``range`` step.

    Returns:
        A list of slices of ``seq`` in their original order. The final slice
        is shorter when ``len(seq)`` is not a multiple of ``chunk_len``.
    """
    starts = range(0, len(seq), chunk_len)
    return [seq[start:start + chunk_len] for start in starts]

def get_labels(seq):
    """Map every whitespace-separated token of ``seq`` to a punctuation tag.

    Args:
        seq: A string of tokens separated by whitespace.

    Returns:
        A list with one tag per token: ``'<comma>'`` when the token contains
        a comma, otherwise ``'<period>'`` when it contains a full stop,
        otherwise ``'<na>'``.
    """
    def tag_for(token):
        # The comma check comes first, so a token holding both marks is a comma.
        if ',' in token:
            return '<comma>'
        if '.' in token:
            return '<period>'
        return '<na>'

    return [tag_for(token) for token in seq.split()]

def ignore_class_accuracy(to_ignore=0):
    """Build a Keras metric that scores accuracy while skipping one class.

    Args:
        to_ignore: Integer class index whose positions are excluded from the
            score (defaults to 0).

    Returns:
        A function ``ignore_accuracy(y_true, y_pred)`` suitable for the
        ``metrics`` list of ``model.compile``. Both arguments are reduced with
        ``argmax`` over their last axis before being compared.
    """
    def ignore_accuracy(y_true, y_pred):
        # Collapse the class axis to integer class indices.
        y_true_class = K.argmax(y_true, axis=-1)
        y_pred_class = K.argmax(y_pred, axis=-1)

        # Mask on the ground truth, not on the prediction: a position is
        # ignored because its *true* class is `to_ignore`. Masking on the
        # prediction would count ignored positions predicted as another class
        # as errors and would drop real positions predicted as `to_ignore`.
        ignore_mask = K.cast(K.not_equal(y_true_class, to_ignore), 'int32')
        matches = K.cast(K.equal(y_true_class, y_pred_class), 'int32') * ignore_mask
        # Clamp the denominator to 1 so a batch made only of ignored positions
        # yields 0 instead of dividing by zero.
        accuracy = K.sum(matches) / K.maximum(K.sum(ignore_mask), 1)
        return accuracy
    return ignore_accuracy

In [3]:
# Set misc parameters
# Tag the TensorBoard log directory with the current day and month.
# The previous format string '%dd-%%mm' expanded to e.g. '14d-%mm'
# (a literal 'd', then an escaped '%' followed by 'mm') instead of '14-03'.
current = datetime.datetime.now()
date = current.strftime('%d-%m')
tb = TensorBoard(log_dir='./tf_logs/{}'.format(date), batch_size=64, write_graph=True, histogram_freq=0)

# Look-up table to remove punctuations from data
# (str.translate deletes every character listed in string.punctuation)
table = str.maketrans('', '', punctuation)

# Set max sequence length: number of words per chunk and the padded length
max_seq_len = 30

In [4]:
# Load and process input/label data
# Read through a context manager so the file handle is closed deterministically.
with open('./data/processed/ted_data', 'r', encoding='utf-8') as f:
    data = f.read()
data = data.lower()
data_split = data.split('\n')
all_data = ' '.join(data_split)
words = all_data.split()

# Chunk sequence: fixed-size groups of max_seq_len words (last one may be shorter)
x = chunk_seq(words, max_seq_len)
sequences = [' '.join(seq) for seq in x]

# Get sequence labels (must happen before punctuation is stripped, because the
# labels are derived from the commas/periods still attached to each word)
process_labels = [get_labels(seq) for seq in sequences]
process_labels = [' '.join(seq) for seq in process_labels]

# Remove punctuations
# NOTE(review): a whitespace-separated token made up only of characters in
# `punctuation` becomes empty here yet still has an entry in process_labels,
# so the word count and label count of that line could diverge -- confirm
# whether the corpus contains such tokens.
sequences = [seq.translate(table) for seq in sequences]

# Persist one sequence / one label string per line.
# Distinct loop names keep the chunk list `x` from being overwritten.
with open('./processed_input', 'w', encoding='utf-8') as f:
    for sequence_line in sequences:
        f.write(sequence_line+'\n')

with open('./processed_labels', 'w', encoding='utf-8') as f:
    for label_line in process_labels:
        f.write(label_line+'\n')

# Check number of sequences and labels
print('Number of sequences: \t{}'.format(len(sequences)))
print('Number of labels: \t{}'.format(len(process_labels)))

# Read the labels back from disk; the trailing newline written above leaves an
# empty last element after the split, which is dropped.
with open('./processed_labels', 'r', encoding='utf-8') as f:
    y_labels = f.read()
y_labels = y_labels.split('\n')
y_labels = y_labels[:-1]
all_labels = ' '.join(y_labels)
labels_tag = all_labels.split()

Number of sequences: 	167697
Number of labels: 	167697


In [5]:
# Build words vocab: every distinct word, ordered from most to least frequent
all_data = ' '.join(sequences)
words = all_data.split()
words_in_vocab = Counter(words)
vocab = [word for word, _ in words_in_vocab.most_common()]

# Word ids start at 2 because ids 0 and 1 are reserved for the special
# tokens below (no word is skipped).
vocab_to_int = dict((word, index) for index, word in enumerate(vocab, start=2))
vocab_to_int['-PAD-'] = 0  # The special value used for padding
vocab_to_int['-OOV-'] = 1  # The special value used for OOVs
unique_vocab = len(vocab_to_int)
print('Number of unique words:', unique_vocab)

Number of unique words: 108807


In [6]:
# Build labels vocab: tags ordered from most to least frequent, ids start at 1
labels_in_vocab = Counter(labels_tag)
labels_vocab = [tag for tag, _ in labels_in_vocab.most_common()]
label_to_int = dict((tag, index) for index, tag in enumerate(labels_vocab, start=1))
label_to_int['-PAD-'] = 0  # The special value used for padding

# Check labels
no_classes = len(label_to_int)  # includes the padding class
print('Class distribution:', labels_in_vocab)
print('Number of unique labels:', no_classes)
print(label_to_int)

Class distribution: Counter({'<na>': 4387594, '<comma>': 354651, '<period>': 288658})
Number of unique labels: 4
{'<na>': 1, '<comma>': 2, '<period>': 3, '-PAD-': 0}


In [7]:
# Tokenize input sequences: replace each word with its vocabulary id
seq_int = [[vocab_to_int[word] for word in sequence.split()] for sequence in sequences]

# Pad input sequences on the right with the padding id (0) up to max_seq_len
pad_seq = pad_sequences(sequences=seq_int, maxlen=max_seq_len, padding='post', value=0)

# Check sample sequence
print('Sample sequence:', sequences[-1])
print('Sample sequence:', pad_seq[-1])

# Tokenize output labels: replace each tag with its label id
lab_int = [[label_to_int[tag] for tag in label_line.split()] for label_line in y_labels]

# Pad output labels the same way, then one-hot encode each padded row
pad_labels = pad_sequences(sequences=lab_int, maxlen=max_seq_len, padding='post', value=0)
encoded_labels = [to_categorical(row, num_classes=no_classes) for row in pad_labels]

# Check sample label
print('Sample label:', pad_labels[-1])
print('Encoded label', encoded_labels[-1])

# Check max seq length
print("Maximum sequence length: {}".format(max_seq_len))

Sample sequence: day so it might be that this is not just a game it might be a way to decide our own fatethank youapplause
Sample sequence: [   142     17     13    170     29      7     14     10     31     46
      6    526     13    170     29      6     86      4   1222     42
    160 108806   9653      0      0      0      0      0      0      0]
Sample label: [3 1 1 1 1 1 1 1 1 1 1 3 1 1 1 1 1 1 1 1 1 3 3 0 0 0 0 0 0 0]
Encoded label [[0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 1. 0. 0.]
 [0. 0. 0. 1.]
 [0. 0. 0. 1.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]
 [1. 0. 0. 0.]]
Maximum sequence length: 30


In [8]:
# Sanity-check padding for inputs and labels alike: padding must not change
# the number of rows, and the first row must be exactly max_seq_len long.
for padded, unpadded in ((pad_seq, seq_int), (pad_labels, lab_int)):
    assert len(padded)==len(unpadded)
    assert len(padded[0])==max_seq_len

In [9]:
# Split train and test dataset
# The fraction is applied through the named setting below; previously the
# variable was defined but a separate hard-coded 0.8 was used for the split.
train_test_split_frac = 0.8
split_index = int(train_test_split_frac*len(pad_seq))

# Split data into training/validation and test data (features and labels, x and y).
# The split is positional: the first `split_index` rows train, the rest test.
train_val_x, test_x = pad_seq[:split_index], pad_seq[split_index:]
train_val_y, test_y = encoded_labels[:split_index], encoded_labels[split_index:]

# print out the shapes of your resultant feature data
print('Training/Validation Dataset: \t{}'.format(train_val_x.shape), len(train_val_y))
print('Testing Dataset: \t\t{}'.format(test_x.shape), len(test_y))

Training/Validation Dataset: 	(134157, 30) 134157
Testing Dataset: 		(33540, 30) 33540


In [10]:
# Model code
# Per-token tagger: embedding -> two same-padded 1D convolutions ->
# bidirectional LSTM -> softmax over the label classes at every time step.
tagger_layers = [
    Embedding(input_dim=unique_vocab, output_dim=128, input_length=max_seq_len),
    Conv1D(filters=64, kernel_size=3, padding='SAME'),
    Conv1D(filters=128, kernel_size=3, padding="SAME"),
    Bidirectional(LSTM(256, return_sequences=True)),
    TimeDistributed(Dense(no_classes, activation='softmax')),
]
model = Sequential(tagger_layers)

# ignore_class_accuracy(0) is available as an extra metric but is not enabled.
model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(0.001),
    metrics=['accuracy'],
)
model.summary()

# 30% of the training/validation rows are held out for validation by Keras.
fit_options = dict(
    batch_size=64,
    epochs=2,
    validation_split=0.3,
    shuffle=True,
    verbose=1,
    callbacks=[tb],
)
model.fit(x=train_val_x, y=np.array(train_val_y), **fit_options)

# print('Saving Model')
# model.save('model.h5')
# print('Done')

# scores = model.evaluate(x=test_x, y=np.array(test_y), verbose=1)
# print('Accuracy: {}'.format(scores[1] * 100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 30, 128)           13927296  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 30, 64)            24640     
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 30, 128)           24704     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 512)           788480    
_________________________________________________________________
time_distributed_1 (TimeDist (None, 30, 4)             2052      
Total params: 14,767,172
Trainable params: 14,767,172
Non-trainable params: 0
_________________________________________________________________
Train on 93909 samples, validate on 40248 samples
Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x1ec72c6c3c8>

In [11]:
# Make prediction on a single sequence
# Build the id -> token tables once. The previous version scanned the whole
# vocabulary dictionary for every token, which is O(tokens * vocab size).
int_to_vocab = {index: word for word, index in vocab_to_int.items()}
int_to_label = {index: label for label, index in label_to_int.items()}

# Sequence to predict
test_data = test_x[498]
pred_x_seq = [int_to_vocab[index] for index in test_data]

# Predicted output
# model.predict expects a batch, so add a leading batch axis of size 1.
pred_expand = model.predict(np.expand_dims(test_data, axis=0))
# One array of per-position class ids for each item of the batch.
pred_y = [np.argmax(scores, axis=1) for scores in pred_expand]
print('Predictions Index:')
print(pred_y)

pred_y_seq = [int_to_label[index] for row in pred_y for index in row]

print('Prediction sequence:')
print(' '.join(pred_x_seq))
print('Prediction output:')
print(' '.join(pred_y_seq))

Predictions Index:
[array([3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1], dtype=int64)]
Prediction sequence:
religion the words haram — meaning religiously prohibited — and aib — meaning culturally inappropriate — were exchanged carelessly as if they meant the same thing and had the same
Prediction output:
<period> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na> <na>


In [27]:
# # WIP for CM and CR
# for_report = model.predict(test_x)
# fr_ = np.array([np.argmax(x, axis=1) for x in for_report])
# print(fr_.shape)

# y_ = np.array([np.argmax(x, axis=1) for x in test_y])
# print(y_.shape)

# cm = tf.confusion_matrix(labels=y_[50], predictions=fr_[50], num_classes=4)
# # print('Classification Report:')
# # # print(cr)
# # print('Confusion Matrix:')
# # print(cm)

(33540, 30)
(33540, 30)


In [39]:
# Flatten predictions and ground truth into 1-D arrays of class ids
# (one entry per sequence position) for the scikit-learn reports.
# assumes model.predict returns a single (samples, positions, classes) array
for_report = model.predict(test_x)
fr_ = np.argmax(for_report, axis=-1).reshape(-1)

# test_y is a list of equally shaped one-hot matrices; stack, then reduce.
y_ = np.argmax(np.asarray(test_y), axis=-1).reshape(-1)

In [41]:
# Confusion matrix over all test positions (scikit-learn convention:
# rows are true classes, columns are predicted classes).
gggg = confusion_matrix(y_true=y_, y_pred=fr_)
gggg

array([[     0,      7,      0,      0],
       [     0, 860577,  10728,  10566],
       [     0,  28932,  23236,  15730],
       [     0,  16558,   5660,  34206]], dtype=int64)

In [44]:
# classification_report returns a pre-formatted multi-line string. Print it so
# the table renders with real line breaks; a bare expression shows the string
# repr with literal "\n" escapes.
hhh = classification_report(y_, fr_)
print(hhh)

  'precision', 'predicted', average, warn_for)


'              precision    recall  f1-score   support\n\n           0       0.00      0.00      0.00         7\n           1       0.95      0.98      0.96    881871\n           2       0.59      0.34      0.43     67898\n           3       0.57      0.61      0.59     56424\n\n   micro avg       0.91      0.91      0.91   1006200\n   macro avg       0.53      0.48      0.49   1006200\nweighted avg       0.90      0.91      0.91   1006200\n'