# CNN for Author Detection

> This notebook contains an implementation of a CNN model (VDCNN) for author detection.

### Imports and Constants

In [0]:
import nltk
# Download NLTK's 'punkt' resource.
# NOTE(review): `word_tokenize` is imported further down, but its only call in
# `preprocessing` is commented out — this download may no longer be needed; confirm.
nltk.download('punkt')

In [0]:
%matplotlib inline

import os, glob, re

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sn

import pandas as pd
import numpy as np

import tensorflow as tf
from tensorflow.keras.utils import to_categorical, get_source_inputs
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Layer, InputSpec, Input, Dense, Embedding, Flatten, BatchNormalization, Activation, Conv1D, Add, MaxPooling1D, ThresholdedReLU, Convolution1D, Dropout
from tensorflow.keras.optimizers import SGD
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, ReduceLROnPlateau

from sklearn.metrics import confusion_matrix, classification_report

from nltk import word_tokenize

In [0]:
# Switch between the Colab layout (project folder under ./drive/My Drive) and a local checkout.
isOnColab = True

# Root folder that contains the `dataset/` and `models/` sub-folders.
DIR_PREFIX = './drive/My Drive/MSCS/S4/NLP/project/' if isOnColab else './'

DATASET_DIR = DIR_PREFIX + 'dataset/'
MODEL_SAVE_DIR = DIR_PREFIX + 'models/'

# One CSV per split, all stored in DATASET_DIR.
TRAIN_FILE, VAL_FILE, TEST_FILE = (
  DATASET_DIR + csv_name for csv_name in ('train.csv', 'val.csv', 'test.csv')
)

# Samples are truncated / padded to this many characters.
MAX_SENTENCE_LENGTH = 100
# Width of the learned character embedding vectors.
EMBEDDING_DIM = 100

### Load Dataset

In [0]:
def load_dataset(file_name):
  """Read one split from a comma-separated file.

  Args:
    file_name: path (or file-like object) accepted by `pd.read_csv`; the file
      must provide a `Text` and a `Label` column.

  Returns:
    A `(texts, labels)` tuple holding the `Text` and `Label` columns as Series.
  """
  frame = pd.read_csv(file_name, sep=',')
  texts = frame['Text']
  labels = frame['Label']
  return texts, labels

In [0]:
# Load the three splits as (texts, labels) pairs.
raw_train_X, raw_train_Y = load_dataset(TRAIN_FILE)
raw_val_X, raw_val_Y = load_dataset(VAL_FILE)
raw_test_X, raw_test_Y = load_dataset(TEST_FILE)

# Report the number of texts and labels in every split.
print('train_X: {0}\ttrain_Y: {1}'.format(*map(len, (raw_train_X, raw_train_Y))))
print('val_X: {0}\tval_Y: {1}'.format(*map(len, (raw_val_X, raw_val_Y))))
print('test_X: {0}\ttest_Y: {1}'.format(*map(len, (raw_test_X, raw_test_Y))))

In [0]:
# Class balance of the training split: number of samples per label, largest first.
# `y=` is not an axis label for Series.plot, so the y-axis label is set explicitly.
plot = raw_train_Y.value_counts().sort_values(ascending=False).plot(kind='bar', title='# of tweets per politician')
plot.set_ylabel('# of tweets');

### Preprocess Data

In [0]:
def preprocessing(data):
    """Clip every sample to at most MAX_SENTENCE_LENGTH leading elements.

    Args:
        data: iterable of sliceable samples (the notebook passes the raw text column).

    Returns:
        A new list with one truncated sample per input sample, in input order.
    """
    # Slicing leaves samples shorter than the limit untouched.
    return [sample[:MAX_SENTENCE_LENGTH] for sample in data]

In [0]:
# Run the same preprocessing over the train / validation / test texts.
processed_train_X, processed_val_X, processed_test_X = (
    preprocessing(split) for split in (raw_train_X, raw_val_X, raw_test_X)
)

In [0]:
# create one-hot labels
# Map every label seen in the training split to an integer class id
# (ids follow first-appearance order in raw_train_Y).
labels_dict = {label: index for index, label in enumerate(raw_train_Y.unique())}
num_classes = len(labels_dict)

# replace string labels with corresponding numeric value from labels_dict
# (a label that never occurs in the training split raises KeyError here)
train_Y = raw_train_Y.apply(lambda x: labels_dict[x])
val_Y = raw_val_Y.apply(lambda x: labels_dict[x])
test_Y = raw_test_Y.apply(lambda x: labels_dict[x])

# convert to one-hot encoding; num_classes is pinned so all three splits get the
# same width even when val/test happen to lack the highest class id
train_Y = to_categorical(train_Y, num_classes=num_classes)
val_Y = to_categorical(val_Y, num_classes=num_classes)
test_Y = to_categorical(test_Y, num_classes=num_classes)

# show the shapes instead of dumping the full arrays
train_Y.shape, val_Y.shape, test_Y.shape

In [0]:
# Character-level tokenizer, fitted on the training texts only;
# '[UNK]' is registered as the out-of-vocabulary token.
tokenizer = Tokenizer(char_level=True, oov_token='[UNK]')
tokenizer.fit_on_texts(processed_train_X)

# Summary of what the tokenizer learned.
for caption, value in (
    ('char counts:', tokenizer.word_counts),
    ('document_count:', tokenizer.document_count),
    ('vocab_size:', len(tokenizer.word_index)),
):
  print(caption, value)

In [0]:
# Encode every split as integer id sequences with the fitted tokenizer.
train_X, val_X, test_X = (
    tokenizer.texts_to_sequences(texts)
    for texts in (processed_train_X, processed_val_X, processed_test_X)
)

# Peek at the first encoded sample: its length, then the sequence itself.
print(len(train_X[0]), train_X[:1])

In [0]:
# pad sequences
# Bring every split to a fixed width of MAX_SENTENCE_LENGTH, padding at the end ('post').
train_X, val_X, test_X = (
    pad_sequences(sequences, maxlen=MAX_SENTENCE_LENGTH, padding='post')
    for sequences in (train_X, val_X, test_X)
)

# first three padded training samples
print(train_X[:3])

In [0]:
# Sanity check: shape of a single padded sample. pad_sequences was called with
# maxlen=MAX_SENTENCE_LENGTH, so this is expected to be (MAX_SENTENCE_LENGTH,).
train_X[0].shape

### VDCNN - Very Deep CNNs

#### Model

Reference: https://github.com/zonetrooper32/VDCNN

In [0]:
class KMaxPooling(Layer):
    """
    K-max pooling layer that extracts the k-highest activations from a sequence (2nd dimension).
    TensorFlow backend.

    Input:  3-D tensor (enforced through InputSpec(ndim=3)); the pooling runs over axis 1.
    Output: tensor with axis 1 reduced to length `k`, other axes unchanged.

    Args:
        k: number of activations kept per channel.
        sorted: forwarded to `tf.nn.top_k`; when True the kept values are ordered
            by magnitude (descending), not by their original position.
        **kwargs: forwarded to the base `Layer` (e.g. `name`).
    """
    def __init__(self, k=1, sorted=True, **kwargs):
        super().__init__(**kwargs)
        self.input_spec = InputSpec(ndim=3)
        self.k = k
        self.sorted = sorted

    def compute_output_shape(self, input_shape):
        """Same as the input shape, with axis 1 replaced by `k`."""
        return (input_shape[0], self.k, input_shape[2])

    def call(self, inputs):
        """Return the `k` largest values along axis 1 for every channel."""
        # swap last two dimensions since top_k will be applied along the last dimension
        shifted_inputs = tf.transpose(inputs, [0, 2, 1])
        
        # extract top_k, returns two tensors [values, indices]; only the values are kept
        top_k = tf.nn.top_k(shifted_inputs, k=self.k, sorted=self.sorted)[0]
        
        # swap the axes back so the pooled axis is axis 1 again
        return tf.transpose(top_k, [0, 2, 1])

    def get_config(self):
        """Expose the constructor arguments so the layer can be serialized and rebuilt."""
        config = super().get_config()
        config.update({'k': self.k, 'sorted': self.sorted})
        return config

In [0]:
def identity_block(inputs, filters, kernel_size=3, use_bias=False, shortcut=False):
    """Two Conv1D -> BatchNormalization layers that keep the sequence length.

    Args:
        inputs: input tensor of the block.
        filters: number of filters of both convolutions.
        kernel_size: kernel size of both convolutions.
        use_bias: forwarded to both Conv1D layers (previously accepted but ignored).
            Each convolution is followed by BatchNormalization, which has its own
            learned offset, so the default of False loses nothing.
        shortcut: when True, `inputs` is added to the block output before the
            final ReLU; this requires `inputs` to have `filters` channels.

    Returns:
        Output tensor of the final ReLU activation.
    """
    conv1 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same',
                   use_bias=use_bias)(inputs)
    bn1 = BatchNormalization()(conv1)
    relu = Activation('relu')(bn1)
    conv2 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same',
                   use_bias=use_bias)(relu)
    out = BatchNormalization()(conv2)
    if shortcut:
        # residual connection: element-wise sum with the untouched block input
        out = Add()([out, inputs])
    return Activation('relu')(out)

def conv_block(inputs, filters, kernel_size=3, use_bias=False, shortcut=False, 
               pool_type='max', sorted=True, stage=1):
    """Two Conv1D -> BatchNormalization layers followed by downsampling.

    When `pool_type` is not None the block ends with a 1x1 convolution that
    doubles the channel count to `2 * filters`.

    Args:
        inputs: input tensor of the block.
        filters: number of filters of the two main convolutions.
        kernel_size: kernel size of the two main convolutions.
        use_bias: forwarded to every Conv1D of the block (previously accepted but
            ignored). All convolutions here are followed by BatchNormalization.
        shortcut: when True, a 1x1-convolved copy of `inputs` is added to the
            downsampled output before the ReLU.
        pool_type: forwarded to `downsample` ('max', 'k_max', 'conv' or None).
        sorted: forwarded to `downsample`.
        stage: integer used to build the layer names of this block; it has to be
            different for every conv_block inside one model.

    Returns:
        Output tensor of the block.
    """
    conv1 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same',
                   use_bias=use_bias)(inputs)
    bn1 = BatchNormalization()(conv1)
    relu1 = Activation('relu')(bn1)

    conv2 = Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same',
                   use_bias=use_bias)(relu1)
    out = BatchNormalization()(conv2)

    if shortcut:
        # The residual branch has to shrink the sequence exactly like the main
        # branch: stride 2 when downsampling, stride 1 when pool_type is None
        # (a fixed stride of 2 made Add() fail for pool_type=None).
        # NOTE(review): with pool_type='k_max' and an odd sequence length the two
        # branches still differ by one step (floor vs. ceil of length / 2) — confirm.
        residual_strides = 1 if pool_type is None else 2
        residual = Conv1D(filters=filters, kernel_size=1, strides=residual_strides, use_bias=use_bias,
                          name='shortcut_conv1d_%d' % stage)(inputs)
        residual = BatchNormalization(name='shortcut_batch_normalization_%d' % stage)(residual)
        out = downsample(out, pool_type=pool_type, sorted=sorted, stage=stage)
        out = Add()([out, residual])
        out = Activation('relu')(out)
    else:
        out = Activation('relu')(out)
        out = downsample(out, pool_type=pool_type, sorted=sorted, stage=stage)
    if pool_type is not None:
        # 1x1 convolution doubling the number of channels after downsampling
        out = Conv1D(filters=2*filters, kernel_size=1, strides=1, padding='same', use_bias=use_bias,
                     name='1_1_conv_%d' % stage)(out)
        out = BatchNormalization(name='1_1_batch_normalization_%d' % stage)(out)
    return out

def downsample(inputs, pool_type='max', sorted=True, stage=1):
    """Downsample `inputs` along the sequence axis (axis 1).

    Args:
        inputs: input tensor; the 'k_max' and 'conv' modes read its static shape.
        pool_type: 'max'   -> MaxPooling1D(pool_size=3, strides=2),
                   'k_max' -> KMaxPooling keeping half of the steps,
                   'conv'  -> strided Conv1D followed by BatchNormalization,
                   None    -> `inputs` is returned unchanged.
        sorted: forwarded to KMaxPooling (only used by 'k_max').
        stage: integer used to build the name of the pooling layer.

    Returns:
        The downsampled tensor.

    Raises:
        ValueError: for an unknown `pool_type`, or for 'k_max' when the sequence
            length is not statically known.
    """
    if pool_type is None:
        return inputs

    pool_name = 'pool_%d' % stage
    if pool_type == 'max':
        return MaxPooling1D(pool_size=3, strides=2, padding='same', name=pool_name)(inputs)

    # Static shape as a tuple of ints / None. tf.keras tensors do not carry the
    # `_keras_shape` attribute of standalone Keras, so it is read via the backend.
    static_shape = tf.keras.backend.int_shape(inputs)

    if pool_type == 'k_max':
        steps = static_shape[1]
        if steps is None:
            raise ValueError('k_max pooling needs a statically known sequence length.')
        return KMaxPooling(k=steps // 2, sorted=sorted, name=pool_name)(inputs)

    if pool_type == 'conv':
        # strided convolution that keeps the channel count
        out = Conv1D(filters=static_shape[-1], kernel_size=3, strides=2, padding='same', name=pool_name)(inputs)
        return BatchNormalization()(out)

    raise ValueError('unsupported pooling type!')

def VDCNN(num_classes, depth=9, sequence_length=MAX_SENTENCE_LENGTH, embedding_dim=EMBEDDING_DIM, 
          shortcut=False, pool_type='max', sorted=True, use_bias=False, input_tensor=None,
          vocab_size=None):
    """Build the VDCNN classifier (embedding -> 4 convolutional stages -> k-max pooling -> dense head).

    Args:
        num_classes: number of units of the softmax output layer.
        depth: 9, 17, 29 or 49; selects how many blocks each of the four stages gets.
        sequence_length: length of the integer id sequences fed to the model.
        embedding_dim: width of the embedding vectors.
        shortcut: enable residual connections (never applied to the last stage's conv_block).
        pool_type: downsampling mode of stages 1-3, see `downsample`; the last stage does not downsample.
        sorted: forwarded to the blocks for the 'k_max' pooling mode.
        use_bias: forwarded to the convolutional blocks.
        input_tensor: optional Keras tensor to build the model on instead of a fresh Input.
        vocab_size: number of rows of the embedding table; must be larger than the
            biggest id present in the data (e.g. `len(tokenizer.word_index) + 1`).
            Defaults to `sequence_length`, which reproduces the previous behaviour.
            NOTE(review): that default is only correct if no id reaches
            `sequence_length` — callers should pass the real vocabulary size.

    Returns:
        An uncompiled `Model` named 'VDCNN'.

    Raises:
        ValueError: if `depth` is not one of the supported values.
    """
    blocks_per_depth = {
        9: (1, 1, 1, 1),
        17: (2, 2, 2, 2),
        29: (5, 5, 2, 2),
        49: (8, 8, 5, 3),
    }
    if depth not in blocks_per_depth:
        raise ValueError('unsupported depth for VDCNN.')
    num_conv_blocks = blocks_per_depth[depth]

    if vocab_size is None:
        vocab_size = sequence_length

    # Build the graph on the tensor that is later handed to Model(); the previous
    # code always built on a fresh Input, so passing `input_tensor` produced a
    # model whose declared inputs were not connected to its outputs.
    if input_tensor is None:
        inputs = Input(shape=(sequence_length, ), name='inputs')
    else:
        inputs = input_tensor

    embedded_chars = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)
    out = Conv1D(filters=64, kernel_size=3, strides=1, padding='same', name='temp_conv')(embedded_chars)

    # Convolutional Blocks 64 / 128 / 256 / 512: each stage is (n - 1) identity
    # blocks followed by one conv_block.
    stage_filters = (64, 128, 256, 512)
    last_stage = len(stage_filters)
    for stage, (filters, n_blocks) in enumerate(zip(stage_filters, num_conv_blocks), start=1):
        for _ in range(n_blocks - 1):
            out = identity_block(out, filters=filters, kernel_size=3, use_bias=use_bias, shortcut=shortcut)
        # the last stage neither downsamples nor uses a shortcut in its conv_block
        is_last = stage == last_stage
        out = conv_block(out, filters=filters, kernel_size=3, use_bias=use_bias,
                         shortcut=False if is_last else shortcut,
                         pool_type=None if is_last else pool_type,
                         sorted=sorted, stage=stage)

    # k-max pooling with k = 8
    out = KMaxPooling(k=8, sorted=True)(out)
    out = Flatten()(out)

    # Dense Layers
    out = Dense(2048, activation='relu')(out)
    out = Dense(2048, activation='relu')(out)
    out = Dense(num_classes, activation='softmax')(out)

    # When built on an existing tensor, the model inputs are that tensor's source inputs.
    if input_tensor is not None:
        model_inputs = get_source_inputs(input_tensor)
    else:
        model_inputs = inputs

    # Create model.
    model = Model(inputs=model_inputs, outputs=out, name='VDCNN')
    return model

#### Depth = 9 (Convolution Layers)

In [0]:
# setup callbacks

DEPTH = 9

# Weights and the per-epoch CSV log of this run live under MODEL_SAVE_DIR/vDCNN/depth_<DEPTH>/.
MODEL_SAVE_AT = MODEL_SAVE_DIR + 'vDCNN/depth_{0}/'.format(DEPTH)
# exist_ok avoids the check-then-create race and keeps the cell re-runnable
os.makedirs(MODEL_SAVE_AT, exist_ok=True)


filepath = MODEL_SAVE_AT + 'model' + '.hdf5'
logfilepath = MODEL_SAVE_AT + 'logs_' + 'model' + '.csv'

reduce_lr_rate=0.2
logCallback = CSVLogger(logfilepath, separator=',', append=False)
# restore_best_weights: when training is stopped early, the in-memory model is
# rolled back to the epoch with the best val_accuracy, so the evaluation below
# does not run on the weights of the last (worse) epochs.
earlyStopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto',
                              restore_best_weights=True)
# keeps only the weights of the best epoch (by val_accuracy) on disk
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_weights_only=True, verbose=1,
                             save_best_only=True, mode='auto')
# multiplies the learning rate by reduce_lr_rate after 3 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=reduce_lr_rate, patience=3,
                              cooldown=0, min_lr=0.0000000001, verbose=0)

callbacks_list = [logCallback, earlyStopping, reduce_lr, checkpoint]

In [0]:
# NOTE(review): in this call VDCNN sizes its Embedding table from `sequence_length`,
# so character ids >= MAX_SENTENCE_LENGTH fall outside the table — compare
# MAX_SENTENCE_LENGTH against len(tokenizer.word_index) + 1.
model = VDCNN(num_classes=train_Y.shape[1], 
  depth=DEPTH, 
  sequence_length=MAX_SENTENCE_LENGTH, 
  shortcut=False,
  pool_type='max', 
  sorted=False,
  use_bias=False)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

##### Training

In [0]:
# Upper bound on training epochs and the mini-batch size used for fit / predict.
EPOCHS, BATCH_SIZE = 50, 64

In [0]:
# Train with validation after every epoch; callbacks_list is passed through to fit.
hist = model.fit(
    x=train_X,
    y=train_Y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(val_X, val_Y),
    shuffle=True,
)

##### Inference and Results

In [0]:
# Class probabilities for the held-out test split (one row per sample).
predictions = model.predict(
    x=test_X,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [0]:
# labels_dict was filled in class-id order, so position i of labels_list names class id i.
labels_list = list(labels_dict.keys())
class_ids = list(labels_dict.values())

# collapse one-hot targets / predicted probabilities to integer class ids
test_Y_max = np.argmax(test_Y, axis=-1)
predictions_max = np.argmax(predictions, axis=-1)

####################################### CONFUSION MATRIX

# `labels` pins one row / column per known class, so the matrix always matches
# labels_list even when a class is absent from both targets and predictions.
cm = confusion_matrix(test_Y_max, predictions_max, labels=class_ids)
print('Confusion Matrix:\n', cm)

# Row-normalise (fraction of each true class); rows without any sample stay
# all-zero instead of becoming NaN through a 0/0 division.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
cm = pd.DataFrame(cm, labels_list, labels_list)
sn.set(font_scale=1.4) # for label size
sn.heatmap(cm, annot=True, annot_kws={"size": 11}, fmt=".2f") # font size
plt.show()

# human-readable class names for the report (previously the stringified class ids)
report_labels = [str(name) for name in labels_list]
#########################################################

################################### CLASSIFICATION REPORT
print("\n\nClassification Report\n", classification_report(test_Y_max, predictions_max, 
                                                       labels=class_ids, target_names=report_labels))
#########################################################


################################# MODEL TRAINING PROGRESS
# The log file starts with a header row; column names are taken from it instead
# of being overridden with a hard-coded list that assumes a fixed column order.
logsdF = pd.read_csv(logfilepath, delimiter=',', header=0)
fig, ax = plt.subplots()

for metric in ('accuracy', 'loss', 'val_accuracy', 'val_loss'):
    logsdF.plot(kind='line', x='epoch', y=metric, ax=ax)
plt.legend()
plt.title('Model Training Progress')
plt.show()
#########################################################

#### Depth = 17 (Convolution Layers)

In [0]:
# setup callbacks

DEPTH = 17

# Weights and the per-epoch CSV log of this run live under MODEL_SAVE_DIR/vDCNN/depth_<DEPTH>/.
MODEL_SAVE_AT = MODEL_SAVE_DIR + 'vDCNN/depth_{0}/'.format(DEPTH)
# exist_ok avoids the check-then-create race and keeps the cell re-runnable
os.makedirs(MODEL_SAVE_AT, exist_ok=True)


filepath = MODEL_SAVE_AT + 'model' + '.hdf5'
logfilepath = MODEL_SAVE_AT + 'logs_' + 'model' + '.csv'

reduce_lr_rate=0.2
logCallback = CSVLogger(logfilepath, separator=',', append=False)
# restore_best_weights: when training is stopped early, the in-memory model is
# rolled back to the epoch with the best val_accuracy, so the evaluation below
# does not run on the weights of the last (worse) epochs.
earlyStopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto',
                              restore_best_weights=True)
# keeps only the weights of the best epoch (by val_accuracy) on disk
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_weights_only=True, verbose=1,
                             save_best_only=True, mode='auto')
# multiplies the learning rate by reduce_lr_rate after 3 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=reduce_lr_rate, patience=3,
                              cooldown=0, min_lr=0.0000000001, verbose=0)

callbacks_list = [logCallback, earlyStopping, reduce_lr, checkpoint]

In [0]:
# NOTE(review): in this call VDCNN sizes its Embedding table from `sequence_length`,
# so character ids >= MAX_SENTENCE_LENGTH fall outside the table — compare
# MAX_SENTENCE_LENGTH against len(tokenizer.word_index) + 1.
model = VDCNN(num_classes=train_Y.shape[1], 
  depth=DEPTH, 
  sequence_length=MAX_SENTENCE_LENGTH, 
  shortcut=False,
  pool_type='max', 
  sorted=False,
  use_bias=False)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

##### Training

In [0]:
# Upper bound on training epochs and the mini-batch size used for fit / predict.
EPOCHS, BATCH_SIZE = 50, 64

In [0]:
# Train with validation after every epoch; callbacks_list is passed through to fit.
hist = model.fit(
    x=train_X,
    y=train_Y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(val_X, val_Y),
    shuffle=True,
)

##### Inference and Results

In [0]:
# Class probabilities for the held-out test split (one row per sample).
predictions = model.predict(
    x=test_X,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [0]:
# labels_dict was filled in class-id order, so position i of labels_list names class id i.
labels_list = list(labels_dict.keys())
class_ids = list(labels_dict.values())

# collapse one-hot targets / predicted probabilities to integer class ids
test_Y_max = np.argmax(test_Y, axis=-1)
predictions_max = np.argmax(predictions, axis=-1)

####################################### CONFUSION MATRIX

# `labels` pins one row / column per known class, so the matrix always matches
# labels_list even when a class is absent from both targets and predictions.
cm = confusion_matrix(test_Y_max, predictions_max, labels=class_ids)
print('Confusion Matrix:\n', cm)

# Row-normalise (fraction of each true class); rows without any sample stay
# all-zero instead of becoming NaN through a 0/0 division.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
cm = pd.DataFrame(cm, labels_list, labels_list)
sn.set(font_scale=1.4) # for label size
sn.heatmap(cm, annot=True, annot_kws={"size": 11}, fmt=".2f") # font size
plt.show()

# human-readable class names for the report (previously the stringified class ids)
report_labels = [str(name) for name in labels_list]
#########################################################

################################### CLASSIFICATION REPORT
print("\n\nClassification Report\n", classification_report(test_Y_max, predictions_max, 
                                                       labels=class_ids, target_names=report_labels))
#########################################################


################################# MODEL TRAINING PROGRESS
# The log file starts with a header row; column names are taken from it instead
# of being overridden with a hard-coded list that assumes a fixed column order.
logsdF = pd.read_csv(logfilepath, delimiter=',', header=0)
fig, ax = plt.subplots()

for metric in ('accuracy', 'loss', 'val_accuracy', 'val_loss'):
    logsdF.plot(kind='line', x='epoch', y=metric, ax=ax)
plt.legend()
plt.title('Model Training Progress')
plt.show()
#########################################################

#### Depth = 29 (Convolution Layers)

In [0]:
# setup callbacks

DEPTH = 29

# Weights and the per-epoch CSV log of this run live under MODEL_SAVE_DIR/vDCNN/depth_<DEPTH>/.
MODEL_SAVE_AT = MODEL_SAVE_DIR + 'vDCNN/depth_{0}/'.format(DEPTH)
# exist_ok avoids the check-then-create race and keeps the cell re-runnable
os.makedirs(MODEL_SAVE_AT, exist_ok=True)


filepath = MODEL_SAVE_AT + 'model' + '.hdf5'
logfilepath = MODEL_SAVE_AT + 'logs_' + 'model' + '.csv'

reduce_lr_rate=0.2
logCallback = CSVLogger(logfilepath, separator=',', append=False)
# restore_best_weights: when training is stopped early, the in-memory model is
# rolled back to the epoch with the best val_accuracy, so the evaluation below
# does not run on the weights of the last (worse) epochs.
earlyStopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto',
                              restore_best_weights=True)
# keeps only the weights of the best epoch (by val_accuracy) on disk
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_weights_only=True, verbose=1,
                             save_best_only=True, mode='auto')
# multiplies the learning rate by reduce_lr_rate after 3 epochs without val_loss improvement
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=reduce_lr_rate, patience=3,
                              cooldown=0, min_lr=0.0000000001, verbose=0)

callbacks_list = [logCallback, earlyStopping, reduce_lr, checkpoint]

In [0]:
# NOTE(review): in this call VDCNN sizes its Embedding table from `sequence_length`,
# so character ids >= MAX_SENTENCE_LENGTH fall outside the table — compare
# MAX_SENTENCE_LENGTH against len(tokenizer.word_index) + 1.
model = VDCNN(num_classes=train_Y.shape[1], 
  depth=DEPTH, 
  sequence_length=MAX_SENTENCE_LENGTH, 
  shortcut=False,
  pool_type='max', 
  sorted=False,
  use_bias=False)

# `learning_rate` is the supported argument name; `lr` is a deprecated alias.
model.compile(optimizer=SGD(learning_rate=0.01, momentum=0.9), loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

##### Training

In [0]:
# Upper bound on training epochs and the mini-batch size used for fit / predict.
EPOCHS, BATCH_SIZE = 50, 64

In [0]:
# Train with validation after every epoch; callbacks_list is passed through to fit.
hist = model.fit(
    x=train_X,
    y=train_Y,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
    verbose=1,
    callbacks=callbacks_list,
    validation_data=(val_X, val_Y),
    shuffle=True,
)

##### Inference and Results

In [0]:
# Class probabilities for the held-out test split (one row per sample).
predictions = model.predict(
    x=test_X,
    batch_size=BATCH_SIZE,
    verbose=1,
)

In [0]:
# labels_dict was filled in class-id order, so position i of labels_list names class id i.
labels_list = list(labels_dict.keys())
class_ids = list(labels_dict.values())

# collapse one-hot targets / predicted probabilities to integer class ids
test_Y_max = np.argmax(test_Y, axis=-1)
predictions_max = np.argmax(predictions, axis=-1)

####################################### CONFUSION MATRIX

# `labels` pins one row / column per known class, so the matrix always matches
# labels_list even when a class is absent from both targets and predictions.
cm = confusion_matrix(test_Y_max, predictions_max, labels=class_ids)
print('Confusion Matrix:\n', cm)

# Row-normalise (fraction of each true class); rows without any sample stay
# all-zero instead of becoming NaN through a 0/0 division.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = np.divide(cm, row_totals, out=np.zeros(cm.shape, dtype=float), where=row_totals != 0)
cm = pd.DataFrame(cm, labels_list, labels_list)
sn.set(font_scale=1.4) # for label size
sn.heatmap(cm, annot=True, annot_kws={"size": 11}, fmt=".2f") # font size
plt.show()

# human-readable class names for the report (previously the stringified class ids)
report_labels = [str(name) for name in labels_list]
#########################################################

################################### CLASSIFICATION REPORT
print("\n\nClassification Report\n", classification_report(test_Y_max, predictions_max, 
                                                       labels=class_ids, target_names=report_labels))
#########################################################


################################# MODEL TRAINING PROGRESS
# The log file starts with a header row; column names are taken from it instead
# of being overridden with a hard-coded list that assumes a fixed column order.
logsdF = pd.read_csv(logfilepath, delimiter=',', header=0)
fig, ax = plt.subplots()

for metric in ('accuracy', 'loss', 'val_accuracy', 'val_loss'):
    logsdF.plot(kind='line', x='epoch', y=metric, ax=ax)
plt.legend()
plt.title('Model Training Progress')
plt.show()
#########################################################