# LSTM for Author Detection

> This notebook contains implementations of LSTM-based models for author detection.

### Imports and Constants

In [0]:
import nltk

# word_tokenize relies on the Punkt sentence-tokenizer data. Recent NLTK
# releases load it from the 'punkt_tab' resource while older ones use 'punkt',
# so fetch both to keep the notebook runnable on either.
nltk.download('punkt')
nltk.download('punkt_tab')

In [0]:
%matplotlib inline

import os, glob, re

import matplotlib.pyplot as plt
plt.style.use('ggplot')
import seaborn as sn

import pandas as pd
import numpy as np

from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, Bidirectional, BatchNormalization, SpatialDropout1D, Conv1D, GlobalMaxPooling1D
from tensorflow.keras import optimizers
from tensorflow.keras import regularizers
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, CSVLogger, ReduceLROnPlateau

from sklearn.metrics import confusion_matrix, classification_report

from nltk import word_tokenize

In [0]:
# Toggle between Google Colab (project folder on the mounted Drive) and a
# local checkout (current directory).
isOnColab = True

# Root folder under which both the dataset and the saved models live.
DIR_PREFIX = './drive/My Drive/MSCS/S4/NLP/project/' if isOnColab else './'

DATASET_DIR = DIR_PREFIX + 'dataset/'
MODEL_SAVE_DIR = DIR_PREFIX + 'models/'

# CSV splits read by load_dataset().
TRAIN_FILE = DATASET_DIR + 'train.csv'
VAL_FILE = DATASET_DIR + 'val.csv'
TEST_FILE = DATASET_DIR + 'test.csv'

# Number of tokens kept per text (truncation in preprocessing, padding length).
MAX_SENTENCE_LENGTH = 100
# Width of a word vector; has to match the GloVe file that is loaded (100d).
EMBEDDING_DIM = 100

### Load Dataset

In [0]:
def load_dataset(file_name):
  """Read a comma-separated dataset file.

  Args:
    file_name: path of a CSV file that has 'Text' and 'Label' columns.

  Returns:
    A ``(texts, labels)`` tuple holding the 'Text' and 'Label' columns as
    pandas Series.
  """
  frame = pd.read_csv(file_name, sep=',')
  texts = frame['Text']
  labels = frame['Label']
  return texts, labels

In [0]:
# Read the three splits; each call yields the (Text, Label) columns of one CSV.
(raw_train_X, raw_train_Y) = load_dataset(TRAIN_FILE)
(raw_val_X, raw_val_Y) = load_dataset(VAL_FILE)
(raw_test_X, raw_test_Y) = load_dataset(TEST_FILE)

# Sanity check: texts and labels of a split must have the same number of rows.
print('train_X: {0}\ttrain_Y: {1}'.format(raw_train_X.shape[0], raw_train_Y.shape[0]))
print('val_X: {0}\tval_Y: {1}'.format(raw_val_X.shape[0], raw_val_Y.shape[0]))
print('test_X: {0}\ttest_Y: {1}'.format(raw_test_X.shape[0], raw_test_Y.shape[0]))

In [0]:
# Class balance of the training split, most frequent label first.
# The `y=` keyword has no effect when plotting a Series, so the y-axis label is
# set explicitly on the returned Axes instead.
plot = raw_train_Y.value_counts().sort_values(ascending=False).plot(kind='bar', title='# of tweets per politician')
plot.set_ylabel('# of tweets');

### Preprocess Data

In [0]:
def preprocessing(data):
    """Tokenize every text and clip it to MAX_SENTENCE_LENGTH tokens.

    Args:
        data: iterable of raw text strings.

    Returns:
        A list with one entry per input text; each entry is the list of
        tokens produced by ``word_tokenize``, cut after the first
        MAX_SENTENCE_LENGTH tokens.
    """
    return [word_tokenize(text)[:MAX_SENTENCE_LENGTH] for text in data]

In [0]:
# Tokenize and truncate every split with the same preprocessing routine.
processed_train_X, processed_val_X, processed_test_X = (
  preprocessing(raw_split)
  for raw_split in (raw_train_X, raw_val_X, raw_test_X)
)

In [0]:
# create one-hot labels
# Each distinct training label gets an integer class id, numbered in order of
# first appearance in the training split.
labels_dict = {label: index for index, label in enumerate(raw_train_Y.unique())}

# replace string labels with corresponding numeric value from labels_dict
# (a label that never occurs in the training split raises KeyError here)
train_Y = raw_train_Y.apply(lambda x: labels_dict[x])
val_Y = raw_val_Y.apply(lambda x: labels_dict[x])
test_Y = raw_test_Y.apply(lambda x: labels_dict[x])

# convert to one-hot encoding
# num_classes is pinned so that every split has one column per known class,
# even when the highest class id does not occur in val/test.
train_Y = to_categorical(train_Y, num_classes=len(labels_dict))
val_Y = to_categorical(val_Y, num_classes=len(labels_dict))
test_Y = to_categorical(test_Y, num_classes=len(labels_dict))

# print
train_Y, val_Y, test_Y

In [0]:
# Tokenize texts
# The vocabulary is fitted on the training tokens only; '[UNK]' is the
# out-of-vocabulary placeholder.
tokenizer = Tokenizer(oov_token='[UNK]')
tokenizer.fit_on_texts(processed_train_X)

# Show only the most frequent tokens: printing tokenizer.word_counts in full
# dumps the whole vocabulary into the cell output.
top_word_counts = sorted(tokenizer.word_counts.items(), key=lambda item: item[1], reverse=True)[:20]
print('word counts (top 20):', top_word_counts)
print('document_count:', tokenizer.document_count)
print('vocab_size:', len(tokenizer.word_index))

In [0]:
# Encode every split as sequences of vocabulary ids with the fitted tokenizer.
train_X, val_X, test_X = (
  tokenizer.texts_to_sequences(token_lists)
  for token_lists in (processed_train_X, processed_val_X, processed_test_X)
)

print(train_X[:3])

In [0]:
# pad sequences
# Every split is padded at the end ('post') up to MAX_SENTENCE_LENGTH ids.
train_X, val_X, test_X = (
  pad_sequences(id_sequences, maxlen=MAX_SENTENCE_LENGTH, padding='post')
  for id_sequences in (train_X, val_X, test_X)
)

print(train_X[:3])

In [0]:
# Shape of one padded training example; padding used maxlen=MAX_SENTENCE_LENGTH.
train_X[0].shape

### Prepare Embeddings

In [0]:
# Path of the GloVe vectors file inside DATASET_DIR (despite the name, this is
# a file path, not a directory). The commented line is an alternative file.
# GLOVE_DIR = DATASET_DIR + 'glove.twitter.27B.100d.txt'
GLOVE_DIR = DATASET_DIR + 'glove.6B.100d.txt'

In [0]:
# load embeddings of GLoVE
# Each line of the file is expected to be "<token> <v1> ... <vN>", separated
# by single spaces.
embeddings_dict = {}

with open(GLOVE_DIR, 'r', encoding="utf-8") as f:
  for line_number, line in enumerate(f, start=1):
      # Split on the literal space only: a bare split() also splits on other
      # Unicode whitespace, which mis-parses tokens that are themselves
      # whitespace-like characters (number taken as word, vector one short).
      values = line.rstrip().split(' ')
      if len(values) != EMBEDDING_DIM + 1:
          # Fail early with context instead of building a wrongly sized vector
          # that only breaks later when the embedding matrix is filled.
          raise ValueError('{0}: line {1} has {2} vector components, expected {3}'.format(
              GLOVE_DIR, line_number, len(values) - 1, EMBEDDING_DIM))
      word = values[0]
      vector = np.asarray(values[1:], "float32")
      embeddings_dict[word] = vector

print('embeddings_dict:', len(embeddings_dict))

In [0]:
# The tokenizer's out-of-vocabulary placeholder gets an all-zero vector.
embeddings_dict['[UNK]'] = np.zeros((EMBEDDING_DIM,), dtype='float32')


def _add_zero_vectors(token_lists, embeddings):
  """Give every token that has no vector yet an all-zero vector (in place).

  Args:
    token_lists: iterable of token lists, one list per text.
    embeddings: dict mapping token -> vector; updated in place.
  """
  for tokens in token_lists:
    for token in tokens:
      # Splitting on ' ' is kept from the original code; a token that contains
      # no space yields just itself.
      for word in token.split(' '):
        if word not in embeddings:
          embeddings[word] = np.zeros((EMBEDDING_DIM,), dtype='float32')


# NOTE(review): tokens are added in their original case here, while the
# embedding matrix is later looked up through tokenizer.word_index; if the
# Tokenizer lower-cases its vocabulary (its default, presumably), these
# entries only help for tokens that are already lower-case - confirm.
# load embeddings for the train, validation and test datasets (in that order)
for token_lists in (processed_train_X, processed_val_X, processed_test_X):
  _add_zero_vectors(token_lists, embeddings_dict)
  print('embeddings_dict:', len(embeddings_dict))

In [0]:
# create matrix of embeddings
# Row i holds the vector of the token whose tokenizer index is i. Tokens
# without a known vector keep an all-zero row. The extra row (+1) leaves room
# for index 0, which presumably is the padding id - TODO confirm.
vocab_rows = len(tokenizer.word_index) + 1
EMBEDDING_MATRIX = np.zeros((vocab_rows, EMBEDDING_DIM))
missing_vector = np.zeros((EMBEDDING_DIM,), dtype='float32')

for token, row in tokenizer.word_index.items():
  # fall back to zeros when the token has no pretrained vector
  EMBEDDING_MATRIX[row] = embeddings_dict.get(token, missing_vector)

print('EMBEDDING_MATRIX', EMBEDDING_MATRIX.shape, EMBEDDING_MATRIX)

### Simple LSTM

In [0]:
# setup callbacks

# Folder that receives the checkpoint and the training log of this model.
MODEL_SAVE_AT = MODEL_SAVE_DIR + 'lstm_plain/'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(MODEL_SAVE_AT, exist_ok=True)


filepath = MODEL_SAVE_AT + 'model' + '.hdf5'
logfilepath = MODEL_SAVE_AT + 'logs_' + 'model' + '.csv'

# Factor by which the learning rate is multiplied when val_loss plateaus.
reduce_lr_rate=0.2
# Per-epoch metrics go to a CSV file that is overwritten on every run.
logCallback = CSVLogger(logfilepath, separator=',', append=False)
# Stop once val_accuracy has not improved for 5 epochs.
earlyStopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto')
# Keep only the weights of the best epoch according to val_accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_weights_only=True, verbose=1,
                             save_best_only=True, mode='auto')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=reduce_lr_rate, patience=3,
                              cooldown=0, min_lr=0.0000000001, verbose=0)

callbacks_list = [logCallback, earlyStopping, reduce_lr, checkpoint]

In [0]:
# Model Code
# Embedding (initialised from EMBEDDING_MATRIX, fine-tuned) -> LSTM -> softmax.

inp_layer = Input(shape=(MAX_SENTENCE_LENGTH,))

layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, 
                  input_length=MAX_SENTENCE_LENGTH, weights=[EMBEDDING_MATRIX], trainable=True)(inp_layer)

layer = SpatialDropout1D(0.3)(layer)
layer = LSTM(128, dropout=0.2, recurrent_dropout=0.2)(layer)
layer = BatchNormalization()(layer)
layer = Dropout(0.5)(layer)
# Every text has exactly one author (one-hot targets, argmax at evaluation),
# so the classes are mutually exclusive: use a softmax output.
layer = Dense(len(labels_dict), activation='softmax')(layer)

output_layer = layer

model = Model(inputs=inp_layer, outputs=output_layer)

# categorical_crossentropy matches the one-hot targets. With
# binary_crossentropy, 'accuracy' is computed per output unit and is inflated,
# which also misleads the callbacks that monitor val_accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints by itself and returns None; wrapping it in print() adds a stray "None".
model.summary()

#### Training

In [0]:
# Upper bound on training epochs (early stopping may end sooner) and the
# mini-batch size used for both training and prediction.
EPOCHS, BATCH_SIZE = 50, 64

In [0]:
# Train on the training split and validate on the validation split after each
# epoch; the callbacks handle logging, early stopping, LR decay and checkpoints.
hist = model.fit(
  train_X,
  train_Y,
  validation_data=(val_X, val_Y),
  epochs=EPOCHS,
  batch_size=BATCH_SIZE,
  shuffle=True,
  verbose=1,
  callbacks=callbacks_list,
)

#### Inference and Results

In [0]:
# Evaluate the best checkpoint rather than the last-epoch weights:
# ModelCheckpoint(save_best_only=True) wrote the best val_accuracy weights to
# `filepath`, and EarlyStopping was not asked to restore them.
if os.path.exists(filepath):
  model.load_weights(filepath)

predictions = model.predict(test_X, batch_size=BATCH_SIZE, verbose=1)

In [0]:
# Class names and class ids; labels_dict assigns ids 0..n-1 in insertion
# order, so both lists are aligned and sorted by class id.
labels_list = list(labels_dict.keys())
class_ids = list(labels_dict.values())

# Collapse one-hot targets and per-class scores to class ids.
test_Y_max = np.argmax(test_Y, axis=-1)
predictions_max = np.argmax(predictions, axis=-1)

####################################### CONFUSION MATRIX

# labels= pins the matrix to one row/column per known class, so its shape
# always matches labels_list even when a class is absent from both the test
# targets and the predictions.
cm = confusion_matrix(test_Y_max, predictions_max, labels=class_ids)
print('Confusion Matrix:\n', cm)

# Row-normalise (fraction per true class). Rows of classes without any test
# sample stay all-zero instead of becoming NaN through a division by zero.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = cm.astype('float') / np.maximum(row_totals, 1)
cm = pd.DataFrame(cm, labels_list, labels_list)
sn.set(font_scale=1.4) # for label size
sn.heatmap(cm, annot=True, annot_kws={"size": 11}, fmt=".2f") # font size
plt.show()
#########################################################

################################### CLASSIFICATION REPORT
# The report rows are named by class id (as strings), not by author name.
report_labels = [str(x) for x in class_ids]
print("\n\nClassification Report\n", classification_report(test_Y_max, predictions_max, 
                                                       labels=class_ids, target_names=report_labels))
#########################################################


################################# MODEL TRAINING PROGRESS
# Per-epoch metrics as written by the CSVLogger callback.
logsdF = pd.read_csv(logfilepath, delimiter=',')
ax = plt.gca()

logsdF.plot(kind='line', x='epoch', y='accuracy', ax=ax)
logsdF.plot(kind='line', x='epoch', y='loss', ax=ax)
logsdF.plot(kind='line', x='epoch', y='val_accuracy', ax=ax)
logsdF.plot(kind='line', x='epoch', y='val_loss', ax=ax)
plt.legend()
plt.title('Model Training Progress')
plt.show()
#########################################################

### BiDirectional LSTM with 1D CNN and Max Pooling

In [0]:
# setup callbacks

# Folder that receives the checkpoint and the training log of this model.
MODEL_SAVE_AT = MODEL_SAVE_DIR + 'lstm_bidirectional/'
# exist_ok avoids the separate existence check (and the race it implies).
os.makedirs(MODEL_SAVE_AT, exist_ok=True)


filepath = MODEL_SAVE_AT + 'model' + '.hdf5'
logfilepath = MODEL_SAVE_AT + 'logs_' + 'model' + '.csv'

# Factor by which the learning rate is multiplied when val_loss plateaus.
reduce_lr_rate=0.2
# Per-epoch metrics go to a CSV file that is overwritten on every run.
logCallback = CSVLogger(logfilepath, separator=',', append=False)
# Stop once val_accuracy has not improved for 5 epochs.
earlyStopping = EarlyStopping(monitor='val_accuracy', min_delta=0, patience=5, verbose=0, mode='auto')
# Keep only the weights of the best epoch according to val_accuracy.
checkpoint = ModelCheckpoint(filepath, monitor='val_accuracy', save_weights_only=True, verbose=1,
                             save_best_only=True, mode='auto')
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=reduce_lr_rate, patience=3,
                              cooldown=0, min_lr=0.0000000001, verbose=0)

callbacks_list = [logCallback, earlyStopping, reduce_lr, checkpoint]

In [0]:
# Model Code
# Embedding (initialised from EMBEDDING_MATRIX, fine-tuned) -> bidirectional
# LSTM over the whole sequence -> 1D convolution -> global max pooling -> softmax.

inp_layer = Input(shape=(MAX_SENTENCE_LENGTH,))

layer = Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=EMBEDDING_DIM, 
                  input_length=MAX_SENTENCE_LENGTH, weights=[EMBEDDING_MATRIX], 
                  embeddings_regularizer=regularizers.l2(0.00), trainable=True)(inp_layer)

layer = SpatialDropout1D(0.3)(layer)

# return_sequences=True keeps one output per time step for the convolution below.
layer = Bidirectional(LSTM(128, name='blstm_1',
                           activation='tanh',
                           recurrent_activation='hard_sigmoid',
                           recurrent_dropout=0.0,
                           dropout=0.5,
                           kernel_initializer='glorot_uniform',
                           return_sequences=True))(layer)

# The layer name 'conv2d_1' is kept as-is although this is a Conv1D layer.
layer = Conv1D(filters=64, kernel_size=2, padding='valid', kernel_initializer = 'glorot_uniform', name='conv2d_1')(layer)
layer = GlobalMaxPooling1D()(layer)
# layer = BatchNormalization()(layer)
# layer = Dropout(0.5)(layer)
# Every text has exactly one author (one-hot targets, argmax at evaluation),
# so the classes are mutually exclusive: use a softmax output.
layer = Dense(len(labels_dict), activation='softmax')(layer)

output_layer = layer

model = Model(inputs=inp_layer, outputs=output_layer)

# categorical_crossentropy matches the one-hot targets. With
# binary_crossentropy, 'accuracy' is computed per output unit and is inflated,
# which also misleads the callbacks that monitor val_accuracy.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

# summary() prints by itself and returns None; wrapping it in print() adds a stray "None".
model.summary()

#### Training

In [0]:
# Upper bound on training epochs (early stopping may end sooner) and the
# mini-batch size used for both training and prediction.
EPOCHS, BATCH_SIZE = 50, 64

In [0]:
# Train on the training split and validate on the validation split after each
# epoch; the callbacks handle logging, early stopping, LR decay and checkpoints.
hist = model.fit(
  train_X,
  train_Y,
  validation_data=(val_X, val_Y),
  epochs=EPOCHS,
  batch_size=BATCH_SIZE,
  shuffle=True,
  verbose=1,
  callbacks=callbacks_list,
)

#### Inference and Results

In [0]:
# Evaluate the best checkpoint rather than the last-epoch weights:
# ModelCheckpoint(save_best_only=True) wrote the best val_accuracy weights to
# `filepath`, and EarlyStopping was not asked to restore them.
if os.path.exists(filepath):
  model.load_weights(filepath)

predictions = model.predict(test_X, batch_size=BATCH_SIZE, verbose=1)

In [0]:
# Class names and class ids; labels_dict assigns ids 0..n-1 in insertion
# order, so both lists are aligned and sorted by class id.
labels_list = list(labels_dict.keys())
class_ids = list(labels_dict.values())

# Collapse one-hot targets and per-class scores to class ids.
test_Y_max = np.argmax(test_Y, axis=-1)
predictions_max = np.argmax(predictions, axis=-1)

####################################### CONFUSION MATRIX

# labels= pins the matrix to one row/column per known class, so its shape
# always matches labels_list even when a class is absent from both the test
# targets and the predictions.
cm = confusion_matrix(test_Y_max, predictions_max, labels=class_ids)
print('Confusion Matrix:\n', cm)

# Row-normalise (fraction per true class). Rows of classes without any test
# sample stay all-zero instead of becoming NaN through a division by zero.
row_totals = cm.sum(axis=1)[:, np.newaxis]
cm = cm.astype('float') / np.maximum(row_totals, 1)
cm = pd.DataFrame(cm, labels_list, labels_list)
sn.set(font_scale=1.4) # for label size
sn.heatmap(cm, annot=True, annot_kws={"size": 11}, fmt=".2f") # font size
plt.show()
#########################################################

################################### CLASSIFICATION REPORT
# The report rows are named by class id (as strings), not by author name.
report_labels = [str(x) for x in class_ids]
print("\n\nClassification Report\n", classification_report(test_Y_max, predictions_max, 
                                                       labels=class_ids, target_names=report_labels))
#########################################################


################################# MODEL TRAINING PROGRESS
# Per-epoch metrics as written by the CSVLogger callback.
logsdF = pd.read_csv(logfilepath, delimiter=',')
ax = plt.gca()

logsdF.plot(kind='line', x='epoch', y='accuracy', ax=ax)
logsdF.plot(kind='line', x='epoch', y='loss', ax=ax)
logsdF.plot(kind='line', x='epoch', y='val_accuracy', ax=ax)
logsdF.plot(kind='line', x='epoch', y='val_loss', ax=ax)
plt.legend()
plt.title('Model Training Progress')
plt.show()
#########################################################