# Preprocessing

In [98]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [99]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import resample

import re
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

In [100]:
tf.test.is_gpu_available()

True

In [101]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [102]:
train = pd.read_csv('drive/MyDrive/train_quotes.csv')
test = pd.read_csv('drive/MyDrive/test_quotes.csv')

In [103]:
train = train.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])
test = test.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])

In [106]:
train['top25pct'] = (train['Likes'] >= 89).astype(int)
test['top25pct'] = (test['Likes'] >= 89).astype(int)

In [107]:
train.head()

Unnamed: 0,Quote,Likes,top25pct
0,"She turned to look at him, and he was already ...",210,1
1,Being in a religion is important but you must ...,0,0
2,Shortcuts will only satisfy and give you succe...,0,0
3,Though the earth contains greater energy and m...,5,0
4,A novel is a mirror walking along a main road.,130,1


In [108]:
test.head()

Unnamed: 0,Quote,Likes,top25pct
0,"Beyond work and love, I would add two other in...",350,1
1,A conflict is the harmful expression of differ...,0,0
2,Experience is more important than knowledge.,50,0
3,"Wake up, for life is only a fleeting moment, b...",0,0
4,It is astounding how it's harder to find a per...,0,0


In [109]:
def upsample_minority(df):

  # Upsample minority class in both the training and test data
  df_majority = df.loc[df['top25pct'] == 0, :]
  df_minority = df.loc[df['top25pct'] == 1, :]
  df_minority_upsampled = resample(df_minority, replace = True, n_samples = len(df_majority), random_state = 42)

  # Combine together to get the upsampled training data
  df = pd.concat([df_majority, df_minority_upsampled])

  return df

In [110]:
# Upsample the minority class
train = upsample_minority(train)
test = upsample_minority(test)

In [111]:
train.head()

Unnamed: 0,Quote,Likes,top25pct
1,Being in a religion is important but you must ...,0,0
2,Shortcuts will only satisfy and give you succe...,0,0
3,Though the earth contains greater energy and m...,5,0
5,She wore far too much rouge last night and not...,26,0
7,Happy Youth Day1 Timothy 4:12Ecclesiastes 12:1,0,0


In [112]:
test.head()

Unnamed: 0,Quote,Likes,top25pct
1,A conflict is the harmful expression of differ...,0,0
2,Experience is more important than knowledge.,50,0
3,"Wake up, for life is only a fleeting moment, b...",0,0
4,It is astounding how it's harder to find a per...,0,0
5,"But then again, that's what the Book of Job wa...",3,0


In [113]:
# Preprocessing - to remove length-1 words, and remove non-alphabet symbols
def preprocessing(quotes):

    processed_quotes = []
    
    for quote in tqdm(quotes):
        
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', quote)
        
        words = processed.split()
        
        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_quotes.append(' '.join([word for word in words if len(word) > 1]))
    
    return processed_quotes

In [114]:
train['Quote'] = preprocessing(train['Quote'])
test['Quote'] = preprocessing(test['Quote'])

100%|██████████| 84710/84710 [00:01<00:00, 74265.79it/s]
100%|██████████| 21246/21246 [00:00<00:00, 75542.10it/s]


In [115]:
def preprocessing_2(quotes):
    
    processed_quotes = []

    for quote in tqdm(quotes):
        tokens = word_tokenize(quote)

        # Convert to lower case
        tokens = [w.lower() for w in tokens]

        # Remove punctuation
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]

        # Remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        
        # Filter out stopwords
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        
        processed_quotes.append(words)
        
    return processed_quotes

In [116]:
train['Quote'] = preprocessing_2(train['Quote'])
test['Quote'] = preprocessing_2(test['Quote'])

100%|██████████| 84710/84710 [00:42<00:00, 2013.12it/s]
100%|██████████| 21246/21246 [00:10<00:00, 2033.65it/s]


In [117]:
# Shuffle test again, and reset index (very important!!!)
test = test.sample(frac = 1)
test = test.reset_index(drop = True)
train = train.reset_index(drop = True)

# Fitting Word Embeddings

In [118]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [119]:
# Extract the embeddings from the stored file
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('drive/MyDrive/word2vec_train.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [120]:
def vectorize_text(content):

  # Vectorize the text samples into 2D integer tensor - max length 16 words
  tokenizer_obj = Tokenizer()

  # Fit the tokenizer on the text
  tokenizer_obj.fit_on_texts(content)

  # Generate the sequence of tokens
  sequences = tokenizer_obj.texts_to_sequences(content)

  # Get the max length of each article - 5587
  max_length = max([len(s) for s in content])
  
  # Pad the sequences
  vectorized_text = pad_sequences(sequences, maxlen = max_length)

  return vectorized_text, tokenizer_obj, max_length

In [121]:
def get_embedding_matrix(tokenizer_obj, EMBEDDING_DIM = 100):
 
  word_index = tokenizer_obj.word_index

  num_words = len(word_index) + 1
  words_not_found = []
  
  # Create the emedding matrix - map embeddings from word2vec model for each word and create matrix of word vectors
  embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

  for word, i in word_index.items():
      if i > num_words: # Least common words (don't care)
          continue
          
      embedding_vector = embeddings_index.get(word)
      
      if (embedding_vector is not None):
          # Assign the ith elmenet of the embedding matrix to the embedding of that word
          embedding_matrix[i] = embedding_vector
      else:
          words_not_found.append(word)
          
  print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

  return embedding_matrix

In [122]:
# Vectorize the text (return document x length matrix)
train_vectorized, tokenizer, max_length = vectorize_text(train['Quote'])

test_vectorized = tokenizer.texts_to_sequences(test['Quote'])
test_vectorized = pad_sequences(test_vectorized, maxlen = max_length)

# Get the embedding matrix of the words
embedding_matrix = get_embedding_matrix(tokenizer)
num_words = embedding_matrix.shape[0]

number of null word embeddings: 5586


# Tuning Deep Learning Model

Reference: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

In [133]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, LSTM, GRU, SpatialDropout1D, Bidirectional, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import SGD, Adam
from tensorboard.plugins.hparams import api as hp
from keras.regularizers import l2

from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

In [134]:
# Establish X and y data
X_train = train_vectorized
X_test = test_vectorized

y_train = train['top25pct'].to_numpy()
y_test = test['top25pct'].to_numpy()

print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)

Shape of X_train:  (84710, 373)
Shape of y_train:  (84710,)
Shape of X_test:  (21246, 373)
Shape of y_test:  (21246,)


In [140]:
# Untuned RNN model
def RNN_Model(learning_rate, dropout):
    
    text_sequence = Input(shape = (max_length,), name = 'text_sequence_input')
    rnn_layer = Embedding(num_words, EMBEDDING_DIM, weights = [embedding_matrix], trainable = False, name = 'embedding')(text_sequence)
    rnn_layer = LSTM(units = 32, dropout = dropout)(rnn_layer)
    rnn_layer = Dense(32, activation = 'relu')(rnn_layer)
    output = Dense(1, name = 'output')(rnn_layer)
    model = Model(inputs = text_sequence, outputs = output)
    model.compile(loss = keras.losses.BinaryCrossentropy(from_logits = True), optimizer = Adam(learning_rate = learning_rate), metrics = ['accuracy'])
    
    return model

In [None]:
# create model
model = KerasClassifier(build_fn=RNN_Model, epochs=15, batch_size=32, verbose=1)

# define the grid search parameters
learning_rate = [0.005, 0.015]
dropout = [0.2, 0.5]
param_grid = dict(learning_rate=learning_rate, dropout=dropout)
grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
grid_result = grid.fit(X_train, y_train)

# summarize results
print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [None]:
# LEARNING_RATE = [0.005, 0.015]
# DROPOUT = [0.2, 0.5]
# for learning_rate in LEARNING_RATE:
#     for dropout in DROPOUT:
#         model = RNN_Model(learning_rate, dropout)
#         model.summary()
#         print("learning rate: " + str(learning_rate))
#         print("dropout: " + str(dropout))
#         print("-----------------------------------------------------------------")
#         history = model.fit(X_train, y_train, batch_size = 32, epochs = 20, verbose = 1)

# Training Deep Learning Model

In [147]:
# Tuned RNN model
def RNN_Model():
    
    text_sequence = Input(shape = (max_length,), name = 'text_sequence_input')
    rnn_layer = Embedding(num_words, EMBEDDING_DIM, weights = [embedding_matrix], trainable = False, name = 'embedding')(text_sequence)
    rnn_layer = LSTM(units = 32, dropout = 0.2)(rnn_layer)
    rnn_layer = Dense(32, activation = 'relu')(rnn_layer)
    output = Dense(1, name = 'output')(rnn_layer)
    model = Model(inputs = text_sequence, outputs = output)
    model.compile(loss = keras.losses.BinaryCrossentropy(from_logits = True), optimizer = Adam(learning_rate = 0.005), metrics = ['accuracy'])
    
    return model

In [148]:
model = RNN_Model()
model.summary()
history = model.fit(X_train, y_train, batch_size = 32, epochs = 20, verbose = 1)

Model: "model_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_sequence_input (InputLa [(None, 373)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 373, 100)          8911900   
_________________________________________________________________
lstm_9 (LSTM)                (None, 32)                17024     
_________________________________________________________________
dense_9 (Dense)              (None, 32)                1056      
_________________________________________________________________
output (Dense)               (None, 1)                 33        
Total params: 8,930,013
Trainable params: 18,113
Non-trainable params: 8,911,900
_________________________________________________________________
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20

# Evaluating Model Results

In [153]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [149]:
# Evaluate on test set
results = model.evaluate(X_test, y_test)



In [150]:
y_test_probs = model.predict(X_test)
y_test_preds = (y_test_probs > 0.5).astype(int)

In [154]:
def get_classification_metrics(actual, pred):
  print(confusion_matrix(actual, pred))
  print('Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}'.format(
      accuracy_score(actual, pred),
      precision_score(actual, pred),
      recall_score(actual, pred),
      f1_score(actual, pred)))

In [155]:
get_classification_metrics(y_test, y_test_preds)

[[8670 1953]
 [6663 3960]]
Accuracy: 0.5944648404405535, Precision: 0.669710806697108, Recall: 0.3727760519627224, F1 Score: 0.47895500725689405
