# Preprocessing

In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm
from sklearn.utils import resample

import re
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

In [3]:
tf.test.is_gpu_available()

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
train = pd.read_csv('drive/MyDrive/train_quotes.csv')
test = pd.read_csv('drive/MyDrive/test_quotes.csv')

In [6]:
train = train.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])
test = test.drop(columns = ['Unnamed: 0', 'Unnamed: 0.1'])

In [7]:
train['top25pct'] = (train['Likes'] >= 89).astype(int)
test['top25pct'] = (test['Likes'] >= 89).astype(int)

In [8]:
train.head()

Unnamed: 0,Quote,Author,Likes,top25pct
0,Politics is a fair and good enough profession ...,Nurudeen Ushawu,1,0
1,"This is the legend of Cassius Clay, The most b...",Muhammad Ali,59,0
2,The cosmos is within us. We are made of star-s...,Carl Sagan,927,1
3,"A poet is, before anything else, a person who ...",W.H. Auden,550,1
4,"When I have a little money, I buy books; and i...",Desiderius Erasmus Roterodamus,8459,1


In [9]:
test.head()

Unnamed: 0,Quote,Author,Likes,top25pct
0,You might asked why I loved you.For the same r...,Tatjana Ostojic,2,0
1,..time is always the price we pay for the unli...,André Aciman,60,0
2,"As above so below,can I, with you, go? Alwayst...",Lavinia Valeriana,0,0
3,The ocean pulsed outside our window. The sound...,Chelsie Shakespeare,11,0
4,"Penyakit yang Menghambat Dunia Islam :Pertama,...",Habiburrahman El Shirazy,0,0


In [10]:
def upsample_minority(df):

  # Upsample minority class in both the training and test data
  df_majority = df.loc[df['top25pct'] == 0, :]
  df_minority = df.loc[df['top25pct'] == 1, :]
  df_minority_upsampled = resample(df_minority, replace = True, n_samples = len(df_majority), random_state = 42)

  # Combine together to get the upsampled training data
  df = pd.concat([df_majority, df_minority_upsampled])

  return df

In [11]:
# Upsample the minority class
train = upsample_minority(train)
test = upsample_minority(test)

In [12]:
train.head()

Unnamed: 0,Quote,Author,Likes,top25pct
0,Politics is a fair and good enough profession ...,Nurudeen Ushawu,1,0
1,"This is the legend of Cassius Clay, The most b...",Muhammad Ali,59,0
5,The Second Koran tells us that the darkness in...,Maureen F. McHugh,4,0
6,"A powerful woman can stand, even after a fall....",Gift Gugu Mona,0,0
7,Aye. And I can do without a viper-tongued wenc...,Jennifer La Brecque,1,0


In [13]:
test.head()

Unnamed: 0,Quote,Author,Likes,top25pct
0,You might asked why I loved you.For the same r...,Tatjana Ostojic,2,0
1,..time is always the price we pay for the unli...,André Aciman,60,0
2,"As above so below,can I, with you, go? Alwayst...",Lavinia Valeriana,0,0
3,The ocean pulsed outside our window. The sound...,Chelsie Shakespeare,11,0
4,"Penyakit yang Menghambat Dunia Islam :Pertama,...",Habiburrahman El Shirazy,0,0


In [14]:
# Preprocessing - to remove length-1 words, and remove non-alphabet symbols
def preprocessing(quotes):

    processed_quotes = []
    
    for quote in tqdm(quotes):
        
        # remove other non-alphabets symbols with space (i.e. keep only alphabets and whitespaces).
        processed = re.sub('[^a-zA-Z ]', '', quote)
        
        words = processed.split()
        
        # keep words that have length of more than 1 (e.g. gb, bb), remove those with length 1.
        processed_quotes.append(' '.join([word for word in words if len(word) > 1]))
    
    return processed_quotes

In [15]:
train['Quote'] = preprocessing(train['Quote'])
test['Quote'] = preprocessing(test['Quote'])

100%|██████████| 85242/85242 [00:01<00:00, 73815.23it/s]
100%|██████████| 21298/21298 [00:00<00:00, 79359.98it/s]


In [16]:
def preprocessing_2(quotes):
    
    processed_quotes = []

    for quote in tqdm(quotes):
        tokens = word_tokenize(quote)

        # Convert to lower case
        tokens = [w.lower() for w in tokens]

        # Remove punctuation
        table = str.maketrans('', '', string.punctuation)
        stripped = [w.translate(table) for w in tokens]

        # Remove remaining tokens that are not alphabetic
        words = [word for word in stripped if word.isalpha()]
        
        # Filter out stopwords
        stop_words = set(stopwords.words('english'))
        words = [w for w in words if not w in stop_words]
        
        processed_quotes.append(words)
        
    return processed_quotes

In [17]:
train['Quote'] = preprocessing_2(train['Quote'])
test['Quote'] = preprocessing_2(test['Quote'])

100%|██████████| 85242/85242 [00:37<00:00, 2289.27it/s]
100%|██████████| 21298/21298 [00:09<00:00, 2264.30it/s]


In [18]:
# Shuffle test again, and reset index (very important!!!)
test = test.sample(frac = 1)
test = test.reset_index(drop = True)
train = train.reset_index(drop = True)

# Fitting Word Embeddings

In [19]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [20]:
# Extract the embeddings from the stored file
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

EMBEDDING_DIM = 100

embeddings_index = {}
f = open(os.path.join('drive/MyDrive/word2vec_train.txt'), encoding = 'utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:])
    embeddings_index[word] = coefs
f.close()

In [21]:
def vectorize_text(content):

  # Vectorize the text samples into 2D integer tensor - max length 16 words
  tokenizer_obj = Tokenizer()

  # Fit the tokenizer on the text
  tokenizer_obj.fit_on_texts(content)

  # Generate the sequence of tokens
  sequences = tokenizer_obj.texts_to_sequences(content)

  # Get the max length of each quote
  max_length = max([len(s) for s in content])
  
  # Pad the sequences
  vectorized_text = pad_sequences(sequences, maxlen = max_length)

  return vectorized_text, tokenizer_obj, max_length

In [22]:
def get_embedding_matrix(tokenizer_obj, EMBEDDING_DIM = 100):
 
  word_index = tokenizer_obj.word_index

  num_words = len(word_index) + 1
  words_not_found = []
  
  # Create the emedding matrix - map embeddings from word2vec model for each word and create matrix of word vectors
  embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

  for word, i in word_index.items():
      if i > num_words: # Least common words (don't care)
          continue
          
      embedding_vector = embeddings_index.get(word)
      
      if (embedding_vector is not None):
          # Assign the ith elmenet of the embedding matrix to the embedding of that word
          embedding_matrix[i] = embedding_vector
      else:
          words_not_found.append(word)
          
  print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

  return embedding_matrix

In [23]:
# Vectorize the text (return document x length matrix)
train_vectorized, tokenizer, max_length = vectorize_text(train['Quote'])

test_vectorized = tokenizer.texts_to_sequences(test['Quote'])
test_vectorized = pad_sequences(test_vectorized, maxlen = max_length)

# Get the embedding matrix of the words
embedding_matrix = get_embedding_matrix(tokenizer)
num_words = embedding_matrix.shape[0]

number of null word embeddings: 31


# Tuning Deep Learning Model

Reference: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/

https://towardsdatascience.com/combining-numerical-and-text-features-in-deep-neural-networks-e91f0237eea4

In [24]:
!pip install category_encoders

Collecting category_encoders
[?25l  Downloading https://files.pythonhosted.org/packages/44/57/fcef41c248701ee62e8325026b90c432adea35555cbc870aff9cfba23727/category_encoders-2.2.2-py2.py3-none-any.whl (80kB)
[K     |████                            | 10kB 26.0MB/s eta 0:00:01[K     |████████▏                       | 20kB 21.2MB/s eta 0:00:01[K     |████████████▏                   | 30kB 11.4MB/s eta 0:00:01[K     |████████████████▎               | 40kB 9.4MB/s eta 0:00:01[K     |████████████████████▎           | 51kB 9.1MB/s eta 0:00:01[K     |████████████████████████▍       | 61kB 9.4MB/s eta 0:00:01[K     |████████████████████████████▍   | 71kB 9.7MB/s eta 0:00:01[K     |████████████████████████████████| 81kB 5.5MB/s 
Installing collected packages: category-encoders
Successfully installed category-encoders-2.2.2


In [25]:
import keras
from keras.models import Sequential, Model
from keras.layers import Concatenate, Input, Dense, Embedding, LSTM, GRU, SpatialDropout1D, Bidirectional, Dropout, BatchNormalization, concatenate
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import SGD, Adam
from tensorboard.plugins.hparams import api as hp
from keras.regularizers import l2

from category_encoders import TargetEncoder
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier

  import pandas.util.testing as tm


In [26]:
# Establish X and y data
X_train = train_vectorized
X_test = test_vectorized

encoder = TargetEncoder()
X_train_authors = encoder.fit_transform(train['Author'], train['Likes'])
X_test_authors = encoder.fit_transform(test['Author'], test['Likes'])

y_train = train['top25pct'].to_numpy()
y_test = test['top25pct'].to_numpy()

print('Shape of X_train: ', X_train.shape)
print('Shape of y_train: ', y_train.shape)
print('Shape of X_test: ', X_test.shape)
print('Shape of y_test: ', y_test.shape)

Shape of X_train:  (85242, 367)
Shape of y_train:  (85242,)
Shape of X_test:  (21298, 367)
Shape of y_test:  (21298,)


  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [27]:
# Untuned RNN model
# def RNN_Model(learning_rate, dropout):
    
#     text_sequence = Input(shape = (max_length,), name = 'text_sequence_input')
#     meta_input = Input(shape=(1,))
#     rnn_layer = Embedding(num_words, EMBEDDING_DIM, weights = [embedding_matrix], trainable = False, name = 'embedding')(text_sequence)
#     rnn_layer = LSTM(units = 32, dropout = dropout)(rnn_layer)
#     concat = concatenate([rnn_layer, meta_input])
#     rnn_layer = Dense(32, activation = 'relu')(concat)
#     output = Dense(1, name = 'output')(rnn_layer)
#     model = Model(inputs = [text_sequence, meta_input], outputs = output)
#     model.compile(loss = keras.losses.BinaryCrossentropy(from_logits = True), optimizer = Adam(learning_rate = learning_rate), metrics = ['accuracy'])
    
#     return model

In [28]:
# # create model
# model = KerasClassifier(build_fn=RNN_Model, epochs=15, batch_size=32, verbose=1)

# # define the grid search parameters
# learning_rate = [0.005, 0.015]
# dropout = [0.2, 0.5]
# param_grid = dict(learning_rate=learning_rate, dropout=dropout)
# grid = GridSearchCV(estimator=model, param_grid=param_grid, cv=3)
# grid_result = grid.fit([X_train, X_train_authors], y_train)

# # summarize results
# print("Best score: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

In [29]:
# LEARNING_RATE = [0.005, 0.015]
# DROPOUT = [0.2, 0.5]
# for learning_rate in LEARNING_RATE:
#     for dropout in DROPOUT:
#         model = RNN_Model(learning_rate, dropout)
#         model.summary()
#         print("learning rate: " + str(learning_rate))
#         print("dropout: " + str(dropout))
#         print("-----------------------------------------------------------------")
#         history = model.fit(X_train, y_train, batch_size = 32, epochs = 20, verbose = 1)

# Training Deep Learning Model

In [41]:
# Tuned RNN model
def RNN_Model():
    
    text_sequence = Input(shape = (max_length,), name = 'text_sequence_input')
    meta_input = Input(shape=(1,))
    rnn_layer = Embedding(num_words, EMBEDDING_DIM, weights = [embedding_matrix], trainable = True, name = 'embedding')(text_sequence)
    rnn_layer = LSTM(units = 32, dropout = 0.6)(rnn_layer)
    concat = concatenate([rnn_layer, meta_input])
    rnn_layer = Dense(32, activation = 'tanh')(concat)
    output = Dense(1, name = 'output')(rnn_layer)
    model = Model(inputs = [text_sequence, meta_input], outputs = output)
    model.compile(loss = keras.losses.BinaryCrossentropy(from_logits = True), optimizer = Adam(learning_rate = 0.0005), metrics = ['accuracy'])
    
    return model

In [42]:
model = RNN_Model()
model.summary()
history = model.fit([X_train, X_train_authors], y_train, batch_size = 32, epochs = 12, verbose = 1)

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
text_sequence_input (InputLayer [(None, 367)]        0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 367, 100)     8852800     text_sequence_input[0][0]        
__________________________________________________________________________________________________
lstm_3 (LSTM)                   (None, 32)           17024       embedding[0][0]                  
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
____________________________________________________________________________________________

# Evaluating Model Results

In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [43]:
# Evaluate on test set
results = model.evaluate([X_test, X_test_authors], y_test)



In [44]:
y_test_probs = model.predict([X_test, X_test_authors])
y_test_preds = (y_test_probs > 0.5).astype(int)

In [45]:
def get_classification_metrics(actual, pred):
  print(confusion_matrix(actual, pred))
  print('Accuracy: {}, Precision: {}, Recall: {}, F1 Score: {}'.format(
      accuracy_score(actual, pred),
      precision_score(actual, pred),
      recall_score(actual, pred),
      f1_score(actual, pred)))

In [46]:
get_classification_metrics(y_test, y_test_preds)

[[8830 1819]
 [5580 5069]]
Accuracy: 0.6525964879331393, Precision: 0.735917537746806, Recall: 0.4760071368203587, F1 Score: 0.5780920339852882
