# Preprocessing

In [4]:
import nltk
# Fetch the NLTK data packages this notebook relies on: 'punkt' for the
# word_tokenize call and 'stopwords' for stopwords.words('english'), both used
# in preprocessing_2.
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [5]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tqdm import tqdm

import re
from nltk.tokenize import word_tokenize
import string
from nltk.corpus import stopwords

In [6]:
# tf.test.is_gpu_available() is deprecated (see the warning it prints); list the
# physical GPU devices instead and reduce to a bool so the cell still shows True/False.
len(tf.config.list_physical_devices('GPU')) > 0

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.


True

In [7]:
# Mount Google Drive (Colab only) so the data files can be read.
# NOTE(review): later cells use the relative path 'drive/MyDrive/...', which only
# resolves when the working directory is /content - presumably the Colab default; confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [8]:
# Load the train and test quote tables from the mounted Drive folder.
train = pd.read_csv('drive/MyDrive/train_quotes.csv')
test = pd.read_csv('drive/MyDrive/test_quotes.csv')

In [9]:
# Remove the two 'Unnamed' columns (presumably saved indexes from earlier CSV
# round-trips). errors='ignore' makes the cell idempotent: re-running it after
# the columns are already gone no longer raises KeyError.
INDEX_COLUMNS = ['Unnamed: 0', 'Unnamed: 0.1']
train = train.drop(columns = INDEX_COLUMNS, errors = 'ignore')
test = test.drop(columns = INDEX_COLUMNS, errors = 'ignore')

In [10]:
train.head(1)

Unnamed: 0,Quote,Likes
0,"She turned to look at him, and he was already ...",210


In [11]:
test.head(1)

Unnamed: 0,Quote,Likes
0,"Beyond work and love, I would add two other in...",350


In [12]:
# Preprocessing - delete non-alphabet symbols, then drop length-1 words
def preprocessing(quotes):
    """Clean raw quote strings character by character.

    For every quote, each character that is not a-z, A-Z or a space is deleted
    (removed outright, not replaced by a space). The remainder is split on
    whitespace, words of length 1 are discarded, and the survivors are joined
    back together with single spaces.

    Returns a list of cleaned strings in the same order as `quotes`.
    """

    def _clean(text):
        # Keep only ASCII letters and spaces.
        letters_only = re.sub('[^a-zA-Z ]', '', text)
        # Keep words longer than one character (e.g. gb, bb); single letters go.
        kept = filter(lambda token: len(token) > 1, letters_only.split())
        return ' '.join(kept)

    return [_clean(text) for text in tqdm(quotes)]

In [13]:
# First cleaning pass: overwrite 'Quote' with the letters-and-spaces-only strings
# returned by preprocessing (the column still holds one string per row).
train['Quote'] = preprocessing(train['Quote'])
test['Quote'] = preprocessing(test['Quote'])

100%|██████████| 56617/56617 [00:00<00:00, 89198.01it/s]
100%|██████████| 14155/14155 [00:00<00:00, 91637.35it/s]


In [14]:
def preprocessing_2(quotes):
    """Tokenize quotes into lowercase, alphabetic, non-stopword tokens.

    Each quote is split with word_tokenize, lowercased, stripped of every
    character in string.punctuation, reduced to purely alphabetic tokens, and
    finally filtered against the English stopword list.

    Args:
        quotes: iterable of quote strings.

    Returns:
        A list holding one list of token strings per input quote, in input order.
    """
    # Loop-invariant helpers: build them once instead of once per quote
    # (rebuilding the stopword set for every quote dominated the runtime).
    punctuation_table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    processed_quotes = []

    for quote in tqdm(quotes):
        # Lowercase each token, then delete its punctuation characters
        stripped = [w.lower().translate(punctuation_table) for w in word_tokenize(quote)]
        # Keep alphabetic tokens only (this also drops tokens emptied by the
        # punctuation strip) and filter out stopwords
        words = [w for w in stripped if w.isalpha() and w not in stop_words]

        processed_quotes.append(words)

    return processed_quotes

In [15]:
# Second pass: replace each cleaned quote string with its list of tokens.
# NOTE: not idempotent - re-running this cell feeds token lists (not strings)
# back into word_tokenize; re-run from the data-loading cell instead.
train['Quote'] = preprocessing_2(train['Quote'])
test['Quote'] = preprocessing_2(test['Quote'])

100%|██████████| 56617/56617 [00:22<00:00, 2511.07it/s]
100%|██████████| 14155/14155 [00:05<00:00, 2525.10it/s]


In [16]:
# Shuffle the test rows, then give both frames a fresh 0..n-1 index
# (the index reset was flagged as very important by the original author).
# A fixed random_state makes the shuffle reproducible across kernel restarts.
test = test.sample(frac = 1, random_state = 42)
test = test.reset_index(drop = True)
train = train.reset_index(drop = True)

# Fitting Word Embeddings

In [17]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [18]:
# Extract the embeddings from the stored file into a dict mapping word -> vector.
# Each usable line is expected to look like "<word> <v1> ... <vN>" with N = EMBEDDING_DIM.
# Embedding is size 111k (# words) x 100 (dimensions)
import os 

EMBEDDING_DIM = 100

embeddings_index = {}
# `with` guarantees the file is closed even if parsing fails part-way through.
with open(os.path.join('drive/MyDrive/word2vec_train.txt'), encoding = 'utf-8') as f:
    for line in f:
        values = line.split()
        # Skip blank lines and anything that is not "word + EMBEDDING_DIM numbers"
        # (e.g. a "<vocab_size> <dim>" header line, if the file has one); such
        # entries could never be copied into an EMBEDDING_DIM-wide matrix row.
        if len(values) != EMBEDDING_DIM + 1:
            continue
        # Parse to floats here instead of storing an array of strings.
        embeddings_index[values[0]] = np.asarray(values[1:], dtype = 'float32')

In [19]:
def vectorize_text(content):
  """Fit a Tokenizer on `content` and encode it as a padded integer matrix.

  Args:
    content: iterable of texts to fit the tokenizer on and then encode.

  Returns:
    (vectorized_text, tokenizer_obj, max_length): the padded sequences (one row
    per text), the fitted Tokenizer, and the length every row was padded to.
  """
  # Build the vocabulary from the texts
  tokenizer_obj = Tokenizer()
  tokenizer_obj.fit_on_texts(content)

  # Encode every text as a sequence of integer word ids
  sequences = tokenizer_obj.texts_to_sequences(content)

  # Pad to the longest *encoded* sequence. Measuring `sequences` instead of
  # `content` keeps this correct when `content` holds raw strings, whose len()
  # is a character count rather than a token count.
  max_length = max(len(s) for s in sequences)

  # Pad the sequences
  vectorized_text = pad_sequences(sequences, maxlen = max_length)

  return vectorized_text, tokenizer_obj, max_length

In [20]:
def get_embedding_matrix(tokenizer_obj, EMBEDDING_DIM = 100):
  """Build the embedding matrix for the tokenizer's vocabulary.

  Row i receives the vector stored in the module-level `embeddings_index` for
  the word whose index in `tokenizer_obj.word_index` is i. Rows for words with
  no stored vector, and any row no word maps to, stay all-zero.

  Args:
    tokenizer_obj: object exposing a `word_index` mapping of word -> int index.
    EMBEDDING_DIM: width of each embedding vector.

  Returns:
    Array of shape (len(word_index) + 1, EMBEDDING_DIM).
  """
  word_index = tokenizer_obj.word_index

  # One extra row so that an index equal to len(word_index) is still valid
  num_words = len(word_index) + 1

  # Create the embedding matrix - map embeddings from word2vec model for each word
  embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))

  for word, i in word_index.items():
    # Valid rows are 0..num_words-1; the previous `i > num_words` test let
    # i == num_words through, which would index past the end of the matrix.
    if i >= num_words:
      continue

    embedding_vector = embeddings_index.get(word)

    if embedding_vector is not None:
      # Assign the ith row of the embedding matrix to the embedding of that word
      embedding_matrix[i] = embedding_vector

  # Counts every all-zero-sum row, so never-assigned rows (such as row 0 when
  # no word has index 0) are included in this number.
  print('number of null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))

  return embedding_matrix

In [21]:
# Vectorize the text (return document x length matrix).
# The tokenizer is fitted on the training quotes only, and max_length is taken from them.
train_vectorized, tokenizer, max_length = vectorize_text(train['Quote'])

# Encode the test quotes with the train-fitted tokenizer and pad them to the same length.
# NOTE(review): words unseen in training and tokens beyond max_length are presumably
# dropped by the Keras defaults - confirm if that matters for evaluation.
test_vectorized = tokenizer.texts_to_sequences(test['Quote'])
test_vectorized = pad_sequences(test_vectorized, maxlen = max_length)

# Get the embedding matrix of the words (one row per tokenizer index)
embedding_matrix = get_embedding_matrix(tokenizer)
num_words = embedding_matrix.shape[0]

number of null word embeddings: 5645


# Training Deep Learning Model

In [78]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Embedding, LSTM, GRU, SpatialDropout1D, Bidirectional, Dropout, BatchNormalization
from keras.layers.embeddings import Embedding
from keras.initializers import Constant
from keras.optimizers import SGD, Adam
from tensorboard.plugins.hparams import api as hp
from keras.regularizers import l2

In [79]:
# Original RNN Model
def RNN_Model():
    """Build the uncompiled LSTM regressor on top of the frozen embeddings.

    Reads the module-level `max_length`, `num_words`, `EMBEDDING_DIM` and
    `embedding_matrix`.

    Layers, in order: Input of length max_length -> non-trainable Embedding
    initialised from embedding_matrix -> LSTM(32, dropout 0.1) ->
    Dense(32, relu) -> Dense(1) with no activation.

    Returns the assembled Model.
    """
    inputs = Input(shape = (max_length,), name = 'text_sequence_input')

    # Apply each layer as soon as it is created, keeping the build order of the
    # layers (and therefore their auto-generated names) unchanged.
    embedded = Embedding(
        num_words,
        EMBEDDING_DIM,
        weights = [embedding_matrix],
        trainable = False,
        name = 'embedding',
    )(inputs)
    encoded = LSTM(units = 32, dropout = 0.1)(embedded)
    hidden = Dense(32, activation = 'relu')(encoded)
    prediction = Dense(1, name = 'output')(hidden)

    return Model(inputs = inputs, outputs = prediction)

In [80]:
# Instantiate the network and print its layer-by-layer parameter summary.
model = RNN_Model()
model.summary()

Model: "model_7"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
text_sequence_input (InputLa [(None, 373)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 373, 100)          8969000   
_________________________________________________________________
lstm_11 (LSTM)               (None, 32)                17024     
_________________________________________________________________
dense_14 (Dense)             (None, 32)                1056      
_________________________________________________________________
output (Dense)               (None, 1)                 33        
Total params: 8,987,113
Trainable params: 18,113
Non-trainable params: 8,969,000
_________________________________________________________________


In [81]:
# Establish X and y data: X is the padded id matrix, y is the 'Likes' column
X_train, X_test = train_vectorized, test_vectorized
y_train, y_test = (frame['Likes'].to_numpy() for frame in (train, test))

# Report the shape of each array so row counts can be checked at a glance
for _label, _array in (
    ('Shape of X_train: ', X_train),
    ('Shape of y_train: ', y_train),
    ('Shape of X_test: ', X_test),
    ('Shape of y_test: ', y_test),
):
    print(_label, _array.shape)

Shape of X_train:  (56617, 373)
Shape of y_train:  (56617,)
Shape of X_test:  (14155, 373)
Shape of y_test:  (14155,)


In [87]:
# Regression setup: mean squared error against the 'Likes' target, optimised with
# Adam at learning rate 0.001. No extra metrics are requested.
model.compile(loss = keras.losses.mean_squared_error, optimizer = Adam(learning_rate = 0.001))

In [88]:
# Train for 20 epochs in batches of 32. No validation data is passed, so `history`
# only tracks training loss and overfitting is not monitored during training.
history = model.fit(X_train, y_train, batch_size = 32, epochs = 20, verbose = 1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [89]:
# Evaluate on test set. The model was compiled with a loss and no metrics, so
# `results` should be the test MSE alone - TODO confirm against the cell output.
results = model.evaluate(X_test, y_test)

