In [1]:

!wget -q https://github.com/CISC-372/Notebook/releases/download/a4/test.csv
!wget -q https://github.com/CISC-372/Notebook/releases/download/a4/train.csv

In [2]:
# comment your understanding of each function 
import pandas as pd
import csv


xy_train_df = pd.read_csv('train.csv')
x_test_df  = pd.read_csv('test.csv', index_col='id')


xy_train_df['length'] = xy_train_df.apply(lambda x: len(x.review), axis=1)
xy_train_df = xy_train_df.sort_values('length')
xy_train_df

Unnamed: 0,id,rating,review,length
6037,2596,1,Five Stars_GOOD,15
5353,4643,1,Love it_Love it,15
2545,8791,1,Five Stars_Good,15
3902,6098,1,Five Stars_love!,16
2850,4609,1,love these_so cute!,19
...,...,...,...,...
5651,518,1,"So far, it's awesome_Ok, so I'll say up front ...",5765
1615,124,1,It Works (Read Tips For Potential Effectivenes...,6740
5046,7257,1,An exquisitely effective product with an astou...,8082
4859,7555,1,Gorgeous professional looking manicure at home...,8134


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import nltk
nltk.download('punkt')
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import PorterStemmer

vocab_size = 10000 # change vocabulary size to 30000, then to 40000
max_len = 288     # change max_len to 512, 256, 320

xy_train, xy_validation = train_test_split(
    xy_train_df, test_size=0.2)

# build vocabulary from the training set
tokenizer = Tokenizer(num_words=vocab_size,) 
tokenizer.fit_on_texts(xy_train.review) 


def _preprocess(texts):
  # Note: the proprecessing techniques are added in 2nd tuning of GRU model (and is removed when training bidirectional GRU)
  texts_ap=[] # initialize the list that will contains the samples after implementing the preprocessing techniques
  # Added preprocessing for each sample in the argument 'texts' through a for loop
  ps = PorterStemmer()    # intialize the stemmer to do the stemming
  stop_words = set(stopwords.words('english'))  # get the set of English stopwords from the nltk package 

#  for i in range(0, texts.size):
#    review = re.sub('[^a-zA-Z]', ' ', texts.iloc[i])  # replace all the characters of the current text sample that are not in the alphabet with a empty space ' '
   # review = review.lower() # added case-normalization in 8th tuning of GRU model: Convert all English characters in the current sample into lower-case letters
                             # removed in 9th tuning of GRU model as it deteriorates the model performace 
#    review = review.split() # split the current text sample into list of words, so the preprocessing techniques can be implemented   
#    review = [w for w in review if not w in stop_words] # Preprocessing technique: 'stopwords removal': remove all stopwords from the current text sample
   # review = [ps.stem(w) for w in review]   # Preprocessing technique: 'stemming': perform stemming for each word/token in the current text sample
                             # removed in 10th tuning of GRU model as it deteriorates the model performace 
#    review = ' '.join(review)   # join the individual tokens/words back into a sentence (with space between each word/token) 
#    texts_ap.append(review) # append the processed text sample into the list 'texts_ap'
  
  return pad_sequences(
      tokenizer.texts_to_sequences(texts), #texts_ap
      maxlen=max_len, 
      padding='post'
  )


x_train = _preprocess(xy_train.review)
y_train = xy_train.rating

x_valid = _preprocess(xy_validation.review)
y_valid = xy_validation.rating

x_test = _preprocess(x_test_df.review)

print(x_train.shape)
print(x_valid.shape)
print(x_test.shape)
tokenizer.get_config() # returns the tokenizer configuration as Python Dictionary (including 'word_counts', which counting the frequency of occurrence of each word in the training set)





In [4]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense
from keras import backend as K

import tensorflow as tf
from tensorflow.keras.optimizers import Adam

# comment your understanding of each line and 
# the output shape of each line below. for each dimensionality, explains its 
# meaning. (e.g. None is the batch size)


# since Keras 2.0 metrics f1, precision, and recall have been removed, we have to code the f1 metric function for the evaluation of the model performance 
# Below is a custom f1 metric function retrieved from https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

x = keras.Input((max_len)) # input layer

embeded = keras.layers.Embedding(vocab_size, 100)(x) # embedding layer

averaged = tf.reduce_mean(embeded, axis=1) # average layer to reduce the rank-3 tensor into a 2d matrix

# multiple Dense layer, Fully-Connected NN
averaged5 = Dense(128,activation=None)(averaged)

averaged6 = Dense(256,activation='relu')(averaged5)

averaged2 = Dense(512,activation='relu')(averaged6)

averaged4 = Dense(256,activation='relu')(averaged2)

averaged3 = Dense(128,activation='relu')(averaged4)

pred = keras.layers.Dense(1, activation=tf.nn.sigmoid)(averaged3) # output layer

model = keras.Model(x, pred)

# such design of the model gives 93.2% on the Kaggle public leaderboard

model.compile(
    optimizer=Adam(clipnorm=None),
    loss='binary_crossentropy',
    metrics=['accuracy',f1])

model.summary() # print out a summary table of the model structure. 


Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 288)]             0         
_________________________________________________________________
embedding (Embedding)        (None, 288, 100)          1000000   
_________________________________________________________________
tf.math.reduce_mean (TFOpLam (None, 100)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               12928     
_________________________________________________________________
dense_1 (Dense)              (None, 256)               33024     
_________________________________________________________________
dense_2 (Dense)              (None, 512)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 256)               131328

In [5]:
# use Callback() method to save the the model weights that yield the highest validation f1
checkpoint_filepath = '/checkpoint' # set the path to save the check_point model parameters
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( # define the Callback() function
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_f1', 
    mode='max',
    save_best_only=True)

history = model.fit(x_train,
                    y_train,
                    epochs=30,
                    batch_size=512,
                    validation_data=(x_valid, y_valid),
                    callbacks=[model_checkpoint_callback],
                    verbose=1)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [6]:

model.evaluate(x_valid, y_valid)



[0.5367026925086975, 0.9333333373069763, 0.9622277617454529]

In [7]:
def predict_class(_dataset):
  classes = model.predict(_dataset) > 0.5
  return np.squeeze(classes * 1) 

y_predict = predict_class(x_valid)

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

0.9333333333333333


In [35]:
# submission
pd.DataFrame(
    {'id': x_test_df.index,
     'rating': predict_class(x_test)}).to_csv('sample_submission.csv', index=False)

# Train a 2-layer GRU Model

In [9]:
from __future__ import absolute_import, division, print_function, unicode_literals
import collections
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import GRU, Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from keras import backend as K

import tensorflow as tf
from tensorflow.keras.optimizers import Adam


# Creates the GRU model
gru = Sequential()
gru.add(Embedding(input_dim = vocab_size, output_dim = 400, input_length=max_len)) # The first layer of the GRU model will always be the embeddinhg layer
gru.add(GRU(units=300,return_sequences=True)) # 'return_sequences=True' is required if we want to build multi-layer RNN, because we need the intermediate GRU layers to output the accumulated memory vectors generated from EVERY time-step of the samples,
                                              # and so the input to the next GRU layer is a rank-3 tensor of the required shape: [batch_size, number_of_time_step, size_of_the_accumulated_memory_vector or hidden_dim]
gru.add(GRU(units=200,return_sequences=True)) # added GRU layer in 6th tuning of GRU model
gru.add(GRU(units=200,))
gru.add(Dense(units=1, activation='sigmoid',trainable=True)) # the output layer of the GRU model


# Note: I define the f1 metric function again in this cell of code in case I (or the TA) jump to run the GRU model code directly
# since Keras 2.0 metrics f1, precision, and recall have been removed, we have to code the f1 metric function for the evaluation of the model performance 
# Below is a custom f1 metric function retrieved from https://stackoverflow.com/questions/43547402/how-to-calculate-f1-macro-in-keras:
def f1(y_true, y_pred):
    def recall(y_true, y_pred):
        """Recall metric.

        Only computes a batch-wise average of recall.

        Computes the recall, a metric for multi-label classification of
        how many relevant items are selected.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
        recall = true_positives / (possible_positives + K.epsilon())
        return recall

    def precision(y_true, y_pred):
        """Precision metric.

        Only computes a batch-wise average of precision.

        Computes the precision, a metric for multi-label classification of
        how many selected items are relevant.
        """
        true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
        predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
        precision = true_positives / (predicted_positives + K.epsilon())
        return precision
    precision = precision(y_true, y_pred)
    recall = recall(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

gru.compile(
    optimizer= Adam(clipnorm=None),
    #tf.keras.optimizers.Adadelta(clipnorm=None), #Adam(clipnorm=None),
    #tf.keras.optimizers.SGD(lr=0.01, decay=1e-7, momentum=0.9, nesterov=False),
    # start with 'clipnorm=None', try change optimizer to 'tf.keras.optimizers.Adadelta(clipnorm=None)'
    loss='binary_crossentropy',
    metrics=['accuracy', f1])
gru.summary() # print out a summary table of the model structure.


Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 288, 400)          4000000   
_________________________________________________________________
gru_3 (GRU)                  (None, 288, 300)          631800    
_________________________________________________________________
gru_4 (GRU)                  (None, 288, 200)          301200    
_________________________________________________________________
gru_5 (GRU)                  (None, 200)               241200    
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 201       
Total params: 5,174,401
Trainable params: 5,174,401
Non-trainable params: 0
_________________________________________________________________


In [10]:
# use Callback() method to save the Keras model or the model weights that yield the highest validation f1
checkpoint_filepath = '/checkpoint' # set the path to save the check_point model parameters
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( # define the Callback() function
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_f1', 
    mode='max',
    save_best_only=True)

history = gru.fit(x_train,
                  y_train,
                  epochs=20, # change to 20
                  batch_size=1024,
                  validation_data=(x_valid, y_valid),
                  callbacks=[model_checkpoint_callback], # implement Callback() in conjunction with '.fit()' to save a model or weights (in a checkpoint file) that yield the highest validation f1 score
                  verbose=1)

# put/load the weights into the GRU model
gru.load_weights(checkpoint_filepath)

# Adjustment log:
# Initial design of the GRU model: Vocabulary size is 10000, 'output_dim' of the embedding layer is 200, 'maxlen' of the 'pad_sequences()' is 256 (so the 'input_length' of the embedding layer is 256), 
# first GRU layer has 'units=200' (so the accumulated memory/knowledge vector has size 200), the second GRU layer has 'units=100', output layer of the GRU model is a Dense layer with only one hidden unit (units=1,activation='sigmoid')
# the optimizer chosen is 'Adam' with 'clipnorm=None', batch size chosen is 128
# Initial result: validation f1 is stabilized around 93% 

# 1st tuning: Change the vocabulary size to 30000, change the max_len of each text sample to 512, change 'output_dim' of the embedding layer to 300
# result: validation f1 is still stabilized around 93% (no improvement)

# 2nd tuning: Added preprocessing techniques (stopword&punctuation removal + stemming)
# result: validation f1 is stabilized around 93.5% (slight improvement on the model performance), which means that the added preprocessing techniques can improve the model performance slightly

# 3rd tuning: change the 'max_len' of each text sample to 256 to see if shorter the maximal length of each sample (less padding) can affect the model performance or not
# result: validation f1 is stabilized around 94% (slight improvement on the model performance), which may indicates that increasing the 'max_len' may deteriorate the model performance 
# result on Leaderboard: we get a score of 87.3% in the Kaggle public leaderboard

# 4th tuning: Change the units of the first GRU layer to 300, Change the units of the second GRU layer to 200, to see if the longer the accumulated memory/knowledge vector size is, the better the model performs
# result: validation f1 is still stabilized around 94% (no improvement on the model performance), which may indicates that increasing the 'units' of the GRU layers or the accumulated memory/knowledge vector sizes may not improve the model performance

# 5th tuning: Change the optimizer to 'AdaDelta()' to see if changing optimizer can improve model performance or not
# result: validation f1 is varying within the range of 93.5% to 94% (no improvement on the model performance)

# 6th tuning: Added one more GRU layer with 'units=200', change 'epochs' to 20, change batch_size to 1024  
# result: validation f1 is still varying within the range of 93.5% to 94% (no improvement on the model performance), which indicates that adding etra GRU layer will not improve the model's performance

# 7th tuning: Change the optimizer to 'SGD with Momentum', change 'output_dim' of the embedding layer to 400
# result: validation f1 is still stabilized around 94% , which may indicate that changing the optimizer or increasing the 'size of the meaning vector associated with each token in the sample' may not improve the model performance

# 8th tuning: change optimizer back to 'Adam', added the case-normalization preprocessing technique (convert all sample English characters into lower-case) to see if model performance will improve or not
# result: validation f1 is stabilized around 93% (decrease in the model performance), which indicates that adding the 'case-normalization' preprocessing technique will actually deteriorate the model performance

# 9th tuning: Remove the 'case-normalization' preprocessing technique, Change the vocabulary size to 40000 to see if increasing the vocabulary dictionary size will improve the model performance or not
# result: validation f1 is still stabilized around 93%, which indicates that increasing the vocabulary dictionary size will not improve the model performance

# 10th tuning: Change the vocabulary size to back to 30000, Remove the 'stemming' preprocessing technique to see if this preprocessing technique actually deteriorates the model performance
# result: validation f1 is stabilized around 95% (improvement on the model performance), which suggests that the 'stemming' preprocessing technique actually deteriorates the model performance

# 11th tuning: Remove the 'stopwords & punctuation removal' preprocessing technique to see if all the preprocessing techniques are actually not needed as they actually deteriorate the model performance
# result: validation f1 is stabilized around 93% (decrease in the model performance), which indicates that the 'stopwords & punctuation removal' preprocessing technique could actually improve the model performance, and so we should add it back 

# Now we can conclude that no matter how we tune the GRU model, the upper limit of this model is around 94% or 95% in the validation f1, which can be considered as adequately satisfactory model performance, and if we want to further improve the model performance, we should try to use a better model architecture such as the bidirectional GRU




Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f3337d79590>

In [11]:
gru.evaluate(x_valid, y_valid)
def predict_class(_dataset):
  classes = gru.predict(_dataset) > 0.5
  return np.squeeze(classes * 1) 

y_predict = predict_class(x_valid)

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

0.8867469879518072


In [39]:
# submission
pd.DataFrame(
    {'id': x_test_df.index,
     'rating': predict_class(x_test)}).to_csv('sample_submission.csv', index=False)

# Train a LSTM Model

In [12]:
from tensorflow.keras.layers import GRU, Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
# Creates the LSTM model
lstm = Sequential()
lstm.add(Embedding(input_dim = vocab_size, output_dim = 300, input_length=max_len)) # The first layer of RNN model will always be the embeddinhg layer
lstm.add(LSTM(units=256,return_sequences=True)) # the first LSTM layer, 'return_sequences=True' is required  
lstm.add(LSTM(units=128)) # the second/final LSTM layer, 'return_sequences=False' is required 
lstm.add(Dense(units=1, activation='sigmoid')) # the output layer of the LSTM model

lstm.compile(
    optimizer= tf.keras.optimizers.Adam(clipnorm=None), #tf.keras.optimizers.SGD(lr=0.01, decay=1e-7, momentum=0.9, nesterov=False),#tf.keras.optimizers.Adadelta(clipnorm=None),
    # start with 'clipnorm=None', try change optimizer to 'tf.keras.optimizers.Adadelta(clipnorm=None)'
    loss='binary_crossentropy',
    metrics=['accuracy', f1])

lstm.summary() # print out a summary table of the model structure.


Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 288, 300)          3000000   
_________________________________________________________________
lstm (LSTM)                  (None, 288, 256)          570368    
_________________________________________________________________
lstm_1 (LSTM)                (None, 128)               197120    
_________________________________________________________________
dense_8 (Dense)              (None, 1)                 129       
Total params: 3,767,617
Trainable params: 3,767,617
Non-trainable params: 0
_________________________________________________________________


In [13]:
# use Callback() method to save the Keras model or the model weights that yield the highest validation f1
checkpoint_filepath = '/checkpoint' # set the path to save the check_point model parameters
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( # define the Callback() function
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_f1', 
    mode='max',
    save_best_only=True)

history = lstm.fit(x_train,
                    y_train,
                    epochs=5,
                    batch_size=1024,
                    validation_data=(x_valid, y_valid),
                    verbose=1,
                    callbacks=[model_checkpoint_callback]
                   )

# put/load the weights into the LSTM model
lstm.load_weights(checkpoint_filepath)

# Adjustment log:
# Initial design of the LSTM model: Vocabulary size is 10000, 'output_dim' of the embedding layer is 200, 'maxlen' of the 'pad_sequences()' is 256 (so the 'input_length' of the embedding layer is 256), 
# first LSTM layer has 'units=256' (so the accumulated memory/knowledge vector has size 256), the second LSTM layer has 'units=128', output layer of the LSTM model is a Dense layer with only one hidden unit (units=1,activation='sigmoid')
# the optimizer chosen is 'Adam' with 'clipnorm=None', batch size chosen is 128, epoch chosen is 5
# Initial result: validation f1 is also stabilized around 93% (similar performance with the GRU model)

# 1st tuning: change 'output_dim' of the embedding layer to 300, change batch_size to 1024
# result: validation f1 is still stabilized around 93% (no improvement on the model performance)
# Since the LSTM model has a similar performance with the GRU model, we change the model architecture to the bidirecional NN architecture now (instead of continuing tuning the LSTM model)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f3335766410>

In [14]:
lstm.evaluate(x_valid, y_valid)
def predict_class(_dataset):
  classes = lstm.predict(_dataset) > 0.5
  return np.squeeze(classes * 1) 

y_predict = predict_class(x_valid)

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

0.8859437751004016


In [43]:
# submission
pd.DataFrame(
    {'id': x_test_df.index,
     'rating': predict_class(x_test)}).to_csv('sample_submission.csv', index=False)

# Train a Bidirectional GRU/LSTM Model with Attention Mechanism

In [15]:
# Attention Mechanism Code is directly retrieved from: https://stackoverflow.com/questions/62948332/how-to-add-attention-layer-to-a-bi-lstm
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Layer
from tensorflow.keras import backend as K
class Attention(Layer):
    
    def __init__(self, return_sequences=True):
        self.return_sequences = return_sequences
        super(Attention,self).__init__()
        
    def build(self, input_shape):
        
        self.W=self.add_weight(name="att_weight", shape=(input_shape[-1],1),
                               initializer="normal")
        self.b=self.add_weight(name="att_bias", shape=(input_shape[1],1),
                               initializer="zeros")
        
        super(Attention,self).build(input_shape)
        
    def call(self, x):
        
        e = K.tanh(K.dot(x,self.W)+self.b)
        a = K.softmax(e, axis=1)
        output = x*a
        
        if self.return_sequences:
            return output
        
        return K.sum(output, axis=1)

In [16]:
from tensorflow.keras.layers import GRU, Embedding, Dense, LSTM, Bidirectional,Dropout, TimeDistributed
from tensorflow.keras.models import Sequential

# Creates the Bidirectional NN model
bidir = Sequential()
bidir.add(Embedding(input_dim = vocab_size, output_dim = 300, input_length=max_len))  # The first layer of RNN model will always be the embeddinhg layer

forward_layer = LSTM(units=100,return_sequences=True) # create the forward layer for the bidrectional RNN

# The original first bidirectional layer
bidir.add(Bidirectional(forward_layer, )) # Note: No need to specify the 'backward_layer' as this API will automatically create the 'backward_layer' by duplicating the (hyper-)parameters of the 'forward_layer' we specified

bidir.add(Dropout(0.2)) # dropout layer added to prevent overfitting
bidir.add(Bidirectional(LSTM(units=100,return_sequences=True))) # added bidirectional GRU layer at the 7th tuning of the bidirectional NN model (changed from GRU to LSTM in 13th tuning)
bidir.add(Dropout(0.2))
bidir.add(Bidirectional(LSTM(units=100,return_sequences=True))) # added bidirectional GRU layer at the 11th tuning of the bidirectional NN model (changed from GRU to LSTM in 13th tuning)
bidir.add(Dropout(0.2))
bidir.add(Bidirectional(LSTM(units=100,return_sequences=True))) # added bidirectional LSTM layer at the 14th tuning of the bidirectional NN model
bidir.add(Dropout(0.2))
bidir.add(Bidirectional(LSTM(units=100,return_sequences=True))) # added bidirectional LSTM layer at the 14th tuning of the bidirectional NN model
# The original second bidirectional layer
bidir.add(Bidirectional(LSTM(units=100,return_sequences=True))) # create the second bidirectional layer with a more concise syntax 
bidir.add(Attention(return_sequences=False)) # added custom Attention layer at 14th tuning
#bidir.add(Dense(units=64, activation='relu')) # added dense layer at 12th tuning (removed at 13th tuning)
bidir.add(Dense(units=1, activation='sigmoid')) # the output layer of the bidirectional NN model
 

bidir.compile(
    optimizer=  tf.keras.optimizers.Adam(clipnorm=None),
    #tf.keras.optimizers.Adam(clipnorm=None),#tf.keras.optimizers.Adadelta(clipnorm=None),  #tf.keras.optimizers.SGD(lr=0.01, decay=1e-7, momentum=0.9, nesterov=False),
    # start with 'clipnorm=None', try change optimizer to 'tf.keras.optimizers.Adadelta(clipnorm=None)'
    loss='binary_crossentropy',
    metrics=['accuracy', f1])
bidir.summary() # print out a summary table of the model structure.


Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 288, 300)          3000000   
_________________________________________________________________
bidirectional (Bidirectional (None, 288, 200)          320800    
_________________________________________________________________
dropout (Dropout)            (None, 288, 200)          0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 288, 200)          240800    
_________________________________________________________________
dropout_1 (Dropout)          (None, 288, 200)          0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 288, 200)          240800    
_________________________________________________________________
dropout_2 (Dropout)          (None, 288, 200)         

In [17]:
# use Callback() method to save the Keras model or the model weights that yield the highest validation f1
checkpoint_filepath = '/checkpoint' # set the path to save the check_point model parameters
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint( # define the Callback() function
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_f1', # we can try switch 'val_f1' with 'val_accuracy' as usually high validation accuracy comes along with high validation f1
    mode='max',
    save_best_only=True)

history = bidir.fit(x_train,
                    y_train,
                    epochs=15,
                    batch_size=64,
                    validation_data=(x_valid, y_valid),
                    verbose=1,
                   callbacks=[model_checkpoint_callback],  # implement Callback() in conjunction with '.fit()' to save a model or weights (in a checkpoint file) that yield the highest validation f1 score
                    # class_weight={0: 2.0, 1: 1.0,}, assign different weights to different class labels since the dataset is highly imbalanced (5452 positive cases VS 771 negative cases)
                   )

# put/load the weights into the bidirectional NN model
bidir.load_weights(checkpoint_filepath)

# Adjustment log:
# Initial design of the bidirectional NN model: Vocabulary size is 10000, 'output_dim' of the embedding layer is 200, 'maxlen' of the 'pad_sequences()' is 256 (so the 'input_length' of the embedding layer is 256), 
# first bidirectional GRU layer has 'units=200' (so the accumulated memory/knowledge vector has size 200), the second bidirectional GRU layer has 'units=200', output layer of the bidirectional NN model is a Dense layer with only one hidden unit (units=1,activation='sigmoid')
# the optimizer chosen is 'Adam' with 'clipnorm=None', batch size chosen is 1024, epoch chosen is 5, added preprocessing techniques (stopword & punctuation removal + stemming)
# Initial result: validation f1 is varying within the range of 93.7% to 94.8% (better performance than the GRU and the LSTM model)

# 1st tuning: Change the vocabulary size to 30000, change 'output_dim' of the embedding layer to 300, change epoch to 10, change batch_size to 512
# result: validation f1 is varying within the range of 93.3% to 94.8% (no improvement on the model performance)
# result on Leaderboard: we get a score of 89.9% in the Kaggle public leaderboard

# 2nd tuning: remove the 'stemming' preprocessing technique to see if this preprocessing technique actually improve the model performance or not
# result: validation f1 is still varying within the range of 93.2% to 94.8% (no improvement on the model performance), which may suggest that the 'stemming' preprocessing technique may not affect the model performance at all

# 3rd tuning: Change the units of the first bidirectional layer to 300, Change the units of the second bidirectional layer to 300, to see if the longer the accumulated memory/knowledge vector size is, the better the model performs
# result: validation f1 is still varying within the range of 93.2% to 94.8% (no improvement on the model performance), which may indicate that increasing the 'units' of the bidirectional layers or the accumulated memory/knowledge vector sizes may not improve the model performance
# Highlight of the result: Both of the training accuracy and f1 are around 99.9% when the number of epoch is above 7, while the validation accuracy and f1 are still around 94%, which may imply that the model is overfitting to the training data

# 4th tuning: Remove the 'stopwords & punctuation removal' preprocessing technique to see if all the preprocessing techniques are actually not needed as they actually deteriorate the model performance
# result: validation f1 is varying within the range of 93.4% to 95.4% (improvement on the model performance), which may imply that all the preprocessing techniques are actually not needed for the Bidirectional GRU model according to the gained validation f1 score
# result on Leaderboard: we get a score of 91.7% in the Kaggle public leaderboard

# 5th tuning: Add 2 Dropout layers with 'dropout=0.2' to the bidirectional model to prevent overfitting, and see if the model performance can benefit from the 2 Dropout layers
# result: validation f1 is still varying within the range of 93.4% to 95.4%, and both of the training accuracy and f1 are around 99.9% when the number of epoch is above 7, which may indicate that the added 2 Dropout layers for the model may not mitigate the overfitting problem

# 6th tuning: Change the units of the first bidirectional layer to 200, Change the units of the second bidirectional layer to 200 to see if the overfitting can be mitigated or not
# result: validation f1 is varying within the range of 93.4% to 95% (slight decrease in the model performance), and both of the training accuracy and f1 are around 99.5% when the number of epoch is above 7, which may also indicate that reducing the 'units' of the bidirectional layers or the accumulated memory/knowledge vector sizes may not mitigate the overfitting problem adequately 

# 7th tuning: Change optimizer to 'AdaDelta', Added one more bidirectional GRU layer with 'units=200',
# result: validation f1 is varying within the range of 92.8% to 93.2% (decrease in the model performance), which may indicate that the added bidirectional GRU layer might deteriorate the model performance

# 8th tuning: Change optimizer to 'Adam', Change the units of the first & second & thrid bidirectional layer to 300, change batch_size to 256
# result: validation f1 is varying within the range of 94.3% to 96.0% (slight improvement on the model performance), which means the conclusion we gained in 7th tuning may be incorrect (the additional bidirectional GRU layer improves the model's performance slightly in fact)
# Highlight of the result: Both of the training accuracy and f1 are around 99% when the number of epoch is above 7, so lowering the number of epoch may prevent overfitting

# 9th tuning: change the 'max_len' of each text sample to 384 to see if the longer maximal length of each sample (less padding) can affect the model performance or not
# result: validation f1 is varying within the range of 94.8% to 96.3% (slight improvement on the model performance), which indicates that increasing the 'max_len' slightly may improve the model performance
# result on Leaderboard: we get a score of 91.9% in the Kaggle public leaderboard

# 10th tuning: change 'output_dim' of the embedding layer to 400, Change the units of the first & second & thrid bidirectional layer to 400, Change the vocabulary size to 40000 to see if increasing the vocabulary dictionary size, the size of the meaning vector associated with each token in the sample, and the accumulated memory/knowledge vector size will improve the model performance or not
# result: validation f1 is still varying within the range of 94.8% to 96.3% (no improvement on the model performance), which imply that simply increasing the vocabulary dictionary size, the size of the meaning vector associated with each token in the sample, and the accumulated memory/knowledge vector size will not improve the model performance

# 11th tuning: Added one more bidirectional GRU layer with 'units=400'
# result: validation f1 is varying within the range of 93.4% to 95.3% (decrease in the model performance), which may indicate that the extra layer may not improve the model performance anymore

# 12th tuning: added one dense layer with 64 hidden units before the output layer of the model
# result: validation f1 is varying within the range of 93.4% to 95.3% (no improvement on the model performance), which may imply the added dense layer do not improve the model performance either

# 13th tuning: Remove the added one dense layer in 12th tuning, Change the vocabulary size to 50000, Change all bidirectional GRU layers into LSTM layers, Change the units of the first & second & thrid & fourth bidirectional layer to 100, change the 'max_len' of each text sample to 288, change 'out__dim' of the embedding layer to 300,
# result: validation f1 is stabilized around 95.5% (improvement on the model performance), which may indicate that LSTM outperforms the GRU under the bidirectional RNN design
# result on Leaderboard: we get a score of 92.6% in the Kaggle public leaderboard

# 14th tuning: add 2 more bidirectional LSTM layers with 2 Droupout layers with 'dropout=0.2', added attention layer from the external source, Change the vocabulary size to 10000 (large vocabulary size tends to lead overfitting),
# result: validation f1 is varying within the range of 94.1% to 96.9% (improvement on the model performance), which may imply that the added attention layer can improve the model performance



Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f31f690bd10>

In [18]:
bidir.evaluate(x_valid, y_valid)
def predict_class(_dataset):
  classes = bidir.predict(_dataset) > 0.5
  return np.squeeze(classes * 1) 

y_predict = predict_class(x_valid)

from sklearn.metrics import  f1_score
from sklearn.metrics import confusion_matrix

print(f1_score(y_valid, y_predict, average='micro'))

0.9349397590361446


In [48]:
# submission
pd.DataFrame(
    {'id': x_test_df.index,
     'rating': predict_class(x_test)}).to_csv('sample_submission.csv', index=False)