In [1]:
import sys
import numpy as np
import pandas as pd

from keras import Sequential
from keras.layers import Input, Dense, LSTM, Embedding, Dropout, BatchNormalization, Activation, Bidirectional,Flatten
from keras import optimizers
from keras import regularizers
from keras.utils import np_utils
from keras.preprocessing.text import text_to_word_sequence, Tokenizer
from keras.preprocessing.sequence import pad_sequences

from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split

DATA_DIR = "./"

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# load data from files and save it into lists
def load_data(f, data, pos_or_neg, target):
    for line in f.readlines():
        line_str = ""
        line = line[1:-2].split(r", ")
        for i in range(len(line)):
            line[i] = line[i][1:-1]
            line_str += str(line[i]+" ")
        data.append(line_str)
        target.append(pos_or_neg)
    
data = []
target = []

for file_name in ["training_pos.csv",
          "training_neg.csv",
          "validation_pos.csv",
          "validation_neg.csv",
          "test_pos.csv",
          "test_neg.csv"]:
    f = open(DATA_DIR + file_name,'r')
    pos_or_neg = 1 if 'pos' in file_name else 0
    load_data(f,data, pos_or_neg, target)
    f.close()


In [3]:
word_seq = [text_to_word_sequence(sent) for sent in data]
# we should be safe using MAX _SENT _LEN as 90th Percentile Sentence Length
MAX_SENT_LEN = int(np.percentile([len(seq) for seq in word_seq], 90))
print('90th Percentile Sentence Length:', MAX_SENT_LEN)

# cut every sentence  according to MAX_SENT_LEN
data = [' '.join(seq[:MAX_SENT_LEN]) for seq in word_seq]

# Convert the sequence of words to sequnce of indices
MAX_VOCAB_SIZE = 80000
tokenizer = Tokenizer(MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(data)
data = tokenizer.texts_to_sequences(data)
data = pad_sequences(data, maxlen=MAX_SENT_LEN, padding='post', truncating='post')


90th Percentile Sentence Length: 23


In [5]:
# split data into train, validation, test
length = len(data)
train_data = data[:int(length*0.8)]
val_data = data[int(length*0.9):]
test_data = data[int(length*0.8):int(length*0.9)]
train_target = target[:int(length*0.8)]
val_target = target[int(length*0.9):]
test_target = target[int(length*0.8):int(length*0.9)]

train = list(zip(train_data, train_target))
import random
random.shuffle(train)
train_data[:], train_target[:] = zip(*train)


In [6]:
train_target = np_utils.to_categorical(train_target, 2)
test_target =  np_utils.to_categorical(test_target, 2)
val_target = np_utils.to_categorical(val_target, 2)

In [7]:
W2V_DIR = 'W2V_DIR'
embeddings = Word2Vec.load(DATA_DIR + W2V_DIR)
print('Dimension of w2v:', embeddings.vector_size)
EMBEDDING_DIM = embeddings.vector_size

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


Dimension of w2v: 300


In [8]:
# Create an embedding matrix containing only the word's in our vocabulary
# If the word does not have a pre-trained embedding, then randomly initialize the embedding
embeddings_matrix = np.random.uniform(-0.05, 0.05, size=(len(tokenizer.word_index)+1, EMBEDDING_DIM)) # +1 is because the matrix indices start with 0

for word, i in tokenizer.word_index.items(): # i=0 is the embedding for the zero padding
    try:
        embeddings_vector = embeddings[word]
    except KeyError:
        embeddings_vector = None
    if embeddings_vector is not None:
        embeddings_matrix[i] = embeddings_vector
        
del embeddings


  import sys


In [9]:
def sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate):
    
    # Build a sequential model by stacking neural net units 
    model = Sequential()
    # Input layer of embeddings
    model.add(Embedding(input_dim=len(tokenizer.word_index)+1,
                              output_dim=EMBEDDING_DIM,
                              weights = [embeddings_matrix], trainable=True, name='word_embedding_layer', 
                              mask_zero=False,input_length=MAX_SENT_LEN))
#     model.add(LSTM(128, return_sequences=False, name='lstm_layer'))
    model.add(Flatten())
    
    if(dropoutFlag):
        model.add(Dropout(dropoutRate))
        
    if(l2normFlag):
        model.add(Dense(units = 30,activation = hidden_actfunc, name = 'hidden_layer',kernel_regularizer=regularizers.l2(l=0.01)))
    else:
        model.add(Dense(units = 30,activation = hidden_actfunc, name = 'hidden_layer'))
    
    if(dropoutFlag):
        model.add(Dropout(dropoutRate))
    
    if(l2normFlag):
        model.add(Dense(2, activation='softmax', name='output_layer',kernel_regularizer=regularizers.l2(l=0.01)))
    else:
        model.add(Dense(2, activation='softmax', name='output_layer'))

    # Use cross-entropy as the loss function
    model.compile(loss='categorical_crossentropy',optimizer ='adam',  metrics=['accuracy'])
    return model
    


In [10]:
# from keras.utils import plot_model
# from IPython.display import Image
# plot_model(model, to_file='basic_lstm_classifier.png', show_layer_names=True, show_shapes=True)
# Image('basic_lstm_classifier.png')

In [11]:
# # train data
# BATCH_SIZE = 128
# N_EPOCHS = 10
# dropoutRate = 0.5
# for hidden_actfunc in ["relu", "sigmoid", "tanh"]:
#     for l2normFlag in [True, False]:
#         for dropoutFlag in [True, False]:
#             print("----------------------------------------------------------------------------------")
#             print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
#                       "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
#             for dropoutRate in np.arange(0.4, 1 , 0.2):
#                 model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
#                 model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
                
                

In [12]:
BATCH_SIZE = 128
N_EPOCHS = 10

## relu

In [13]:
# hidden_layer = relu, l2norm = True, dropout = True
hidden_actfunc = "relu"
l2normFlag = True
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

W0625 12:20:14.654095 113336 deprecation_wrapper.py:119] From C:\Users\h5weng\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0625 12:20:14.682020 113336 deprecation_wrapper.py:119] From C:\Users\h5weng\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0625 12:20:14.691023 113336 deprecation_wrapper.py:119] From C:\Users\h5weng\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0625 12:20:14.701988 113336 deprecation_wrapper.py:119] From C:\Users\h5weng\AppData\Local\Continuum\anaconda3\lib\site-packages\keras\backend\tensorflow_backend.py:174: The name tf.get_default_session is deprecated. Please

----------------------------------------------------------------------------------
hidden_actfunc: relu   l2normFlag: True   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 76.86%


In [14]:
# hidden_layer = relu, l2norm = True, dropout = False
hidden_actfunc = "relu"
l2normFlag = True
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))       

----------------------------------------------------------------------------------
hidden_actfunc: relu   l2normFlag: True   dropoutFlag: False   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 78.33%


In [15]:
# hidden_layer = relu, l2norm = False, dropout = True
hidden_actfunc = "relu"
l2normFlag = False
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))       

----------------------------------------------------------------------------------
hidden_actfunc: relu   l2normFlag: False   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 77.38%


In [16]:
# hidden_layer = relu, l2norm = False, dropout = False
hidden_actfunc = "relu"
l2normFlag = False
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))            

----------------------------------------------------------------------------------
hidden_actfunc: relu   l2normFlag: False   dropoutFlag: False   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 75.22%


## sigmoid

In [17]:
# hidden_layer = sigmoid, l2norm = True, dropout = True
hidden_actfunc = "sigmoid" 
l2normFlag = True
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: sigmoid   l2normFlag: True   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 74.14%


In [18]:
# hidden_layer = sigmoid, l2norm = True, dropout = False
hidden_actfunc = "sigmoid"
l2normFlag = True
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: sigmoid   l2normFlag: True   dropoutFlag: False   dropoutRate 0.5Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 78.08%


In [19]:
# hidden_layer = sigmoid, l2norm = False, dropout = True
hidden_actfunc = "sigmoid"
l2normFlag = False
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: sigmoid   l2normFlag: False   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 79.44%


In [20]:
# hidden_layer = sigmoid, l2norm = False, dropout = False
hidden_actfunc = "sigmoid"
l2normFlag = False
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: sigmoid   l2normFlag: False   dropoutFlag: False   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 75.52%


## tanh

In [21]:
# hidden_layer = tanh, l2norm = True, dropout = True
hidden_actfunc = "sigmoid"
l2normFlag = True
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: sigmoid   l2normFlag: True   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 74.62%


In [22]:
# hidden_layer = tanh, l2norm = True, dropout = False
hidden_actfunc = "tanh"
l2normFlag = True
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: tanh   l2normFlag: True   dropoutFlag: False   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 77.58%


In [27]:
# hidden_layer = tanh, l2norm = False, dropout = True
hidden_actfunc = "tanh"
l2normFlag = False
dropoutFlag = True
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: tanh   l2normFlag: False   dropoutFlag: True   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 79.75%


In [28]:
# hidden_layer = tanh, l2norm = False, dropout = False
hidden_actfunc = "tanh"
l2normFlag = False
dropoutFlag = False
dropoutRate = 0.5
model = sequential_model (hidden_actfunc, l2normFlag, dropoutFlag, dropoutRate)
print("----------------------------------------------------------------------------------")
print("hidden_actfunc: "+hidden_actfunc, "  l2normFlag: " + str(l2normFlag), 
      "  dropoutFlag: "+str(dropoutFlag), "  dropoutRate " + str(dropoutRate))
model.fit(train_data, train_target,batch_size=BATCH_SIZE,epochs=N_EPOCHS, validation_data =(val_data, val_target))
scores = model.evaluate(test_data, test_target)
print("%s: %.2f%%"%(model.metrics_names[1], scores[1]*100))

----------------------------------------------------------------------------------
hidden_actfunc: tanh   l2normFlag: False   dropoutFlag: False   dropoutRate 0.5
Train on 640000 samples, validate on 80000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
acc: 76.59%
