### Deep Learning Solution for Sentiment Analysis

    
#### LSTM (Long short term memory)
* test dataset: 
  * took > 1 hour
  * train : test = 2 :1
  * tried activation functions:
      * tanh function: doesn't work
      * relu function: doesn't work; values grow too large and become `nan`
      * sigmoid function: better for binary classification
      * softmax function: works well for multi-class classification
  * acc (softmax): 71% 
  * label accuracy =  0.7170868347338936

* whole dataset:
  * train : test = 9 :1
  * Trainning model  LSTM_v0.1.h5  successfully, took(s):  66079.6750668 =18.4 hours
  * Evaluating model took(s):  197.92229590000352,  acc: 0.79
  * label accuracy =  0.7927490188749766
  
TODO/Improvements:
* preprocess the reviews:
  * TF-IDF to down-weight common words
* represent the reviews:
  * word2vec
  * use semantic knowledge bases: Freebase, Satori, WordNet, etc.
  
* try different hyperparameters
* try BRNN, GRU-RNN

In [16]:
import timeit
import numpy as np 
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

def create_model(max_fatures, embed_dim, input_length, lstm_out, num_classes=5):
    """Build and compile the Embedding -> SpatialDropout -> LSTM -> softmax classifier.

    Args:
        max_fatures: Vocabulary size given to the Embedding layer (its input
            dimension). Should match the ``num_words`` used by the tokenizer.
        embed_dim: Size of each embedding vector.
        input_length: Length of every (padded) input sequence.
        lstm_out: Number of units in the LSTM layer.
        num_classes: Width of the softmax output layer. Defaults to 5, the
            value that was previously hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model (categorical cross-entropy
        loss, Adam optimizer, accuracy metric).
    """
    model = Sequential()
    model.add(Embedding(max_fatures, embed_dim, input_length=input_length))
    model.add(SpatialDropout1D(0.4))
    model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() prints the table itself and returns None, so wrapping it in
    # print() emitted a stray "None" line after the table.
    model.summary()
    return model
    
def preprocess_data(dataFile, max_fatures):
    """Load the review CSV and turn it into padded sequences and one-hot labels.

    Steps: read ``dataFile``, keep the ``Review`` and ``Label`` columns,
    lower-case each review, strip every character that is not a letter, digit
    or whitespace, tokenize with a vocabulary capped at ``max_fatures`` words,
    pad all sequences to a common length, and one-hot encode the labels.

    Args:
        dataFile: Path to a CSV file with ``Review`` and ``Label`` columns.
        max_fatures: Maximum number of words kept by the tokenizer.

    Returns:
        A tuple ``(X, Y)``: ``X`` is the padded integer-sequence matrix from
        ``pad_sequences`` and ``Y`` is the one-hot label matrix from
        ``pd.get_dummies``. ``Y`` has one column per distinct label present in
        the file.
    """
    data = pd.read_csv(dataFile)
    # Copy the selection so the column assignment below works on an
    # independent frame rather than a possible view of the original.
    data = data[['Review', 'Label']].copy()
    # Rows without text or label cannot be used; previously a missing review
    # crashed on ``x.lower()`` because NaN is a float.
    data = data.dropna(subset=['Review', 'Label'])

    # The old pattern used the range ``A-z``, which also covers the ASCII
    # characters between 'Z' and 'a' ([ \ ] ^ _ `), so those were never
    # stripped. Text is lower-cased first, so ``a-z`` is sufficient here.
    non_alnum = re.compile(r'[^a-z0-9\s]')
    data['Review'] = data['Review'].apply(lambda text: non_alnum.sub('', str(text).lower()))

    reviews = data['Review'].values
    tokenizer = Tokenizer(num_words=max_fatures, split=' ')
    tokenizer.fit_on_texts(reviews)
    X = tokenizer.texts_to_sequences(reviews)
    X = pad_sequences(X)
    Y = pd.get_dummies(data['Label']).values

    return X, Y
    
def train_model(X_train, Y_train, batch_size, modelName, model=None, epochs=7):
    """Fit a model on the training data, save it, and print the elapsed time.

    Args:
        X_train: Training inputs passed to ``model.fit``.
        Y_train: Training targets passed to ``model.fit``.
        batch_size: Batch size passed to ``model.fit``.
        modelName: Path passed to ``model.save``.
        model: Model to train. When omitted, the module-level variable
            ``model`` is used, which is how this function originally worked.
        epochs: Number of training epochs. Defaults to 7, the value that was
            previously hard-coded.

    Raises:
        NameError: If ``model`` is not given and no module-level ``model``
            exists.
    """
    if model is None:
        # Backward compatibility: existing callers assign a global ``model``
        # before calling and do not pass it in.
        model = globals().get('model')
        if model is None:
            raise NameError(
                "name 'model' is not defined; pass model=... or assign a "
                "module-level `model` before calling train_model"
            )

    start = timeit.default_timer()

    model.fit(X_train, Y_train, epochs=epochs, batch_size=batch_size, verbose=2)
    model.save(modelName)
    # The reported duration covers both fitting and saving.
    stop = timeit.default_timer()
    print('Trainning model ', modelName, ' successfully, took(s): ', stop - start)

# Hyperparameters (module-level; the cells below read these names).
# max_fatures: vocabulary cap, used for both the tokenizer and the Embedding layer.
max_fatures = 2000
# embed_dim: size of each embedding vector.
embed_dim = 128
# lstm_out: number of LSTM units.
lstm_out = 196
# batch_size: batch size for training and evaluation.
batch_size = 32

# Load and vectorize the full review dataset, then hold out 10% for testing.
# random_state is fixed so the split is reproducible.
X,Y = preprocess_data('../100k-courseras-course-reviews-dataset/reviews.csv', max_fatures)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.1, random_state = 42)

# Report the shapes of the full, training and test arrays.
print('X.shape', X.shape, 'X_train.shape', X_train.shape, 'X_test.shape', X_test.shape)
print('Y.shape', Y.shape, 'Y_train.shape', Y_train.shape, 'Y_test.shape', Y_test.shape)



X.shape (107018, 1148) X_train.shape (96316, 1148) X_test.shape (10702, 1148)
Y.shape (107018, 5) Y_train.shape (96316, 5) Y_test.shape (10702, 5)


In [17]:
# Train the Model

# File the trained model is saved to (and later reloaded from for evaluation).
modelName = 'LSTM_v0.1.h5'

# Sequence length is the second dimension of the padded training matrix.
input_length = X_train.shape[1]
# `model` must be a module-level name: train_model picks it up from there.
model =  create_model(max_fatures, embed_dim, input_length, lstm_out)
train_model(X_train, Y_train, batch_size, modelName)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 1148, 128)         256000    
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 1148, 128)         0         
_________________________________________________________________
lstm_5 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 985       
Total params: 511,785
Trainable params: 511,785
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
 - 9401s - loss: 0.6377 - acc: 0.7661
Epoch 2/7
 - 9283s - loss: 0.5565 - acc: 0.7877
Epoch 3/7
 - 9334s - loss: 0.5376 - acc: 0.7942
Epoch 4/7
 - 9364s - loss: 0.5240 - acc: 0.7998
Epoch 5/7
 - 9394s - loss: 0.5121 - acc: 0.8041
Epoch 6/7
 - 9443s - lo

In [23]:
# Evaluate the Model
from keras.models import load_model

def evaluate_model(modelName, X_test, Y_test, batch_size, model=None):
    """Evaluate a saved model on the test split and print its metrics.

    Prints the evaluation time, the loss ("score") and accuracy returned by
    ``model.evaluate``, and a label accuracy computed from ``model.predict``
    by comparing the arg-max class of each prediction with that of the
    matching one-hot row in ``Y_test``.

    Args:
        modelName: Path of the saved model; loaded with ``load_model`` unless
            ``model`` is supplied.
        X_test: Test inputs.
        Y_test: One-hot encoded test labels, one row per sample.
        batch_size: Batch size for ``evaluate`` and ``predict``.
        model: Optional model already in memory. When given, nothing is loaded
            from ``modelName``.
    """
    if model is None:
        model = load_model(modelName)

    start = timeit.default_timer()
    score, acc = model.evaluate(X_test, Y_test, verbose=2, batch_size=batch_size)
    stop = timeit.default_timer()
    print('Evaluating model took(s): ', stop - start)

    print("score: %.2f" % (score))
    print("acc: %.2f" % (acc))

    results = model.predict(X_test, batch_size=batch_size, verbose=2, steps=None)
    # Compare class indices directly. The former "+1" offset was applied to
    # both sides and therefore never affected the comparison.
    predicted_classes = np.argmax(results, axis=1)
    actual_classes = np.argmax(Y_test, axis=1)
    matches = predicted_classes == actual_classes
    # Kept separate from `acc` so the Keras metric is not overwritten.
    # An empty test set yields NaN instead of a ZeroDivisionError.
    label_acc = float(np.mean(matches)) if matches.size else float('nan')
    print('label accuracy = ', label_acc)
    
# Reload the model saved under `modelName` and report its metrics on the held-out test split.
evaluate_model(modelName, X_test, Y_test, batch_size)

In [25]:
### Play with more Experiments ####

def create_model(max_fatures, embed_dim, input_length, lstm_out, num_classes=5):
    """Build and compile the LSTM classifier used for the experiments.

    Layers, in order: Embedding, SpatialDropout1D (rate 0.4), LSTM (dropout
    and recurrent dropout 0.2), and a softmax Dense output layer.

    Args:
        max_fatures: Vocabulary size (input dimension of the Embedding layer).
        embed_dim: Size of each embedding vector.
        input_length: Length of every (padded) input sequence.
        lstm_out: Number of units in the LSTM layer.
        num_classes: Number of output classes. Defaults to 5, the value that
            was previously hard-coded.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        Embedding(max_fatures, embed_dim, input_length=input_length),
        SpatialDropout1D(0.4),
        LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    # summary() already prints the table and returns None; print(model.summary())
    # therefore appended a stray "None" line to the output.
    model.summary()
    return model

# Hyperparameters for this experiment (same values as the full-dataset run).
max_fatures = 2000
embed_dim = 128
lstm_out = 196
batch_size = 32

# Load the reviews_test.csv file and hold out 10% of it for testing.
X,Y = preprocess_data('../100k-courseras-course-reviews-dataset/reviews_test.csv', max_fatures)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.1, random_state = 42)

# Report the shapes of the full, training and test arrays.
print('X.shape', X.shape, 'X_train.shape', X_train.shape, 'X_test.shape', X_test.shape)
print('Y.shape', Y.shape, 'Y_train.shape', Y_train.shape, 'Y_test.shape', Y_test.shape)

# Saved under a separate file name so the full-dataset model is not overwritten.
modelName = 'LSTM_test_softmax_v0.1.h5'

# Sequence length is the second dimension of the padded training matrix.
input_length = X_train.shape[1]
# `model` must be a module-level name: train_model picks it up from there.
model =  create_model(max_fatures, embed_dim, input_length, lstm_out)
train_model(X_train, Y_train, batch_size, modelName)

# Reload the model just saved and report its metrics on the held-out split.
evaluate_model(modelName, X_test, Y_test, batch_size)

X.shape (10702, 733) X_train.shape (9631, 733) X_test.shape (1071, 733)
Y.shape (10702, 5) Y_train.shape (9631, 5) Y_test.shape (1071, 5)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 733, 128)          256000    
_________________________________________________________________
spatial_dropout1d_9 (Spatial (None, 733, 128)          0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_9 (Dense)              (None, 5)                 985       
Total params: 511,785
Trainable params: 511,785
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/7
 - 623s - loss: 0.8756 - acc: 0.6956
Epoch 2/7
 - 605s - loss: 0.7200 - acc: 0.7269
Epoch 3/7
 - 611s - loss: 0.663