## Data Preparation

In [70]:
# loading and cleaning reviews
import nltk
import random
from nltk.classify.scikitlearn import SklearnClassifier
import pickle

from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
import string
from nltk.corpus import stopwords

from nltk.tokenize import word_tokenize



# Read the raw training text for each class.
# Raw strings keep the Windows backslashes literal, so sequences such as "\s"
# or "\W" are no longer parsed as (invalid) escape sequences; the resulting
# paths are character-for-character the same as before.
# NOTE(review): these are absolute, machine-specific paths -- consider a
# configurable data directory.
with open(r"C:\software\WinPython-64bit-3.6.2.0Qt5\notebooks\capstone\shortmoviereview\positive.txt") as pos_file:
    short_pos = pos_file.read()
with open(r"C:\software\WinPython-64bit-3.6.2.0Qt5\notebooks\capstone\shortmoviereview\negative.txt") as neg_file:
    short_neg = neg_file.read()

# Split each file into one document per line.
# NOTE(review): if a file ends with a newline, split('\n') yields a final empty
# string that is counted as a document. This is left unchanged on purpose
# because the train/test split further down indexes into these counts.
documents_pos = short_pos.split('\n')
documents_neg = short_neg.split('\n')

# All positive documents come first, followed by all negative documents.
documents = documents_pos + documents_neg
print("has ",len(documents_pos)," postive movies reviews")
print("has ",len(documents_neg)," negative movies reviews")
print("all documents is ",len(documents))    
print(documents[1:3])

has  5332  postive movies reviews
has  5332  negative movies reviews
all documents is  10664
['the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth . ', 'effective but too-tepid biopic']


In [71]:
# Build the vocabulary from word frequencies.
# Words that occur fewer than two times across all reviews are dropped.

short_pos_words = word_tokenize(short_pos)
short_neg_words = word_tokenize(short_neg)

# lower-case every token: positive reviews first, then negative reviews
all_words = [token.lower() for token in short_pos_words + short_neg_words]

# drop English stop words and tokens found in string.punctuation
stop_words = set(stopwords.words('english'))

filtered_words = [
    token
    for token in all_words
    if token not in stop_words and token not in string.punctuation
]

# count the remaining tokens, then keep only the words seen at least twice
word_counts = nltk.FreqDist(filtered_words)
all_words = [word for word, count in word_counts.items() if count >= 2]



In [72]:
# Save the vocabulary list (all_words) to disk.
# The with-block closes the file even if pickle.dump raises.
with open("all_words.pickle", "wb") as save_word_features:
    pickle.dump(all_words, save_word_features)

In [2]:
# turn a doc into clean tokens and join into the sentence
def clean_doc(doc, vocab=None):
    """Lower-case a review and keep only the words found in the vocabulary.

    The text is split on whitespace, every token is lower-cased, and tokens
    that are not members of the vocabulary are dropped. No stop-word or
    punctuation handling happens here; that filtering is whatever was applied
    when the vocabulary itself was built.

    Parameters
    ----------
    doc : str
        Raw review text.
    vocab : container of str, optional
        Words to keep. Any object supporting ``in`` works; a ``set`` gives the
        fastest lookups. Defaults to the notebook-level ``all_words``.

    Returns
    -------
    str
        The kept tokens, in their original order, joined by single spaces
        (an empty string when nothing is kept).
    """
    if vocab is None:
        # fall back to the vocabulary built earlier in the notebook
        vocab = all_words
    lowered = (w.lower() for w in doc.split())
    return ' '.join(w for w in lowered if w in vocab)

In [3]:
# Run every review through clean_doc to obtain the filtered sentences,
# keeping the same order as `documents`.
filter_sentence = [clean_doc(d) for d in documents]

print(filter_sentence[1:3])


["gorgeously elaborate continuation lord rings trilogy huge column words cannot adequately describe co-writer/director peter jackson's expanded vision j r r tolkien's middle-earth", 'effective too-tepid biopic']


In [74]:
# Save the filtered sentences to disk.
# The with-block closes the file even if pickle.dump raises.
with open("filter_sentence.pickle", "wb") as save_sentence:
    pickle.dump(filter_sentence, save_sentence)

In [4]:
# Encode the filtered sentences as sequences of integers using the Tokenizer class in the Keras API
from keras.preprocessing.text import Tokenizer

# create the tokenizer with its default settings
tokenizer = Tokenizer()
# fit the tokenizer on the documents to build its word index
# NOTE(review): it is fitted on every document, including the ones that are
# later held out as the test split.
tokenizer.fit_on_texts(filter_sentence)
# sequence encode: one list of integer ids per sentence
# NOTE(review): the sample outputs show 25 ids for a sentence that has 22
# whitespace-separated tokens, so the Tokenizer appears to split further
# (presumably on punctuation such as "-" and "/") -- confirm before relying on
# whitespace token counts for sequence lengths.
encoded_docs = tokenizer.texts_to_sequences(filter_sentence)
print(encoded_docs[1])

Using TensorFlow backend.


[3228, 2053, 7158, 4517, 2823, 4518, 957, 7159, 706, 769, 4519, 2513, 573, 162, 16, 958, 5502, 4520, 574, 1178, 1887, 1887, 7160, 372, 1370]


In [5]:
# pad sequences
from keras.preprocessing.sequence import pad_sequences
# Use the longest *encoded* sequence as the padded length. Counting
# whitespace-separated tokens in filter_sentence under-estimates it, because
# the Tokenizer can emit more ids than there are whitespace tokens (the sample
# review has 22 tokens but 25 ids); pad_sequences would then silently cut the
# longest reviews down to maxlen.
max_length = max(len(seq) for seq in encoded_docs)
# padding='post' appends the zero filler after the real ids
Xtrain = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(Xtrain[1:3])

[[3228 2053 7158 4517 2823 4518  957 7159  706  769 4519 2513  573  162
    16  958 5502 4520  574 1178 1887 1887 7160  372 1370    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]
 [ 607 1371 2514 2515    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0]]


In [6]:
# quick check of the container type returned by pad_sequences
type(Xtrain)

numpy.ndarray

In [7]:
# Build the training and testing datasets
import numpy as np
from numpy import array

# Number of reviews per class used for training; the remainder of each class
# is held out for testing.
n_train_per_class = 4000
# Xtrain keeps the order of `documents`: all positive reviews first, then all
# negative reviews, so the class boundary sits at len(documents_pos).
n_pos = len(documents_pos)
assert len(Xtrain) == n_pos + len(documents_neg), "Xtrain rows must match the documents"
assert n_train_per_class <= min(n_pos, len(documents_neg)), "not enough reviews per class"

x_pos_train = Xtrain[:n_train_per_class]
x_pos_test = Xtrain[n_train_per_class:n_pos]
x_neg_train = Xtrain[n_pos:n_pos + n_train_per_class]
x_neg_test = Xtrain[n_pos + n_train_per_class:]

# Labels: 1 = positive, 0 = negative, sized from the actual slices so they
# always line up with the feature rows.
x_train = np.append(x_pos_train, x_neg_train, axis=0)
y_train = array([1] * len(x_pos_train) + [0] * len(x_neg_train))

x_test = np.append(x_pos_test, x_neg_test, axis=0)
y_test = array([1] * len(x_pos_test) + [0] * len(x_neg_test))

assert len(x_train) == len(y_train) and len(x_test) == len(y_test)



In [8]:
# show the container type and the number of training rows, one per line
print(type(x_train), len(x_train), sep='\n')

<class 'numpy.ndarray'>
8000


In [9]:
# define vocabulary size: one more than the number of distinct words in the tokenizer's word index
# (presumably the extra slot covers id 0, which the padded sequences use as filler -- confirm;
# this value sizes the Embedding layers below)
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

19432


## Implementation

### Define model

In [61]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers import Embedding
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import Dropout
import numpy

# Seed NumPy's global random generator.
# NOTE(review): this alone may not make training reproducible with the
# TensorFlow backend, which keeps its own random state -- confirm.
numpy.random.seed(7)

# CNN classifier: embedding -> dropout -> 1D convolution -> max pooling ->
# dropout -> flatten -> dense -> single sigmoid output for the binary label.
model1 = Sequential()
# 50-dimensional word vectors for each of the max_length positions
model1.add(Embedding(vocab_size, 50, input_length=max_length))
model1.add(Dropout(0.5))
model1.add(Conv1D(filters=64, kernel_size=3, activation='relu'))
model1.add(MaxPooling1D(pool_size=2))
model1.add(Dropout(0.5))
model1.add(Flatten())
model1.add(Dense(100, activation='relu'))
model1.add(Dense(1, activation='sigmoid'))
# summary() prints the table itself and returns None, so wrapping it in
# print() only adds a stray "None" line to the output.
model1.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 39, 50)            971600    
_________________________________________________________________
dropout_24 (Dropout)         (None, 39, 50)            0         
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 37, 64)            9664      
_________________________________________________________________
max_pooling1d_15 (MaxPooling (None, 18, 64)            0         
_________________________________________________________________
dropout_25 (Dropout)         (None, 18, 64)            0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 1152)              0         
_________________________________________________________________
dense_40 (Dense)             (None, 100)               115300    
__________

In [62]:

# Configure training: Adam optimizer, binary cross-entropy loss, accuracy metric
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 3 epochs in shuffled batches of 64; verbose=2 logs one line per epoch
model1.fit(
    x_train,
    y_train,
    batch_size=64,
    epochs=3,
    shuffle=True,
    verbose=2,
)


Epoch 1/3
6s - loss: 0.6932 - acc: 0.5044
Epoch 2/3
3s - loss: 0.5781 - acc: 0.7069
Epoch 3/3
3s - loss: 0.3141 - acc: 0.8656


<keras.callbacks.History at 0x2366d165780>

In [63]:

# Evaluate the CNN on the held-out test split; the result unpacks into the
# loss followed by the metric the model was compiled with (accuracy)
loss, acc = model1.evaluate(x_test, y_test, verbose=0)
# acc is a fraction; scale it to a percentage for display
print('Test Accuracy: %f' % (acc*100))


Test Accuracy: 75.638138


In [64]:
## using LSTM
from keras.layers import LSTM

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy only; the TensorFlow backend may need its own
# seed for fully repeatable runs -- confirm.
numpy.random.seed(7)

# create the model
# Width of the learned word vectors. The value is the one the Embedding layer
# actually uses (50, same as the CNN model) and is passed to the layer instead
# of being declared separately with a different number.
embedding_vector_length = 50
model2 = Sequential()
model2.add(Embedding(vocab_size, embedding_vector_length, input_length=max_length))
# single LSTM layer with dropout on both the inputs and the recurrent state
model2.add(LSTM(50, dropout=0.4, recurrent_dropout=0.4))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# is what produced the stray "None" line in the output.
model2.summary()
model2.fit(x_train, y_train, epochs=10, batch_size=256,shuffle=True)
# Final evaluation of the model: scores[0] is the loss, scores[1] the accuracy
scores = model2.evaluate(x_test, y_test, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_29 (Embedding)     (None, 39, 50)            971600    
_________________________________________________________________
lstm_16 (LSTM)               (None, 50)                20200     
_________________________________________________________________
dense_42 (Dense)             (None, 1)                 51        
Total params: 991,851
Trainable params: 991,851
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 74.06%


In [60]:
# # LSTM and CNN (experiment currently disabled: every line of this cell is commented out)

# model3 = Sequential()
# model3.add(Embedding(vocab_size, 50, input_length=max_length))
# model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
# model3.add(MaxPooling1D(pool_size=2))
# model3.add(LSTM(100))
# model3.add(Dense(1, activation='sigmoid'))
# model3.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
# print(model3.summary())
# model3.fit(x_train, y_train, epochs=3, batch_size=128)
# # Final evaluation of the model
# scores = model3.evaluate(x_test, y_test, verbose=0)
# print("Accuracy: %.2f%%" % (scores[1]*100))

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_27 (Embedding)     (None, 39, 50)            971600    
_________________________________________________________________
conv1d_14 (Conv1D)           (None, 39, 32)            4832      
_________________________________________________________________
max_pooling1d_14 (MaxPooling (None, 19, 32)            0         
_________________________________________________________________
lstm_15 (LSTM)               (None, 100)               53200     
_________________________________________________________________
dense_39 (Dense)             (None, 1)                 101       
Total params: 1,029,733
Trainable params: 1,029,733
Non-trainable params: 0
_________________________________________________________________
None
Epoch 1/3
Epoch 2/3
Epoch 3/3
Accuracy: 73.76%


In [65]:
# Save the trained CNN (model1) weights to cnn.hdf5 in the working directory.
# save_weights stores the weights only; overwrite=True replaces an existing
# file without asking.
cnn = "cnn.hdf5"
model1.save_weights(cnn,overwrite=True)

In [66]:
# Save the trained LSTM (model2) weights to lstm.hdf5 in the working directory;
# overwrite=True replaces an existing file without asking.
lstm = "lstm.hdf5"
model2.save_weights(lstm,overwrite=True)