In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [3]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
        
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    
    print('read', filetype, 'files:', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    
    return all_labels, all_texts   

In [4]:
y_train, train_text = read_files("train")

read train files: 25000


In [5]:
y_test,  test_text  = read_files("test")

read test files: 25000


In [6]:
token = Tokenizer(num_words = 3800)
token.fit_on_texts(train_text)

In [7]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [8]:
x_train = sequence.pad_sequences(x_train_seq, maxlen = 380)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen = 380)

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [10]:
model = Sequential()

In [11]:
model.add(Embedding(output_dim = 32,
                    input_dim  = 3800,
                    input_length = 380))
model.add(Dropout(0.2))

In [12]:
model.add(Flatten())

In [13]:
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))

In [14]:
model.add(Dense(units = 1, activation = 'sigmoid'))

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 380, 32)           121600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 380, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 12160)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               3113216   
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 3,235,073
Trainable params: 3,235,073
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [17]:
train_history = model.fit(x_train, 
                          y_train, 
                          batch_size = 100,
                          epochs = 10,
                          verbose = 2,
                          validation_split = 0.2
                         )

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 23s - loss: 0.4752 - acc: 0.7600 - val_loss: 0.4621 - val_acc: 0.7978
Epoch 2/10
 - 22s - loss: 0.1944 - acc: 0.9260 - val_loss: 0.4966 - val_acc: 0.7942
Epoch 3/10
 - 22s - loss: 0.0797 - acc: 0.9742 - val_loss: 0.5494 - val_acc: 0.8180
Epoch 4/10
 - 22s - loss: 0.0282 - acc: 0.9922 - val_loss: 0.7095 - val_acc: 0.8142
Epoch 5/10
 - 21s - loss: 0.0149 - acc: 0.9957 - val_loss: 0.9655 - val_acc: 0.7862
Epoch 6/10
 - 22s - loss: 0.0103 - acc: 0.9975 - val_loss: 1.0532 - val_acc: 0.7886
Epoch 7/10
 - 23s - loss: 0.0100 - acc: 0.9971 - val_loss: 1.0541 - val_acc: 0.7994
Epoch 8/10
 - 23s - loss: 0.0112 - acc: 0.9962 - val_loss: 1.1084 - val_acc: 0.7950
Epoch 9/10
 - 23s - loss: 0.0146 - acc: 0.9946 - val_loss: 1.3126 - val_acc: 0.7690
Epoch 10/10
 - 22s - loss: 0.0158 - acc: 0.9944 - val_loss: 1.1858 - val_acc: 0.7922


In [18]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.84744

In [19]:
predict = model.predict_classes(x_test)
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [20]:
predict[12501:12510]

array([[0],
       [0],
       [1],
       [1],
       [0],
       [1],
       [0],
       [0],
       [0]])

In [21]:
predict_classes = predict.reshape(-1)
print('1 ~ 10: ', predict_classes[:10])
print('12501 ~ 12510: ', predict_classes[12501:12511])

1 ~ 10:  [1 1 1 1 1 1 1 1 1 1]
12501 ~ 12510:  [0 0 1 1 0 1 0 0 0 0]


In [28]:
SentimentDict = {1:'Positive', 0:'Negative'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('\nLabel:', SentimentDict[y_test[i]],
        ', Predict:', SentimentDict[predict_classes[i]])

In [29]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=380)
    predict_result = model.predict_classes(pad_input_seq)
    print(SentimentDict[predict_result[0][0]])

In [34]:
# Positive
predict_review('''
While Beauty and the Beast boasts some fantastic set pieces, wonderful costumes and great performances. The entire film just feels like a retread of the original animated classic, and as a result doesn't feel new (like The Jungle Book) or boring (like Malificent) just uninspired. If I had a choice I'd watch the original, and while I don't like to compare remakes it's hard when this film's job is to give you the nostalgia you felt for the first movie.
''')

Negative


In [35]:
# Negative
predict_review('''
I really cant find the words to put my disatisfaction with this movie the best way i can. At one hand i really enjoyed the graphics and in general whole production was great, the songs, designs , even the story was ok when observed detached from the whole movie, BUT once it all is merged together with actors and all.... it was just not.... good. I did like Gaston.
''')

Positive
