In [1]:
from keras.datasets import imdb

Using TensorFlow backend.


In [2]:
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

In [3]:
import os

import urllib.request
import tarfile

url = "http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz"
filePath = "data/aclImdb_v1.tar.gz"

# note: need to create the directory "data" by yourself
if not os.path.isfile(filePath):
    result = urllib.request.urlretrieve(url, filePath)
    print('downloaded: ', result)

# unzip the tar file
if not os.path.exists("data/aclImdb"):
    tempTarFile = tarfile.open("data/aclImdb_v1.tar.gz", 'r:gz')
    result = tempTarFile.extractall('data/')

In [4]:
# using regular expression (to remove the HTML tag)
import re

def remove_tags(text):
    regular_expression_tag = re.compile(r'<[^>]+>')
    return regular_expression_tag.sub('',text) # replace as ''

In [5]:
import os

# read files (positive or negative)
def read_files(file_type):
    path = "data/aclImdb/"
    file_list=[]
    
    positive_path = path + file_type + "/pos/"
    for f in os.listdir(positive_path):
        file_list = file_list + [positive_path + f] 
    
    negative_path = path + file_type + "/neg/"
    for f in os.listdir(negative_path):
        file_list = file_list + [negative_path + f] 
        
    print('read', file_type, 'files: ', len(file_list) )
    
    all_labels = ( [1]*12500 + [0]*12500 )
    
    all_texts = []
    
    for f in file_list:
        with open(f, encoding='utf8') as file_input:
            all_texts = all_texts + [ remove_tags(" ".join(file_input.readlines() ) ) ] #remove html tags
    
    return all_labels, all_texts

In [6]:
y_train, x_train_text = read_files("train")

read train files:  25000


In [7]:
y_test, x_test_text = read_files("test")

read test files:  25000


In [8]:
token = Tokenizer( num_words=2000 )
token.fit_on_texts(x_train_text)

x_train_seq = token.texts_to_sequences(x_train_text)
x_test_seq = token.texts_to_sequences(x_test_text)

x_train_final = sequence.pad_sequences( x_train_seq, maxlen=100 )
x_test_final = sequence.pad_sequences( x_test_seq, maxlen=100 )

In [9]:
# add Embedding layer
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [10]:
# import RNN
from keras.layers.recurrent import LSTM

In [11]:
model = Sequential()

In [12]:
model.add( Embedding(output_dim=32, input_dim=2000, input_length=100) )
model.add( Dropout(0.35) )

In [13]:
# add RNN
model.add( LSTM(units=16) )

In [14]:
model.add( Dense(units=256, activation='relu') )
model.add( Dropout(0.5) )

In [15]:
model.add( Dense(units=1, activation='sigmoid') )

In [16]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 16)                3136      
_________________________________________________________________
dense_1 (Dense)              (None, 256)               4352      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 71,745
Trainable params: 71,745
Non-trainable params: 0
_________________________________________________________________


In [17]:
model.compile( loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'] )

In [18]:
train_history = model.fit( x_train_final, y_train, batch_size=100, epochs=10, verbose=2, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 48s - loss: 0.5338 - acc: 0.7218 - val_loss: 0.6683 - val_acc: 0.6496
Epoch 2/10
 - 41s - loss: 0.3401 - acc: 0.8531 - val_loss: 0.3027 - val_acc: 0.8682
Epoch 3/10
 - 41s - loss: 0.3088 - acc: 0.8733 - val_loss: 0.4796 - val_acc: 0.7580
Epoch 4/10
 - 42s - loss: 0.2938 - acc: 0.8787 - val_loss: 0.4703 - val_acc: 0.7904
Epoch 5/10
 - 42s - loss: 0.2825 - acc: 0.8847 - val_loss: 0.5189 - val_acc: 0.7642
Epoch 6/10
 - 42s - loss: 0.2737 - acc: 0.8879 - val_loss: 0.4994 - val_acc: 0.7732
Epoch 7/10
 - 42s - loss: 0.2627 - acc: 0.8935 - val_loss: 0.4929 - val_acc: 0.8028
Epoch 8/10
 - 43s - loss: 0.2544 - acc: 0.8981 - val_loss: 0.5807 - val_acc: 0.7634
Epoch 9/10
 - 42s - loss: 0.2448 - acc: 0.9016 - val_loss: 0.4602 - val_acc: 0.7984
Epoch 10/10
 - 41s - loss: 0.2327 - acc: 0.9071 - val_loss: 0.4849 - val_acc: 0.7704


In [19]:
scores = model.evaluate( x_test_final, y_test, verbose=1 )
print(scores[1])

0.83068


In [20]:
predict = model.predict_classes(x_test_final)

In [21]:
predict[:10]

array([[1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [22]:
# reshape
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 0, 1, 1, 1, 1, 1, 1, 1])

In [23]:
SentimentDict = {1:'positive', 0:'negative'}
def display_test_Sentiment(i):
    print(x_test_text[i])
    print('[label]')
    print('ground truth:', SentimentDict[y_test[i]] ) 
    print('predict result:', SentimentDict[ predict_classes[i] ] )

In [24]:
display_test_Sentiment(0)

I went and saw this movie last night after being coaxed to by a few friends of mine. I'll admit that I was reluctant to see it because from what I knew of Ashton Kutcher he was only able to do comedy. I was wrong. Kutcher played the character of Jake Fischer very well, and Kevin Costner played Ben Randall with such professionalism. The sign of a good movie is that it can toy with our emotions. This one did exactly that. The entire theater (which was sold out) was overcome by laughter during the first half of the movie, and were moved to tears during the second half. While exiting the theater I not only saw many women in tears, but many full grown men as well, trying desperately not to let anyone see them crying. This movie was great, and I suggest that you go see it before you judge.
[label]
ground truth: positive
predict result: positive


In [25]:
display_test_Sentiment(12500)

Once again Mr. Costner has dragged out a movie for far longer than necessary. Aside from the terrific sea rescue sequences, of which there are very few I just did not care about any of the characters. Most of us have ghosts in the closet, and Costner's character are realized early on, and then forgotten until much later, by which time I did not care. The character we should really care about is a very cocky, overconfident Ashton Kutcher. The problem is he comes off as kid who thinks he's better than anyone else around him and shows no signs of a cluttered closet. His only obstacle appears to be winning over Costner. Finally when we are well past the half way point of this stinker, Costner tells us all about Kutcher's ghosts. We are told why Kutcher is driven to be the best with no prior inkling or foreshadowing. No magic here, it was all I could do to keep from turning it off an hour in.
[label]
ground truth: negative
predict result: negative


In [26]:
display_test_Sentiment(12501)

This is an example of why the majority of action films are the same. Generic and boring, there's really nothing worth watching here. A complete waste of the then barely-tapped talents of Ice-T and Ice Cube, who've each proven many times over that they are capable of acting, and acting well. Don't bother with this one, go see New Jack City, Ricochet or watch New York Undercover for Ice-T, or Boyz n the Hood, Higher Learning or Friday for Ice Cube and see the real deal. Ice-T's horribly cliched dialogue alone makes this film grate at the teeth, and I'm still wondering what the heck Bill Paxton was doing in this film? And why the heck does he always play the exact same character? From Aliens onward, every film I've seen with Bill Paxton has him playing the exact same irritating character, and at least in Aliens his character died, which made it somewhat gratifying...Overall, this is second-rate action trash. There are countless better films to see, and if you really want to see this one, 