In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

Using TensorFlow backend.


In [2]:
import re
def rm_tags(text):
    re_tag = re.compile(r'<[^>]+>')
    return re_tag.sub('', text)

In [3]:
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list = []
    
    positive_path = path + filetype + "/pos/"
    for f in os.listdir(positive_path):
        file_list += [positive_path + f]
        
    negative_path = path + filetype + "/neg/"
    for f in os.listdir(negative_path):
        file_list += [negative_path + f]
    
    print('read', filetype, 'files:', len(file_list))
    
    all_labels = ([1] * 12500 + [0] * 12500)
    
    all_texts = []
    for fi in file_list:
        with open(fi, encoding = 'utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
    
    return all_labels, all_texts   

In [4]:
y_train, train_text = read_files("train")

read train files: 25000


In [5]:
y_test,  test_text  = read_files("test")

read test files: 25000


In [6]:
token = Tokenizer(num_words = 2000)
token.fit_on_texts(train_text)

In [7]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [8]:
x_train = sequence.pad_sequences(x_train_seq, maxlen = 100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen = 100)

In [9]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten
from keras.layers.embeddings import Embedding

In [10]:
model = Sequential()

In [11]:
model.add(Embedding(output_dim = 32,
                    input_dim  = 2000,
                    input_length = 100))
model.add(Dropout(0.2))

In [12]:
model.add(Flatten())

In [13]:
model.add(Dense(units = 256, activation = 'relu'))
model.add(Dropout(0.35))

In [14]:
model.add(Dense(units = 1, activation = 'sigmoid'))

In [15]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           64000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 3200)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 256)               819456    
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 257       
Total params: 883,713
Trainable params: 883,713
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(loss = 'binary_crossentropy',
              optimizer = 'adam',
              metrics = ['accuracy'])

In [17]:
train_history = model.fit(x_train, 
                          y_train, 
                          batch_size = 100,
                          epochs = 10,
                          verbose = 2,
                          validation_split = 0.2
                         )

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
 - 6s - loss: 0.4774 - acc: 0.7573 - val_loss: 0.3234 - val_acc: 0.8656
Epoch 2/10
 - 7s - loss: 0.2689 - acc: 0.8890 - val_loss: 0.5418 - val_acc: 0.7728
Epoch 3/10
 - 8s - loss: 0.1617 - acc: 0.9400 - val_loss: 0.5564 - val_acc: 0.7926
Epoch 4/10
 - 8s - loss: 0.0835 - acc: 0.9709 - val_loss: 0.8771 - val_acc: 0.7500
Epoch 5/10
 - 7s - loss: 0.0496 - acc: 0.9830 - val_loss: 0.6775 - val_acc: 0.8212
Epoch 6/10
 - 6s - loss: 0.0361 - acc: 0.9872 - val_loss: 1.4680 - val_acc: 0.6974
Epoch 7/10
 - 6s - loss: 0.0286 - acc: 0.9896 - val_loss: 1.3790 - val_acc: 0.7286
Epoch 8/10
 - 6s - loss: 0.0267 - acc: 0.9911 - val_loss: 1.1999 - val_acc: 0.7624
Epoch 9/10
 - 6s - loss: 0.0276 - acc: 0.9899 - val_loss: 1.1580 - val_acc: 0.7810
Epoch 10/10
 - 6s - loss: 0.0273 - acc: 0.9910 - val_loss: 1.3187 - val_acc: 0.7484


In [18]:
scores = model.evaluate(x_test, y_test, verbose=1)
scores[1]



0.8056

In [22]:
predict = model.predict_classes(x_test)
predict[:10]

array([[1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [21]:
predict[12501:12510]

array([[0],
       [0],
       [0],
       [0],
       [1],
       [1],
       [0],
       [0],
       [0]])

In [23]:
predict_classes = predict.reshape(-1)
predict_classes[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [24]:
predict_classes[12501:12511]

array([0, 0, 0, 0, 1, 1, 0, 0, 0, 1])

In [25]:
SentimentDict = {1:'Positive', 0:'Negative'}
def display_test_Sentiment(i):
    print(test_text[i])
    print('\nLabel:', SentimentDict[y_test[i]],
        ', Predict:', SentimentDict[predict_classes[i]])

In [26]:
display_test_Sentiment(2)

As a recreational golfer with some knowledge of the sport's history, I was pleased with Disney's sensitivity to the issues of class in golf in the early twentieth century. The movie depicted well the psychological battles that Harry Vardon fought within himself, from his childhood trauma of being evicted to his own inability to break that glass ceiling that prevents him from being accepted as an equal in English golf society. Likewise, the young Ouimet goes through his own class struggles, being a mere caddie in the eyes of the upper crust Americans who scoff at his attempts to rise above his standing. What I loved best, however, is how this theme of class is manifested in the characters of Ouimet's parents. His father is a working-class drone who sees the value of hard work but is intimidated by the upper class; his mother, however, recognizes her son's talent and desire and encourages him to pursue his dream of competing against those who think he is inferior.Finally, the golf scenes

In [27]:
display_test_Sentiment(12501)

This is an example of why the majority of action films are the same. Generic and boring, there's really nothing worth watching here. A complete waste of the then barely-tapped talents of Ice-T and Ice Cube, who've each proven many times over that they are capable of acting, and acting well. Don't bother with this one, go see New Jack City, Ricochet or watch New York Undercover for Ice-T, or Boyz n the Hood, Higher Learning or Friday for Ice Cube and see the real deal. Ice-T's horribly cliched dialogue alone makes this film grate at the teeth, and I'm still wondering what the heck Bill Paxton was doing in this film? And why the heck does he always play the exact same character? From Aliens onward, every film I've seen with Bill Paxton has him playing the exact same irritating character, and at least in Aliens his character died, which made it somewhat gratifying...Overall, this is second-rate action trash. There are countless better films to see, and if you really want to see this one, 

In [28]:
input_text = '''
Being a big fan of Disney, well most of Disney's cartoon movies and seeing that some of the real life movies were not bad, I had high hopes for this. Just watched it on streaming and couldn't make it through the last 2 hours but I did. 
Where oh where did the people at Disney approve this rolling CGI, actors, acting and singing choice of just about everything, NIGHTMARE!!!
I watched it on streaming thinking I would purchase it and pair it with the original. I really want to get my money back from the streaming service. It was that bad. 
Sorry modern Disney is fast slipping from the movie lexicon of memorable movies and into the dumpster.
'''

In [29]:
input_seq = token.texts_to_sequences([input_text])

In [30]:
print(input_seq[0])

[108, 3, 190, 333, 4, 909, 69, 87, 4, 1068, 98, 2, 315, 11, 45, 4, 1, 143, 109, 98, 67, 20, 75, 9, 65, 308, 1905, 14, 10, 39, 292, 8, 19, 2, 422, 93, 8, 139, 1, 232, 237, 630, 17, 9, 118, 117, 445, 117, 118, 1, 80, 29, 909, 10, 1679, 152, 112, 2, 1115, 1097, 4, 39, 40, 282, 1720, 9, 292, 8, 19, 531, 9, 58, 8, 2, 8, 15, 1, 200, 9, 62, 177, 5, 74, 57, 274, 141, 35, 1, 8, 12, 11, 75, 801, 677, 909, 6, 699, 35, 1, 16, 4, 902, 98, 2, 79, 1]


In [31]:
len(input_seq[0])

106

In [32]:
pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100)
len(pad_input_seq[0])

100

In [33]:
predict_result = model.predict_classes(pad_input_seq)
predict_result

array([[0]])

In [34]:
predict_result[0][0]

0

In [35]:
SentimentDict[predict_result[0][0]]

'Negative'

In [36]:
def predict_review(input_text):
    input_seq = token.texts_to_sequences([input_text])
    pad_input_seq = sequence.pad_sequences(input_seq, maxlen=100)
    predict_result = model.predict_classes(pad_input_seq)
    print(SentimentDict[predict_result[0][0]])

In [41]:
# Positive
predict_review('''
While Beauty and the Beast boasts some fantastic set pieces, wonderful costumes and great performances. The entire film just feels like a retread of the original animated classic, and as a result doesn't feel new (like The Jungle Book) or boring (like Malificent) just uninspired. If I had a choice I'd watch the original, and while I don't like to compare remakes it's hard when this film's job is to give you the nostalgia you felt for the first movie.
''')

Positive


In [42]:
# Negative
predict_review('''
I really cant find the words to put my disatisfaction with this movie the best way i can. At one hand i really enjoyed the graphics and in general whole production was great, the songs, designs , even the story was ok when observed detached from the whole movie, BUT once it all is merged together with actors and all.... it was just not.... good. I did like Gaston.
''')

Positive
