In [1]:
from keras.datasets import imdb
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

import re
re_tag = re.compile(r'<[^>]+>')

def rm_tags(text):
    return re_tag.sub('', text)
    
import os
def read_files(filetype):
    path = "data/aclImdb/"
    file_list=[]

    positive_path=path + filetype+"/pos/"
    for f in os.listdir(positive_path)[:5000]:
        file_list+=[positive_path+f]
    
    negative_path=path + filetype+"/neg/"
    for f in os.listdir(negative_path)[:5000]:
        file_list+=[negative_path+f]
        
    print('read',filetype, 'files:',len(file_list))
       
    all_labels = ([1] * 5000 + [0] * 5000) 
    
    all_texts  = []
    
    for fi in file_list:
        with open(fi,encoding='utf8') as file_input:
            all_texts += [rm_tags(" ".join(file_input.readlines()))]
            
    return all_labels,all_texts

In [3]:
y_train,train_text=read_files("train")
y_test,test_text=read_files("test")

token = Tokenizer(num_words=3000)
token.fit_on_texts(train_text)

read train files: 10000
read test files: 10000


In [4]:
x_train_seq = token.texts_to_sequences(train_text)
x_test_seq  = token.texts_to_sequences(test_text)

In [5]:
x_train = sequence.pad_sequences(x_train_seq, maxlen=100)
x_test  = sequence.pad_sequences(x_test_seq,  maxlen=100)

In [6]:
import numpy as np
y_train_array = np.array(y_train)
y_test_array = np.array(y_test)

In [8]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM

model = Sequential()
model.add(Embedding(output_dim=32,
                    input_dim=3000, 
                    input_length=100))
model.add(Dropout(0.35))
model.add(LSTM(32))
model.add(Dense(units=256,activation='relu' ))
model.add(Dropout(0.35))
model.add(Dense(units=1,activation='sigmoid' ))

In [9]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 32)           96000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 100, 32)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 32)                8320      
_________________________________________________________________
dense (Dense)                (None, 256)               8448      
_________________________________________________________________
dropout_2 (Dropout)          (None, 256)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 257       
Total params: 113,025
Trainable params: 113,025
Non-trainable params: 0
________________________________________________

In [10]:
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])

In [11]:
train_history =model.fit(x_train, y_train_array,batch_size=100, 
                         epochs=20,verbose=2,
                         validation_split=0.2)

Epoch 1/20
80/80 - 9s - loss: 0.6033 - accuracy: 0.6646 - val_loss: 0.5825 - val_accuracy: 0.7815
Epoch 2/20
80/80 - 7s - loss: 0.3359 - accuracy: 0.8562 - val_loss: 0.5094 - val_accuracy: 0.7390
Epoch 3/20
80/80 - 7s - loss: 0.2623 - accuracy: 0.8959 - val_loss: 0.5463 - val_accuracy: 0.7680
Epoch 4/20
80/80 - 8s - loss: 0.2103 - accuracy: 0.9202 - val_loss: 0.4851 - val_accuracy: 0.8015
Epoch 5/20
80/80 - 9s - loss: 0.2044 - accuracy: 0.9197 - val_loss: 0.5155 - val_accuracy: 0.7960
Epoch 6/20
80/80 - 9s - loss: 0.1724 - accuracy: 0.9321 - val_loss: 0.7288 - val_accuracy: 0.7715
Epoch 7/20
80/80 - 7s - loss: 0.1599 - accuracy: 0.9392 - val_loss: 0.9525 - val_accuracy: 0.7055
Epoch 8/20
80/80 - 6s - loss: 0.1413 - accuracy: 0.9463 - val_loss: 0.8556 - val_accuracy: 0.7355
Epoch 9/20
80/80 - 6s - loss: 0.1288 - accuracy: 0.9539 - val_loss: 0.9787 - val_accuracy: 0.7475
Epoch 10/20
80/80 - 6s - loss: 0.1200 - accuracy: 0.9571 - val_loss: 1.0880 - val_accuracy: 0.7495
Epoch 11/20
80/80 -

In [12]:
scores = model.evaluate(x_test,y_test_array,verbose = 1)
scores[1]



0.798799991607666

In [13]:
predict = model.predict_classes(x_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [14]:
predict[:10]

array([[1],
       [1],
       [0],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1],
       [1]])

In [15]:
y_train_array[:10]

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1])