In [1]:
import numpy as np
import re
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, Conv1D, MaxPool1D, Flatten, Embedding
from tensorflow.keras.preprocessing import sequence

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [41]:
def read_from_file(filename, max_review_length):
    text = ''
    with open(filename, 'r') as f:
        text = f.read().lower()
        text = (re.sub(r"[^a-zA-Z0-9']", " ", text)).split()
    dictionary = imdb.get_word_index()
    vectorized = []
    for word in text:
        word = dictionary.get(word)
        if word in range(1, 10000):
            vectorized.append(word + 3)
    padded = []
    padded.append(vectorized)
    result = sequence.pad_sequences(padded, maxlen=max_review_length)
    return result

In [38]:
def ensemble_predict(models, x):
    y = []
    for model in models:
        y.append(model.predict(x, verbose=1))
    result = np.asarray(y)
    return np.round(np.mean(result, 0))

In [31]:
(training_data, training_targets), (testing_data, testing_targets) = imdb.load_data(num_words=10000)
data = np.concatenate((training_data, testing_data), axis=0)
targets = np.concatenate((training_targets, testing_targets), axis=0)

test_size = 10000
X_test = data[:test_size]
Y_test = targets[:test_size]
X_train = data[test_size:]
Y_train = targets[test_size:]

In [32]:
max_review_length = 500
voc_size = 10000
embedding_len = 32
X_train = sequence.pad_sequences(X_train, maxlen=max_review_length)
X_test = sequence.pad_sequences(X_test, maxlen=max_review_length)

In [16]:
def model_1():
    model = Sequential()
    model.add(Embedding(voc_size, embedding_len, input_length=max_review_length))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPool1D(pool_size=2))
    model.add(LSTM(100, dropout=0.3))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [17]:
def model_2():
    model = Sequential()
    model.add(Embedding(voc_size, embedding_len, input_length=max_review_length))
    model.add(Conv1D(filters=16, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPool1D(pool_size=2))
    model.add(Dropout(0.25))
    model.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
    model.add(MaxPool1D(pool_size=2))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [18]:
def model_3():
    model = Sequential()
    model.add(Embedding(voc_size, embedding_len, input_length=max_review_length))
    model.add(Dense(64))
    model.add(Dropout(0.25))
    model.add(Dense(128))
    model.add(Dropout(0.5))
    model.add(LSTM(100))
    model.add(Dense(1, activation='sigmoid'))
    return model

In [19]:
models = [model_1(), model_2(), model_3()]
train_size = len(X_train) // len(models)
test_size = len(X_test) // len(models)
for i, model in enumerate(models):
    x_train = X_train[i * train_size: (i + 1) * train_size]
    y_train = Y_train[i * train_size: (i + 1) * train_size]
    x_test = X_test[i * test_size: (i + 1) * test_size]
    y_test = Y_test[i * test_size: (i + 1) * test_size]
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    print("TRAINING MODEL {}".format(i))
    model.fit(x_train, y_train, validation_split=0.1, epochs=2, batch_size=64, verbose=1)
    scores = model.evaluate(x_test, y_test, verbose=0)
    print("model_{} accuracy: {}".format(i, scores[1]))

TRAINING MODEL 0
Train on 11999 samples, validate on 1334 samples
Epoch 1/2
Epoch 2/2
model_0 accuracy: 0.8502850532531738
TRAINING MODEL 1
Train on 11999 samples, validate on 1334 samples
Epoch 1/2
Epoch 2/2
model_1 accuracy: 0.8787878751754761
TRAINING MODEL 2
Train on 11999 samples, validate on 1334 samples
Epoch 1/2
Epoch 2/2
model_2 accuracy: 0.8454845547676086


In [39]:
from sklearn.metrics import accuracy_score
ensemble_prediction = ensemble_predict(models, X_test)
acc = accuracy_score(Y_test, ensemble_prediction)
print("Ensamble accuracy: {}".format(acc))

Ensamble accuracy: 0.8869


In [34]:
print(models[0].predict(X_test, verbose=1))

[[0.9295374 ]
 [0.02988201]
 [0.02072987]
 ...
 [0.9422792 ]
 [0.04105219]
 [0.03728533]]


In [49]:
for i in range(1, 5):
    text = read_from_file(str(i), max_review_length)
    ensemble_prediction = ensemble_predict(models, text)
    print('Prediction for text {} is {}'.format(i, ensemble_prediction))

Prediction for text 1 is [[1.]]
Prediction for text 2 is [[0.]]
Prediction for text 3 is [[1.]]
Prediction for text 4 is [[0.]]
