In [None]:
import keras
import os

# Reviews are read from <imdb_dir>/imdb/{neg,pos}/*.txt.
imdb_dir = 'input'
train_dir = os.path.join(imdb_dir, 'imdb')

labels = []  # 0 for a review found under 'neg', 1 for one found under 'pos'
texts = []   # raw review text, index-aligned with `labels`

# 'neg' is processed before 'pos', so both lists end up grouped by class;
# the index-based hold-out split that follows relies on that grouping.
for label_type in ['neg', 'pos']:
    dir_name = os.path.join(train_dir, label_type)
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # order of `texts` (and therefore the later split) reproducible.
    for fname in sorted(os.listdir(dir_name)):
        if fname.endswith('.txt'):
            # The context manager closes the handle even if read() raises.
            # NOTE(review): the files are assumed to be UTF-8 rather than the
            # platform's default encoding -- confirm against the dataset.
            with open(os.path.join(dir_name, fname), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(0 if label_type == 'neg' else 1)
                
# Carve out a held-out test set: items [0, 1250) and [12500, 13750) of the
# loaded lists. Everything else up to index 25000 stays as training data.
# NOTE(review): the hard-coded offsets presumably correspond to 12,500
# negative reviews followed by 12,500 positive ones -- confirm for the
# dataset actually present on disk.
labels_test = labels[0:1250] + labels[12500:13750]
texts_test = texts[0:1250] + texts[12500:13750]

# Rebind the working lists to the remaining (training) portion. This must
# happen after the test slices above have been taken from the full lists.
labels = labels[1250:12500] + labels[13750:25000]
texts = texts[1250:12500] + texts[13750:25000]

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

max_words = 10000  # vocabulary cap handed to the Tokenizer as num_words
maxlen = 500  # fixed number of token ids kept per review after padding

# The vocabulary is fitted on the training texts only.
tokenizer = Tokenizer(num_words=max_words)  # https://keras.io/preprocessing/text/
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index  # mapping exposed by the fitted tokenizer

# Strings -> lists of integer word ids -> one fixed-width integer row per review.
# NOTE(review): pad_sequences is called with its default padding/truncating
# modes; in Keras both default to 'pre', i.e. long reviews keep their *last*
# `maxlen` tokens -- confirm against the installed version.
sequences = tokenizer.texts_to_sequences(texts)
data = pad_sequences(sequences, maxlen=maxlen)

# Labels become an array so they can be fancy-indexed alongside `data`.
labels = np.asarray(labels)

# Shuffle the samples before they are split into training and validation
# sets, since the reviews were loaded ordered by class (all negative first,
# then all positive). The same permutation is applied to `data` and `labels`
# so that rows and labels stay aligned.
#
# A dedicated, seeded generator is used instead of the unseeded global
# np.random state so the resulting order is identical on every run.
shuffle_rng = np.random.RandomState(0)
indices = shuffle_rng.permutation(data.shape[0])
data = data[indices]
labels = labels[indices]

# Multi-hot encoding of the padded id matrix: onehot[i, w - 1] is set to 1
# when word id w occurs anywhere in row i of `data`. Id 0 (padding) is
# skipped, and ids are shifted down by one so that id 1 maps to column 0.
onehot = np.zeros((data.shape[0], max_words))

# Vectorised replacement for a Python loop over every (row, position) cell:
# np.nonzero yields the coordinates of all non-padding entries, and a single
# fancy-indexed assignment marks the corresponding columns.
sample_idx, token_pos = np.nonzero(data)
onehot[sample_idx, data[sample_idx, token_pos] - 1] = 1
            
from sklearn.model_selection import train_test_split
# accuracy_score is imported from the public sklearn.metrics namespace: the
# sklearn.metrics.classification submodule was deprecated in scikit-learn 0.22
# and is no longer importable in later releases.
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from models import SupervisedDBNClassification

# Features are the multi-hot matrix cast to float32; targets are the labels.
X = onehot.astype(np.float32)
Y = labels

# Hold out 2/9 of the samples; random_state=0 pins the split.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=2/9, random_state=0)

# Hyperparameters for the classifier, collected in one place.
# NOTE(review): the keyword names suggest RBM pre-training followed by
# backprop fine-tuning -- see the local `models` module for their meaning.
dbn_params = dict(
    hidden_layers_structure=[16, 16],
    learning_rate_rbm=0.01,
    learning_rate=0.1,
    n_epochs_rbm=10,
    n_iter_backprop=20,
    batch_size=256,
    activation_function='relu',
    dropout_p=0.1,
)
classifier = SupervisedDBNClassification(**dbn_params)

In [None]:
%%time
# Fit the classifier on the training split; the %%time cell magic reports how
# long this cell takes to run.
classifier.fit(X_train, Y_train)

Collect the training time on the training set.

In [None]:
# Test
# Encode the held-out reviews with the tokenizer that was already fitted, so
# no vocabulary is learned from the test texts.
# Note: lowercase x_test / y_test are this held-out set and are distinct from
# the X_test / Y_test produced by train_test_split.
sequences = tokenizer.texts_to_sequences(texts_test)
x_test = pad_sequences(sequences, maxlen=maxlen)
y_test = np.asarray(labels_test)

# Multi-hot encoding: onehot_test[i, w - 1] is 1 when word id w occurs in
# row i. Id 0 (padding) is skipped. Done with one vectorised assignment
# instead of a Python loop over every (row, position) cell.
onehot_test = np.zeros((x_test.shape[0], max_words))
sample_idx, token_pos = np.nonzero(x_test)
onehot_test[sample_idx, x_test[sample_idx, token_pos] - 1] = 1

# From here on x_test holds the float32 multi-hot features, not the id matrix.
x_test = onehot_test.astype(np.float32)

In [None]:
%%time
y_pred = classifier.predict(x_test)
print('Done.\nAccuracy: %f' % accuracy_score(y_test, y_pred))

Collect the test accuracy on the test set.