In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
from tensorflow import keras
from keras.datasets import imdb
from keras.models import Sequential, load_model
from keras.layers import LSTM, Embedding, Dense, Conv1D, GRU, Dropout, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [4]:
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

In [5]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [6]:
len(x_train), len(x_test)

(25000, 25000)

In [7]:
category = max(y_train) + 1
category

2

In [8]:
max(len(l) for l in x_train)

2494

In [9]:
sum(map(len, x_train)) / len(x_train)

238.71364

In [10]:
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
np.asarray((unique_elements, counts_elements))

array([[    0,     1],
       [12500, 12500]], dtype=int64)

In [11]:
word_to_index = imdb.get_word_index()
index_to_word = {}
for key, value in word_to_index.items():
    index_to_word[value] = key
len(word_to_index)

88584

In [12]:
print('빈도수 상위 1번 단어 : {}'.format(index_to_word[1]))
print('빈도수 상위 12345번 단어 : {}'.format(index_to_word[12345]))
print('빈도수 최하위(88584) 단어 : {}'.format(index_to_word[88584]))

빈도수 상위 1번 단어 : the
빈도수 상위 12345번 단어 : liberated
빈도수 최하위(88584) 단어 : 'l'


In [13]:
# 일곱번째 리뷰 복원
print(' '.join([index_to_word[s] for s in x_train[6]]))

the boiled full involving to impressive boring this as murdering naschy br villain council suggestion need has of costumes b message to may of props this echoed concentrates concept issue skeptical to god's he is dedications unfolds movie women like isn't surely i'm rocketed to toward in here's for from did having because very quality it is captain's starship really book is both too worked carl of mayfair br of reviewer closer figure really there will originals things is far this make mistakes kevin's was couldn't of few br of you to don't female than place she to was between that nothing dose movies get are 498 br yes female just its because many br of overly to descent people time very bland


### LSTM 5000/500

In [14]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [15]:
max_len = 500
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [16]:
model = Sequential([
    Embedding(5000, 120),
    LSTM(120),
    Dense(1, activation='sigmoid')
])

In [17]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 120)               115680    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 121       
Total params: 715,801
Trainable params: 715,801
Non-trainable params: 0
_________________________________________________________________


In [18]:
model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [19]:
es1 = EarlyStopping(mode='min', verbose=1, patience=4)
mc1 = ModelCheckpoint('model/imdb_lstm_best_model.h5', mode='max', verbose =1, monitor='val_accuracy', save_best_only=True)

In [20]:
history = model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_test, y_test), callbacks=[es1, mc1])

Train on 25000 samples, validate on 25000 samples
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.83516, saving model to model/imdb_lstm_best_model.h5
Epoch 2/10

Epoch 00002: val_accuracy improved from 0.83516 to 0.85784, saving model to model/imdb_lstm_best_model.h5
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.85784
Epoch 4/10

Epoch 00004: val_accuracy improved from 0.85784 to 0.86480, saving model to model/imdb_lstm_best_model.h5
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.86480
Epoch 6/10

KeyboardInterrupt: 

### GRU

In [None]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [None]:
max_len = 500
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [None]:
model2 = Sequential([
    Embedding(5000, 120),
    GRU(120),
    Dense(1, activation='sigmoid')
])

In [None]:
model2.summary()

In [None]:
model2.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [None]:
es2 = EarlyStopping(mode='min', verbose=1, patience=4)
mc2 = ModelCheckpoint('model/imdb_lstm_best_model.h5', mode='max', verbose =1, monitor='val_accuracy', save_best_only=True)

In [None]:
history2 = model2.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_test, y_test), callbacks=[es1, mc1])

### LSTM + CNN

In [None]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [None]:
max_len = 100
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [None]:
model3 = Sequential([
    Embedding(5000,120),
    Dropout(0.5),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(4),
    LSTM(55),
    Dense(1, activation='sigmoid')
])

In [None]:
model3.summary()

In [None]:
model3.compile(loss = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics=['accuracy'])

In [None]:
es3 = EarlyStopping(mode='min', verbose=1, patience=4)
mc3 = ModelCheckpoint('model/imdb_cnn_best_model.h5', monitor='val_accuracy', verbose=1, mode = 'max', save_best_only=True)

In [None]:
history3 = model3.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test), callbacks=[es3, mc3])

In [None]:
del model3

In [None]:
model3 = load_model('model/imdb_cnn_best_model.h5')

In [None]:
model3.evaluate(x_test, y_test)[1]