In [1]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from tensorflow import keras
from keras.datasets import imdb
from keras.models import Sequential, load_model
from keras.layers import LSTM, Embedding, Dense, Conv1D, GRU, Dropout, MaxPooling1D
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical

Using TensorFlow backend.


In [3]:
(x_train, y_train), (x_test, y_test) = imdb.load_data()

In [4]:
len(x_train), len(x_test)

(25000, 25000)

In [5]:
category = max(y_train) + 1
category

2

In [6]:
max(len(l) for l in x_train)

2494

In [7]:
sum(map(len, x_train)) / len(x_train)

238.71364

In [8]:
unique_elements, counts_elements = np.unique(y_train, return_counts=True)
np.asarray((unique_elements, counts_elements))

array([[    0,     1],
       [12500, 12500]], dtype=int64)

In [9]:
word_to_index = imdb.get_word_index()
index_to_word = {}
for key, value in word_to_index.items():
    index_to_word[value] = key
len(word_to_index)

88584

In [10]:
print('빈도수 상위 1번 단어 : {}'.format(index_to_word[1]))
print('빈도수 상위 12345번 단어 : {}'.format(index_to_word[12345]))
print('빈도수 최하위(88584) 단어 : {}'.format(index_to_word[88584]))

빈도수 상위 1번 단어 : the
빈도수 상위 12345번 단어 : liberated
빈도수 최하위(88584) 단어 : 'l'


In [11]:
# 일곱번째 리뷰 복원
print(' '.join([index_to_word[s] for s in x_train[6]]))

the boiled full involving to impressive boring this as murdering naschy br villain council suggestion need has of costumes b message to may of props this echoed concentrates concept issue skeptical to god's he is dedications unfolds movie women like isn't surely i'm rocketed to toward in here's for from did having because very quality it is captain's starship really book is both too worked carl of mayfair br of reviewer closer figure really there will originals things is far this make mistakes kevin's was couldn't of few br of you to don't female than place she to was between that nothing dose movies get are 498 br yes female just its because many br of overly to descent people time very bland


### LSTM 5000/500

In [12]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [13]:
max_len = 500
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [14]:
model = Sequential([
    Embedding(5000, 120),
    LSTM(120),
    Dense(1, activation='sigmoid')
])

In [15]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
lstm_1 (LSTM)                (None, 120)               115680    
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 121       
Total params: 715,801
Trainable params: 715,801
Non-trainable params: 0
_________________________________________________________________


In [16]:
model.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [17]:
es1 = EarlyStopping(mode='min', verbose=1, patience=4)
mc1 = ModelCheckpoint('model/imdb_lstm_best_model.h5', mode='max', verbose =1, monitor='val_accuracy', save_best_only=True)

In [18]:
history = model.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_test, y_test), callbacks=[es1, mc1])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
 5824/25000 [=====>........................] - ETA: 11:36 - loss: 0.6593 - accuracy: 0.6494

KeyboardInterrupt: 

### GRU

In [20]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [21]:
max_len = 500
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [22]:
model2 = Sequential([
    Embedding(5000, 120),
    GRU(120),
    Dense(1, activation='sigmoid')
])

In [23]:
model2.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
gru_1 (GRU)                  (None, 120)               86760     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 121       
Total params: 686,881
Trainable params: 686,881
Non-trainable params: 0
_________________________________________________________________


In [24]:
model2.compile(loss='binary_crossentropy', optimizer = 'adam', metrics=['accuracy'])

In [25]:
es2 = EarlyStopping(mode='min', verbose=1, patience=4)
mc2 = ModelCheckpoint('model/imdb_lstm_best_model.h5', mode='max', verbose =1, monitor='val_accuracy', save_best_only=True)

In [None]:
history2 = model2.fit(x_train, y_train, batch_size=64, epochs=10, validation_data=(x_test, y_test), callbacks=[es1, mc1])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10
  256/25000 [..............................] - ETA: 17:33 - loss: 0.6926 - accuracy: 0.5273

### LSTM + CNN

In [15]:
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)

In [16]:
max_len = 100
x_train = pad_sequences(x_train, max_len)
x_test = pad_sequences(x_test, max_len)

In [17]:
model3 = Sequential([
    Embedding(5000,120),
    Dropout(0.5),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(4),
    LSTM(55),
    Dense(1, activation='sigmoid')
])

In [18]:
model3.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 120)         600000    
_________________________________________________________________
dropout_3 (Dropout)          (None, None, 120)         0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, None, 64)          38464     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, None, 64)          0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 55)                26400     
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 56        
Total params: 664,920
Trainable params: 664,920
Non-trainable params: 0
________________________________________________

In [19]:
model3.compile(loss = 'binary_crossentropy',
                 optimizer = 'adam',
                 metrics=['accuracy'])

In [20]:
es3 = EarlyStopping(mode='min', verbose=1, patience=4)
mc3 = ModelCheckpoint('model/imdb_cnn_best_model.h5', monitor='val_accuracy', verbose=1, mode = 'max', save_best_only=True)

In [21]:
history3 = model3.fit(x_train, y_train, epochs=10, batch_size=64, validation_data=(x_test, y_test), callbacks=[es3, mc3])

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/10

Epoch 00001: val_accuracy improved from -inf to 0.85812, saving model to model/imdb_cnn_best_model.h5
Epoch 2/10

Epoch 00002: val_accuracy did not improve from 0.85812
Epoch 3/10

Epoch 00003: val_accuracy did not improve from 0.85812
Epoch 4/10

Epoch 00004: val_accuracy did not improve from 0.85812
Epoch 5/10

Epoch 00005: val_accuracy did not improve from 0.85812
Epoch 6/10

Epoch 00006: val_accuracy did not improve from 0.85812
Epoch 00006: early stopping


In [22]:
del model3

In [23]:
model3 = load_model('model/imdb_cnn_best_model.h5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


In [24]:
model3.evaluate(x_test, y_test)[1]



0.8581200242042542