In [1]:
import numpy as np
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential,load_model
from tensorflow.keras.layers import Dense,Dropout,LSTM, Embedding,Flatten
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

vocab_size = 35000
maxlen=100
dictionary_word = imdb.get_word_index(path='imdb_word_index.json')
dictionary_index = {value:key for key,value in zip(dictionary_word.keys(),dictionary_word.values())}
stopwords_ = stopwords.words('english')

(x_train, y_train), (x_test,y_test) = imdb.load_data(num_words = vocab_size)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lyt09\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
stopwords_idx=[]
for idx in stopwords_:
    try:
        stopwords_idx.append(dictionary_word[idx])
    except:
        continue

In [3]:
def word_preprocessing(stopwords_idx,x_train):
    x_train_pre = [np.array(x) for x in x_train]
    x_train_pre = np.array(x_train_pre)

    for word in stopwords_idx:
        for idx,x in enumerate(x_train_pre):
            x_train_pre[idx] = np.delete(x,np.where(x==word))
    return x_train_pre

In [4]:
x_train = word_preprocessing(stopwords_idx,x_train)
x_test = word_preprocessing(stopwords_idx,x_test)

In [5]:
x_train = sequence.pad_sequences(x_train, maxlen=maxlen,padding='post')
x_test = sequence.pad_sequences(x_test, maxlen=maxlen,padding='post')
print(x_train[1])

[  194  1153   194  8255   228  1463  4369  5012   715  1634   394   954
   189   102   207   110  3103   188     7   249    93   114  2300  1523
   647   116  8163   229   340  1322  4901    19  1002   952    37   455
  1543   398  1649  6853   163  3215 10156  1153   194   775     7  8255
 11596   349  2637   148   605 15358  8003   123   125 23141  6853   349
   165  4362   228  1157   299   120   120   174   220   175   136  4373
   228  8255 25249   656   245  2350  9837   152   491  7464  1212   371
   625    64  1382  1690  1355    28   154   462   285     0     0     0
     0     0     0     0]


In [9]:
for i in x_train[1]:
    print(i, ':', dictionary_index[i])

194 : thought
1153 : solid
194 : thought
8255 : senator
228 : making
1463 : spot
4369 : nomination
5012 : assumed
715 : jack
1634 : picked
394 : getting
954 : hands
189 : fact
102 : characters
207 : always
110 : life
3103 : thrillers
188 : can't
7 : br
249 : sure
93 : way
114 : little
2300 : strongly
1523 : random
647 : view
116 : love
8163 : principles
229 : guy
340 : used
1322 : producer
4901 : icon
19 : film
1002 : outside
952 : unique
37 : like
455 : direction
1543 : imagination
398 : keep
1649 : queen
6853 : diverse
163 : makes
3215 : stretch
10156 : stefan
1153 : solid
194 : thought
775 : begins
7 : br
8255 : senator
11596 : machinations
349 : budget
2637 : worthwhile
148 : though
605 : ok
15358 : brokedown
8003 : awaiting
123 : ever
125 : better
23141 : lugia
6853 : diverse
349 : budget
165 : look
4362 : kicked
228 : making
1157 : follows
299 : effects
120 : show
120 : show
174 : cast
220 : family
175 : us
136 : scenes
4373 : severe
228 : making
8255 : senator
25249 : levant's
6

KeyError: 0

In [10]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size,output_dim=128,input_length=maxlen))
model.add(LSTM(128,dropout = 0.2, recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam', metrics=['acc'])

che = 'keras_model.h5'
point = ModelCheckpoint(filepath=che , monitor='val_loss', verbose=1, save_best_only=True)

In [18]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 100, 128)          4480000   
_________________________________________________________________
lstm (LSTM)                  (None, 128)               131584    
_________________________________________________________________
dense (Dense)                (None, 1)                 129       
Total params: 4,611,713
Trainable params: 4,611,713
Non-trainable params: 0
_________________________________________________________________


In [12]:
model.fit(x_train,y_train,batch_size=32, epochs=10,validation_data = (x_test, y_test),callbacks=[point])

Epoch 1/10
Epoch 00001: val_loss improved from inf to 0.37573, saving model to keras_model1
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.
INFO:tensorflow:Assets written to: keras_model1\assets
Epoch 2/10
Epoch 00002: val_loss did not improve from 0.37573
Epoch 3/10
Epoch 00003: val_loss did not improve from 0.37573
Epoch 4/10
Epoch 00004: val_loss did not improve from 0.37573
Epoch 5/10
Epoch 00005: val_loss did not improve from 0.37573
Epoch 6/10
Epoch 00006: val_loss did not improve from 0.37573
Epoch 7/10
Epoch 00007: val_loss did not improve from 0.37573
Epoch 8/10
Epoch 00008: val_loss did not improve from 0.37573
Epoch 9/10
Epoch 00009: val_loss did not improve from 0.37573
Epoch 10/10
Epoch 00010: val_loss did not improve from 0.37573


<tensorflow.python.keras.callbacks.History at 0x2c177932520>

In [None]:
model = load_model('keras_model.h5')

In [13]:
pred = model.predict(x_train[1])
print(np.round(pred),y_train[1])


[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [1.]
 [1.]
 [1.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [1.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]] 0


In [17]:
pred1 = np.round(pred)
pred1 = pred1.reshape(-1).astype('int')

for i,pred in zip(x_train[1],pred1):
    print(i, ':', dictionary_index[i],'/', 'predict : ', pred)

194 : thought / predict :  0
1153 : solid / predict :  0
194 : thought / predict :  0
8255 : senator / predict :  0
228 : making / predict :  0
1463 : spot / predict :  1
4369 : nomination / predict :  1
5012 : assumed / predict :  1
715 : jack / predict :  1
1634 : picked / predict :  1
394 : getting / predict :  0
954 : hands / predict :  0
189 : fact / predict :  0
102 : characters / predict :  0
207 : always / predict :  0
110 : life / predict :  1
3103 : thrillers / predict :  0
188 : can't / predict :  1
7 : br / predict :  0
249 : sure / predict :  0
93 : way / predict :  0
114 : little / predict :  0
2300 : strongly / predict :  1
1523 : random / predict :  0
647 : view / predict :  0
116 : love / predict :  0
8163 : principles / predict :  0
229 : guy / predict :  0
340 : used / predict :  0
1322 : producer / predict :  0
4901 : icon / predict :  1
19 : film / predict :  1
1002 : outside / predict :  1
952 : unique / predict :  0
37 : like / predict :  1
455 : direction / pred

KeyError: 0

In [24]:
print('테스트 정확도 : %.4f'% (model.evaluate(x_test,y_test)[1]))

테스트 정확도 : 0.8311
