In [1]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

### 1. 讀入深度學習套件

In [3]:
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.datasets import imdb

### 2. 讀入數據

In [27]:
# Load the IMDB review dataset, split into training and test sets.
# num_words=8000 limits the vocabulary to the 8000 most common words.
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=8000)

### *檢視數據集資料

In [28]:
# Number of reviews in the training split, then in the test split (one per line).
print(len(x_train), len(x_test), sep="\n")

25000
25000


In [29]:
# Display the first training review; it is stored as a list of integer word indices.
x_train[0]

[1,
 14,
 22,
 16,
 43,
 530,
 973,
 1622,
 1385,
 65,
 458,
 4468,
 66,
 3941,
 4,
 173,
 36,
 256,
 5,
 25,
 100,
 43,
 838,
 112,
 50,
 670,
 2,
 9,
 35,
 480,
 284,
 5,
 150,
 4,
 172,
 112,
 167,
 2,
 336,
 385,
 39,
 4,
 172,
 4536,
 1111,
 17,
 546,
 38,
 13,
 447,
 4,
 192,
 50,
 16,
 6,
 147,
 2025,
 19,
 14,
 22,
 4,
 1920,
 4613,
 469,
 4,
 22,
 71,
 87,
 12,
 16,
 43,
 530,
 38,
 76,
 15,
 13,
 1247,
 4,
 22,
 17,
 515,
 17,
 12,
 16,
 626,
 18,
 2,
 5,
 62,
 386,
 12,
 8,
 316,
 8,
 106,
 5,
 4,
 2223,
 5244,
 16,
 480,
 66,
 3785,
 33,
 4,
 130,
 12,
 16,
 38,
 619,
 5,
 25,
 124,
 51,
 36,
 135,
 48,
 25,
 1415,
 33,
 6,
 22,
 12,
 215,
 28,
 77,
 52,
 5,
 14,
 407,
 16,
 82,
 2,
 8,
 4,
 107,
 117,
 5952,
 15,
 256,
 4,
 2,
 7,
 3766,
 5,
 723,
 36,
 71,
 43,
 530,
 476,
 26,
 400,
 317,
 46,
 7,
 4,
 2,
 1029,
 13,
 104,
 88,
 4,
 381,
 15,
 297,
 98,
 32,
 2071,
 56,
 26,
 141,
 6,
 194,
 7486,
 18,
 4,
 226,
 22,
 21,
 134,
 476,
 26,
 480,
 5,
 144,
 30,
 5535,
 18,

In [30]:
# Lengths of the first two training reviews: the first has 218 words,
# the second has 189 words.
print(*(len(review) for review in x_train[:2]), sep="\n")

218
189


In [31]:
# Labels of the first two training reviews:
# the first is a positive review (1), the second a negative review (0).
print(y_train[0], y_train[1], sep="\n")

1
0


### 3. 資料處理

In [32]:
# A NumPy array cannot hold rows of differing lengths, so every review in both
# splits is adjusted to the same length (maxlen=120).
x_train, x_test = (
    sequence.pad_sequences(reviews, maxlen=120) for reviews in (x_train, x_test)
)

### Step1: 打造函數學習機

In [33]:
# Start from an empty Sequential model; the cells below add its layers one by one.
model = Sequential()

In [34]:
# Compress each of the 8000 possible word indices into a 128-dimensional vector.
# NOTE(review): model.add appends a layer, so re-running this cell on the same
# model would stack a second Embedding — re-create the model first.
model.add(Embedding(input_dim=8000, output_dim=128))

In [35]:
# Recurrent layer with 64 LSTM units; dropout=0.2 and recurrent_dropout=0.2
# are both enabled.
model.add(
    LSTM(units=64, dropout=0.2, recurrent_dropout=0.2)
)

In [36]:
# The output is a single number between 0 and 1, hence the sigmoid activation.
model.add(Dense(units=1, activation='sigmoid'))

### Step2: 組裝神經網路

In [37]:
# binary_crossentropy is a loss that can be used for classification problems;
# the model is optimized with Adam and reports accuracy as its metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [38]:
# Print each layer's output shape and parameter count, plus the totals.
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 128)         1024000   
_________________________________________________________________
lstm_2 (LSTM)                (None, 64)                49408     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 65        
Total params: 1,073,473
Trainable params: 1,073,473
Non-trainable params: 0
_________________________________________________________________


In [39]:
# Number of trainable parameters in the LSTM layer:
#   one LSTM cell receives 64 hidden-state values (its own feedback plus those
#   shared by the other cells) + 128 compressed input values + 1 bias;
#   multiply by 4 (3 gates + 1 state update) and by 64 (64 LSTM cells in total).
4 * 64 * (128 + 64 + 1)

49408

### Step3: 訓練

In [40]:
# Train for 10 epochs with a batch size of 35.
# validation_data: the test data is used to compute the error; it does not
# take part in training.
model.fit(
    x=x_train,
    y=y_train,
    epochs=10,
    batch_size=35,
    validation_data=(x_test, y_test),
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x27188e40808>

In [41]:
# Persist the trained model in two files: the architecture as JSON and the
# weights as HDF5.
model_json = model.to_json()
# Write through a context manager so the file handle is flushed and closed
# right away instead of being left open until garbage collection.
with open('imdb_model_architecture.json', 'w') as architecture_file:
    architecture_file.write(model_json)
model.save_weights('imdb_model_weights.h5')

### 調整項目：
1. 調降常用字數量(10000→8000)
2. 調高共同資料長度(100→120)
3. 調降LSTM數量(128→64)
4. 調高batch_size(32→35)

#### 在訓練次數不變、其他參數依上述數字調整後，訓練準確率略為下降(0.9834→0.9751)。