In [1]:
%env KERAS_BACKED=tensorflow

env: KERAS_BACKED=tensorflow


In [2]:
# Render matplotlib figures inline, directly below the cell that creates them.
%matplotlib inline

import numpy as np
import matplotlib.pyplot as plt

In [3]:
from keras.datasets import imdb

Using TensorFlow backend.


In [4]:
# Load the IMDB review dataset, restricted to the 10,000 most frequently used
# words. load_data returns a (train, test) pair, each an (inputs, labels) pair.
train_split, test_split = imdb.load_data(num_words=10000)
x_train, y_train = train_split
x_test, y_test = test_split

Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [5]:
# Report how many reviews are in the training and test splits.
for label, split in (('訓練總筆數:', x_train), ('測試總筆數:', x_test)):
    print(label, len(split))

訓練總筆數: 25000
測試總筆數: 25000


### 輸入資料部份
我們來看一下輸入部份長什麼樣子?

In [7]:
# Display one raw training sample (the review at index 24999).
# Note that each sample is a plain Python list rather than an array, because
# every review naturally has a different length; comparing the lengths of a
# couple of samples (next cell) makes this obvious. (From the instructor's
# GitHub.)
# Each integer stands for a word: words are indexed by how frequently they
# occur in the dataset, so smaller numbers correspond to more common words.
x_train[24999]

[1,
 17,
 6,
 194,
 337,
 7,
 4,
 204,
 22,
 45,
 254,
 8,
 106,
 14,
 123,
 4,
 2,
 270,
 2,
 5,
 2,
 2,
 732,
 2098,
 101,
 405,
 39,
 14,
 1034,
 4,
 1310,
 9,
 115,
 50,
 305,
 12,
 47,
 4,
 168,
 5,
 235,
 7,
 38,
 111,
 699,
 102,
 7,
 4,
 4039,
 9245,
 9,
 24,
 6,
 78,
 1099,
 17,
 2345,
 2,
 21,
 27,
 9685,
 6139,
 5,
 2,
 1603,
 92,
 1183,
 4,
 1310,
 7,
 4,
 204,
 42,
 97,
 90,
 35,
 221,
 109,
 29,
 127,
 27,
 118,
 8,
 97,
 12,
 157,
 21,
 6789,
 2,
 9,
 6,
 66,
 78,
 1099,
 4,
 631,
 1191,
 5,
 2642,
 272,
 191,
 1070,
 6,
 7585,
 8,
 2197,
 2,
 2,
 544,
 5,
 383,
 1271,
 848,
 1468,
 2,
 497,
 2,
 8,
 1597,
 8778,
 2,
 21,
 60,
 27,
 239,
 9,
 43,
 8368,
 209,
 405,
 10,
 10,
 12,
 764,
 40,
 4,
 248,
 20,
 12,
 16,
 5,
 174,
 1791,
 72,
 7,
 51,
 6,
 1739,
 22,
 4,
 204,
 131,
 9]

In [8]:
# Two arbitrary reviews have different token counts, which is why padding is
# needed before the data can be fed to the network.
print(*(len(x_train[idx]) for idx in (24999, 9982)))

153 156


In [9]:
from keras.preprocessing import sequence

### 送入神經網路的輸入處理
● 設輸入文字長度的上限 
● 把每段文字都弄成一樣長: 太短的在前面補上 0, 太長的則截掉前面多出來的部分 (pad_sequences 預設 padding='pre'、truncating='pre')

In [10]:
# Bring every review to exactly 150 tokens so the data forms a rectangular
# array. With pad_sequences' default settings, shorter reviews are zero-padded
# at the front and longer ones are truncated at the front.
review_len = 150
x_train = sequence.pad_sequences(x_train, maxlen=review_len)
x_test = sequence.pad_sequences(x_test, maxlen=review_len)

In [11]:
# After padding, the training inputs are a 2-D array: (number of reviews, 150).
x_train.shape

(25000, 150)

## 打造 RNN
### 上課版本
選用 LSTM



In [12]:
# Hyperparameters for the in-class model:
#   N - dimensionality each word is embedded into
#   K - number of units (cells) in the LSTM layer
N, K = 3, 4

In [13]:
from keras.models import Sequential
from keras.layers import Dense, Embedding
from keras.layers import LSTM

In [15]:
# In-class sentiment classifier, built as a stack of three layers:
#   1. Embedding - maps each of the 10000 word indices to an N-dim vector
#   2. LSTM      - K LSTM cells; outputs one K-dim vector per review
#   3. Dense     - a single sigmoid unit producing the final score
model = Sequential([
    Embedding(10000, N),
    LSTM(K),
    Dense(1, activation='sigmoid'),
])

In [16]:
# Assemble the model: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is tracked as the reported metric.
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer overview with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, None, 3)           30000     
_________________________________________________________________
lstm_2 (LSTM)                (None, 4)                 128       
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 5         
Total params: 30,133
Trainable params: 30,133
Non-trainable params: 0
_________________________________________________________________


In [17]:
# Train for 5 passes over the training set in mini-batches of 32 reviews.
# No random seed is set in the cells shown, so results may differ between runs.
model.fit(x=x_train, y=y_train, epochs=5, batch_size=32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7ffb65b770b8>

In [18]:
# Evaluate on the test split. `score` holds the loss followed by the metrics
# given at compile time, i.e. [loss, accuracy].
score = model.evaluate(x_test, y_test)
print(
    f'測試資料的 loss = {score[0]}',
    f'測試資正確率 = {score[1]}',
    sep='\n',
)

測試資料的 loss = 0.4096148190832138
測試資正確率 = 0.85092


### 把結果存檔

In [19]:
# Persist the trained model in two parts: the architecture as JSON and the
# learned weights as an HDF5 file.
model_json = model.to_json()

# Use a context manager so the file handle is flushed and closed
# deterministically instead of being left open until garbage collection.
with open('imdb_model_arch.json', 'w') as arch_file:
    arch_file.write(model_json)

model.save_weights('imdb_model_weights.h5')

## 做一個自己的RNN模型

### 調高維度


In [20]:
# Raise the dimensions for the second experiment:
#   N - dimensionality each word is embedded into
#   K - number of units (cells) in the LSTM layer
N, K = 10, 10

In [22]:
# Larger variant of the classifier, using the current N and K:
#   1. Embedding - maps each of the 10000 word indices to an N-dim vector
#   2. LSTM      - K LSTM cells; outputs one K-dim vector per review
#   3. Dense     - a single sigmoid unit producing the final score
RNNmodel = Sequential([
    Embedding(10000, N),
    LSTM(K),
    Dense(1, activation='sigmoid'),
])

In [24]:
# Assemble the model: binary cross-entropy matches the single sigmoid output,
# Adam is the optimizer, and accuracy is tracked as the reported metric.
RNNmodel.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Print the layer-by-layer overview with output shapes and parameter counts.
RNNmodel.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, None, 10)          100000    
_________________________________________________________________
lstm_3 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_3 (Dense)              (None, 1)                 11        
Total params: 100,851
Trainable params: 100,851
Non-trainable params: 0
_________________________________________________________________


In [26]:
# Train for 2 passes over the training set in mini-batches of 100 reviews.
# No random seed is set in the cells shown, so results may differ between runs.
RNNmodel.fit(x=x_train, y=y_train, epochs=2, batch_size=100)

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7ffb501ae160>