# 使用遷移學習Transfer Learning Sentiment analysis

## (trainable=True)
## Test accuracy: 0.49

## 比 trainable=False 高很多

In [1]:
# 使用:GoogleNews-vectors-negative300.bin
# glove.6B.zip
# code from:
# https://github.com/keras-team/keras/blob/master/examples/pretrained_word_embeddings.py

In [37]:
'''This script loads pre-trained word embeddings (GloVe embeddings)
into a frozen Keras Embedding layer, and uses it to
train a text classification model on the 20 Newsgroup dataset
(classification of newsgroup messages into 20 different categories).

GloVe embedding data can be found at:
http://nlp.stanford.edu/data/glove.6B.zip
(source page: http://nlp.stanford.edu/projects/glove/)

20 Newsgroup data can be found at:
http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html
'''

'This script loads pre-trained word embeddings (GloVe embeddings)\ninto a frozen Keras Embedding layer, and uses it to\ntrain a text classification model on the 20 Newsgroup dataset\n(classification of newsgroup messages into 20 different categories).\n\nGloVe embedding data can be found at:\nhttp://nlp.stanford.edu/data/glove.6B.zip\n(source page: http://nlp.stanford.edu/projects/glove/)\n\n20 Newsgroup data can be found at:\nhttp://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.html\n'

In [1]:
import fasttext
from pyfasttext import FastText
import time

In [2]:
from __future__ import print_function

import os
import sys
import numpy as np
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model

Using TensorFlow backend.


In [3]:
BASE_DIR = ''
GLOVE_DIR = os.path.join(BASE_DIR, './data/')  #向量檔案位置  glove.6B.100d.txt
TEXT_DATA_DIR = os.path.join(BASE_DIR, './data/20_newsgroup')   #20_newsgroup整包資料夾檔案位置  
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2

# 把glove.6B.100d.txt檔裡面的向量 與標題單字  切割開來存進字典

In [4]:
# first, build index mapping words in the embeddings set
# to their embedding vector

print('Indexing word vectors.')

embeddings_index = {}  #為字典
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))  #打開檔案
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')   #[1:] 第二位到最後一位給予...   因為第一位是英文單字
    embeddings_index[word] = coefs  #存進去字典
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Indexing word vectors.
Found 400000 word vectors.


In [5]:
# 一個單字有100維度向量   [0]為單字
for x in range(10):
    print(values[x])

sandberger
0.28365
-0.6263
-0.44351
0.2177
-0.087421
-0.17062
0.29266
-0.024899
0.26414


In [6]:
# 去除掉第一位的英文單字
coefs[0] ,coefs[1]

(0.28365, -0.6263)

In [7]:
# 觀看單字長度
word[0:-1] , len(word)

('sandberge', 10)

In [8]:
# 觀看字典中 單字向量   第一個
list(embeddings_index.items())[2]

('hoelter', array([-0.084826 , -0.21959  , -0.68825  , -0.15856  ,  0.0037678,
        -0.25393  , -0.014122 , -0.25515  ,  0.15734  , -0.28621  ,
        -0.10495  ,  0.3319   , -0.26232  , -0.27706  ,  0.017544 ,
         0.21251  , -0.13695  ,  0.21676  ,  0.13667  , -0.10023  ,
        -0.48343  ,  0.098403 , -0.23836  , -0.29935  , -0.49155  ,
         0.0035513,  0.07616  ,  0.11695  , -0.1576   ,  0.36189  ,
         0.017703 , -0.22847  ,  0.12772  , -0.33151  , -0.16558  ,
         0.049934 , -0.010591 , -0.1085   , -0.024022 ,  0.032036 ,
         0.24984  ,  0.20566  ,  0.18528  ,  0.11274  , -0.28665  ,
         0.082614 , -0.031535 ,  0.10971  ,  0.052535 ,  0.23758  ,
         0.16821  , -0.44542  , -0.11942  , -0.73416  ,  0.31036  ,
         0.60805  , -0.12909  ,  0.03542  , -0.68638  ,  0.20252  ,
         0.17505  , -0.23696  ,  0.36493  , -0.23961  , -0.41811  ,
        -0.023996 , -0.068086 ,  0.18158  , -0.17651  ,  0.047313 ,
         0.2711   ,  0.24319  ,  0.05

# 把資料夾裡面檔案的文章 前處理存進texts

In [9]:
# second, prepare text samples and their labels
print('Processing text dataset')

texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):  
    # listdir() = 回傳目錄底下的所有檔案名稱列表(list) (抓資料夾裡的所有檔案名稱)
    # TEXT_DATA_DIR =檔案位置 data中的資料夾包名稱  , 存進name裡面   sorted = 排序
    # sorted()可用來排序任何的iterable(string, dictionary, tuple...)  會產生新的list 不變動原本的
    
    path = os.path.join(TEXT_DATA_DIR, name)   # TEXT_DATA_DIR = './data/20_newsgroup'  , name = 各個資料夾名稱
    if os.path.isdir(path):  # os.path.isdir 判斷是否為... 路徑  , path = 各個資料夾路徑
        label_id = len(labels_index)   
        # labels_index這時內容空白為:{}字典  len(labels_index) = 0 , 存進去label_id
        labels_index[name] = label_id  
        # name資料夾名稱存進去 labels_index 變成每一個資料夾名稱 字典裡 有+標記0,1,2,3 依序排序
        
        for fname in sorted(os.listdir(path)): 
        #path = 裡面每一個子資料夾路徑 存進去fname  這邊有先排序

            if fname.isdigit():
            # isdigit():檢查字串是否由數字組成  因為在資料集裡面檔案都是由數字組成名稱
                
                fpath = os.path.join(path, fname)
                # fpath = 資料夾路徑 + 檔案名稱 
                
                if sys.version_info < (3,):
                # sys.version_info: 判斷目前python 版本 
                # , 似乎是用來取整數
                    f = open(fpath)
                
                else:
                    f = open(fpath, encoding='latin-1') 
                    # # fpath = 資料夾路徑 + 檔案名稱 
                    # Latin-1 表示一個很小的拉丁語言符號集，與ASCII基本上一致 其實不能用來表示龐大的東方語言字符集
                    
                t = f.read()
                # t = 每一個資料裡面的內容
                i = t.find('\n\n')  # skip header
                # i 剩下一堆數字
                if 0 < i:
                    t = t[i:]
                    # i: = 除了i那個元素 後面都要存進去  = 去除掉資料裡面 Lines:xxx 以上的東西(包含Line)
                texts.append(t)
                # texts[0] =  49960 第一筆資料 (去除掉資料裡面 Lines:xxx 以上的東西(包含Line)
                f.close()
                labels.append(label_id)
                # label_id = 0~19  每個數字有1000筆 存進去 labels[]

print('Found %s texts.' % len(texts))

Processing text dataset
Found 19997 texts.


In [10]:
# 用來觀看平台python版本
sys.version_info

sys.version_info(major=3, minor=5, micro=2, releaselevel='final', serial=0)

In [11]:
# 觀看資料夾內容
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    print(name)

alt.atheism
comp.graphics
comp.os.ms-windows.misc
comp.sys.ibm.pc.hardware
comp.sys.mac.hardware
comp.windows.x
misc.forsale
rec.autos
rec.motorcycles
rec.sport.baseball
rec.sport.hockey
sci.crypt
sci.electronics
sci.med
sci.space
soc.religion.christian
talk.politics.guns
talk.politics.mideast
talk.politics.misc
talk.religion.misc


In [12]:
# 底下的各個子資料夾
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    print(path)

./data/20_newsgroup/alt.atheism
./data/20_newsgroup/comp.graphics
./data/20_newsgroup/comp.os.ms-windows.misc
./data/20_newsgroup/comp.sys.ibm.pc.hardware
./data/20_newsgroup/comp.sys.mac.hardware
./data/20_newsgroup/comp.windows.x
./data/20_newsgroup/misc.forsale
./data/20_newsgroup/rec.autos
./data/20_newsgroup/rec.motorcycles
./data/20_newsgroup/rec.sport.baseball
./data/20_newsgroup/rec.sport.hockey
./data/20_newsgroup/sci.crypt
./data/20_newsgroup/sci.electronics
./data/20_newsgroup/sci.med
./data/20_newsgroup/sci.space
./data/20_newsgroup/soc.religion.christian
./data/20_newsgroup/talk.politics.guns
./data/20_newsgroup/talk.politics.mideast
./data/20_newsgroup/talk.politics.misc
./data/20_newsgroup/talk.religion.misc


In [13]:
# 路徑內容
name , path

('talk.religion.misc', './data/20_newsgroup/talk.religion.misc')

In [14]:
# 觀看一下 labels_index.items 內容

for name in sorted(os.listdir(TEXT_DATA_DIR)):  
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path): 
        label_id = len(labels_index)
        labels_index[name] = label_id
print(list(labels_index.items())[0:5])    

[('misc.forsale', 20), ('comp.windows.x', 20), ('sci.crypt', 20), ('talk.politics.guns', 20), ('talk.religion.misc', 20)]


# 將texts的文章轉為2值化序列  整理好格式之後切割資料集

In [15]:
# finally, vectorize the text samples into a 2D integer tensor
'''
MAX_SEQUENCE_LENGTH = 1000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 100
VALIDATION_SPLIT = 0.2
'''

tokenizer = Tokenizer(num_words=MAX_NUM_WORDS) 
# MAX_NUM_WORDS = 20000
tokenizer.fit_on_texts(texts) 
# texts[0] =  49960 第一筆資料 (去除掉資料裡面 Lines:xxx 以上的東西(包含Line)
# texts： 要用來訓練的文件內容 , 存進tokenizer

sequences = tokenizer.texts_to_sequences(texts)
# texts：待轉為序列的文件內容
# return值：序列的列表，列表中每個序列對應於一段輸入文件
# sequences 為[] 每一個陣列內容有序列

word_index = tokenizer.word_index
# word_index: 字典，將單字（字串）映射為它們的排名或者索引。僅在調用fit_on_texts之後設定

print('Found %s unique tokens.' % len(word_index))


data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
# 這邊還沒有補0 每一個內容為1000個值

labels = to_categorical(np.asarray(labels))
# np.asarray  = 轉換為矩陣
# to_categorical: 將類別向量 映射為二值類別矩陣
# labels[] = label_id = 0~19  每個數字有1000筆 存進去 


print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

# split the data into a training set and a validation set
indices = np.arange(data.shape[0])
# np.arange(3) = array([0, 1, 2])
# data.shape[0] = 19997

np.random.shuffle(indices)
# np.random.shuffle = 打亂順序 , 0~19996 順序打亂

data = data[indices]
# 把 data內容補0 每一個內容為1000個值

labels = labels[indices]
# 變成2維陣列 20個數值

num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])
# VALIDATION_SPLIT = 0.2 , data.shape[0] = 19997
# 取全部樣本20% , 分割資料集 training 跟 test


x_train = data[:-num_validation_samples]
# [:-3] = 最後一個陣列元素算0開始 , 從0到從後面數來第4個元素
# 也就是說 除了最後20% 前面全部拿來訓練
# data裡面有補0數值

y_train = labels[:-num_validation_samples]
# labels = 19997
# labels 裡面是20個數值


x_val = data[-num_validation_samples:]
# 後面20%
y_val = labels[-num_validation_samples:]
# 後面20%

Found 174074 unique tokens.
Shape of data tensor: (19997, 1000)
Shape of label tensor: (19997, 20)


## pad_sequences 參數

In [None]:
# 將長為nb_samples的序列轉化為形如 (nb_samples,nb_timesteps)2D numpy array
# 如果提供了參數 maxlen , nb_timesteps= maxlen
# 否則其值為最長序列的長度,其他短於長度的該序都會在後面填充0以達到該長度
# 長於 nb_timesteps 的序列都會被截斷 , 以使其批配目標長度
# padding 和截斷的發生的位置分別取決於 padding 和 truncating

# maxlen：None或整数，為序列的最大長度。 
# 其他短於長度的該序都會在後面填充0以達到該長度
# 長於 maxlen 的序列都會被截斷 , 以使其批配目標長度

# dtype：返回的numpy array的數據類型
# padding：‘pre’或‘post’，確定當需要補0時，在序列的起始還是結尾補
# truncating：‘pre’或‘post’，確定當需要截斷序列時，從起始還是結尾截斷
# value：浮點數 , 此值將在填充時代替默認的填充值0

# return:如(nb_samples,nb_timesteps)的2D張量


In [16]:
# 這邊會20000 筆資料是因為前面 ,MAX_NUM_WORDS = 20000
tokenizer.num_words

20000

In [17]:
# 第一個陣列內容
len(sequences[0])

1528

In [18]:
# 文件的序列內容
# sequences[0]

# 列印結果如下 共1528筆
'''
[1237,
 273,
 1213,
 1439,
 1071,
 1213,
 1237,
 273,
 1439,
 192,
 2515
 ...]
'''

'\n[1237,\n 273,\n 1213,\n 1439,\n 1071,\n 1213,\n 1237,\n 273,\n 1439,\n 192,\n 2515\n ...]\n'

In [19]:
# 前3筆字典內容
list(word_index.items())[0:3]

[('invincible', 32947), ("u''h", 128182), ('malzbender', 151112)]

In [20]:
len(word_index)

174074

In [21]:
len(data[0])

1000

In [22]:
# 觀看內容 從起始補0 滿足格式
data[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [23]:
# 共20個數值
labels[0] 

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0.], dtype=float32)

In [24]:
# 觀看幾筆
len(labels)

19997

In [25]:
len(indices)

19997

In [26]:
# 內容共19997筆
indices[0:10]

array([18194, 18907,  4557, 18306,  3307,  2222,  5093, 18551, 15429,
        1046])

In [27]:
# 經過 labels = labels[indices] 變成2維陣列
labels[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 1., 0.], dtype=float32)

# 預先訓練好的詞向量在這裡讀(前面整理過的glove.6B.100d.txt 字典)

In [28]:
print('Preparing embedding matrix.')

# prepare embedding matrix
num_words = min(MAX_NUM_WORDS, len(word_index))
# MAX_NUM_WORDS = 20000 
# len(word_index) =174074
# 取 MAX_NUM_WORDS 跟 len(word_index) 小的

embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
# return 一組用0填充的數組
# 用來初始化用
# num_words = 20000
# EMBEDDING_DIM = 100

for word, i in word_index.items():
    # len(word_index) =174074
    # word 裡面有儲存字典的單字
    # word_index.items = 字典 可以查詢單字序列
    # word_index.items = [('5rk5n', 99863), ('quu3', 136237), ('doupt', 167589)] 內容
    
    if i >= MAX_NUM_WORDS:
    # MAX_NUM_WORDS = 20000 
        continue
    embedding_vector = embeddings_index.get(word)
    # embeddings_index.get = 抓某個單字的向量 , 都存進去 embedding_vector
    
    if embedding_vector is not None:
    # not None == not False == not '' == not 0 == not [] == not {} == not () 不等於空值
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
        # embedding_vector 裡面的單字向量 存進embedding_matrix[i]
        

Preparing embedding matrix.


In [29]:
num_words

20000

In [30]:
len(embedding_matrix[0])

100

In [31]:
embedding_matrix[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [32]:
embedding_matrix[1]

array([-0.038194  , -0.24487001,  0.72812003, -0.39961001,  0.083172  ,
        0.043953  , -0.39140999,  0.3344    , -0.57545   ,  0.087459  ,
        0.28786999, -0.06731   ,  0.30906001, -0.26383999, -0.13231   ,
       -0.20757   ,  0.33395001, -0.33848   , -0.31742999, -0.48335999,
        0.1464    , -0.37303999,  0.34577   ,  0.052041  ,  0.44946   ,
       -0.46970999,  0.02628   , -0.54154998, -0.15518001, -0.14106999,
       -0.039722  ,  0.28277001,  0.14393   ,  0.23464   , -0.31020999,
        0.086173  ,  0.20397   ,  0.52623999,  0.17163999, -0.082378  ,
       -0.71787   , -0.41531   ,  0.20334999, -0.12763   ,  0.41367   ,
        0.55186999,  0.57907999, -0.33476999, -0.36559001, -0.54856998,
       -0.062892  ,  0.26583999,  0.30204999,  0.99774998, -0.80480999,
       -3.0243001 ,  0.01254   , -0.36941999,  2.21670008,  0.72201002,
       -0.24978   ,  0.92136002,  0.034514  ,  0.46744999,  1.10790002,
       -0.19358   , -0.074575  ,  0.23353   , -0.052062  , -0.22

In [33]:
embeddings_index.get('the')

array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
        0.20335 , -0.12763 ,  0.41367 ,  0.55187 ,  0.57908 , -0.33477 ,
       -0.36559 , -0.54857 , -0.062892,  0.26584 ,  0.30205 ,  0.99775 ,
       -0.80481 , -3.0243  ,  0.01254 , -0.36942 ,  2.2167  ,  0.72201 ,
       -0.24978 ,  0.92136 ,  0.034514,  0.46745 ,  1.1079  , -0.19358 ,
       -0.074575,  0.23353 , -0.052062, -0.22044 ,  0.057162, -0.15806 ,
       -0.30798 , -0.41625 ,  0.37972 ,  0.15006 , -0.53212 , -0.2055  ,
       -1.2526  ,  0.071624,  0.70565 ,  0.49744 , 

In [34]:
# 矩陣大小
embedding_matrix.shape

(20000, 100)

In [35]:
# 20000 筆 100維度
embedding_matrix[19999]

array([ 2.57459998e-01, -4.41240013e-01,  8.28769982e-01,  9.33689997e-02,
       -5.14159977e-01, -7.45450020e-01,  8.68600011e-01, -4.46969986e-01,
        8.08830023e-01, -3.11150014e-01,  5.45189977e-01,  4.24360007e-01,
       -8.99980031e-03,  2.95760006e-01,  2.87349999e-01, -2.65609995e-02,
        2.01289997e-01, -2.54180014e-01,  2.99490001e-02,  4.67679985e-02,
        2.29809999e-01,  2.14110002e-01, -1.33609995e-01,  2.25020006e-01,
        1.66389998e-02,  8.14490020e-01,  3.16689998e-01, -1.18500002e-01,
        4.48980004e-01, -4.35649991e-01,  2.80209988e-01, -5.80240011e-01,
        6.99059963e-02,  4.92170006e-01, -3.09399992e-01,  1.11950003e-01,
        9.14430022e-01,  5.88450015e-01, -2.63900012e-01, -1.26719999e+00,
        3.54950011e-01,  1.30540001e+00,  3.84090006e-01,  1.03120005e+00,
       -2.48889998e-01, -7.94210017e-01,  1.06119998e-01, -3.39399993e-01,
       -3.03770006e-01,  8.93829986e-02, -2.63289988e-01,  2.00729996e-01,
       -1.62799999e-01, -

# CNN 卷積層
## (weights加進前面glove.6B.100d.txt 整理的embedding_matrix)
## 載入預訓練模型和訓練模型

In [36]:
# load pre-trained word embeddings into an Embedding layer
# note that we set trainable = False so as to keep the embeddings fixed
# 開始載入預訓練模型
embedding_layer = Embedding(num_words,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=True)
# num_words = 20000 不超過20000 字典長度
# EMBEDDING_DIM = 100 = Output_dim
# MAX_SEQUENCE_LENGTH = 1000 不超過1000字典長度
# trainable = False 表示不重新計算

# 開始訓練模型
print('Training model.')

# train a 1D convnet with global maxpooling
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
# MAX_SEQUENCE_LENGTH = 1000 , shape = 1000 , 不超過1000字典長度
# sequence_input = ([Dimension(None), Dimension(1000)])

# MAX_NUM_WORDS = 20000
# EMBEDDING_DIM = 100
# VALIDATION_SPLIT = 0.2

embedded_sequences = embedding_layer(sequence_input)
# ([Dimension(None), Dimension(1000), Dimension(100)])


x = Conv1D(128, 5, activation='relu')(embedded_sequences)
# (filters , kernel_size)
# filters = 卷積核數目(輸出的維度) 
# kernel_size = 卷積核的大小 

x = MaxPooling1D(5)(x)
# (pool_size=2, strides=None, padding='valid')
# pool_size = 池化窗口大小 

x = Conv1D(128, 5, activation='relu')(x)

x = MaxPooling1D(5)(x)

x = Conv1D(128, 5, activation='relu')(x)

x = GlobalMaxPooling1D()(x)
# 對於時間信號的全局最大池化

x = Dense(128, activation='relu')(x)

preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)

model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# loss='categorical_crossentropy'
# 亦稱作多類的對數損失, 注意使用該目標函數時，需要將標籤轉化為形如(nb_samples, nb_classes)的二值序列
# optimizer = 優化器 , 
# optimizer='rmsprop' = 該優化器通常是面對遞歸神經網路時的一個良好選擇


model.summary()

Training model.
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 1000)              0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 1000, 100)         2000000   
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 996, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 199, 128)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 195, 128)          82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 39, 128)           0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 35, 128)           82048

In [37]:
%%time
model.fit(x_train, y_train,
          batch_size=128,
          epochs=10,
          validation_data=(x_val, y_val) )

Train on 15998 samples, validate on 3999 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
CPU times: user 36min 46s, sys: 2min 7s, total: 38min 54s
Wall time: 46min 26s


<keras.callbacks.History at 0x7fba65026a58>

In [38]:
score = model.evaluate(x_val, y_val, verbose=1)



In [39]:
print('Test score:', score[0])
print('Test accuracy:', score[1])

Test score: 1.4884786900355536
Test accuracy: 0.49462365598850294
