In [1]:
import pandas as pd
import numpy as np

In [2]:
source_data = pd.read_csv("final_data.csv", index_col=0)
source_data.dropna(how='any',inplace=True) # 處理缺失直

source_data = source_data.reindex(index=source_data.index[::-1])

source_data
# target_1 為未來三天的漲跌加總，target_2 為蔣總後的正負。

Unnamed: 0_level_0,指數,漲跌,動盪,交易量,融資,融券,自營商,投信,外資,3日,...,14日,15日,16日,17日,18日,19日,20日,60日,target_1,target_2
日期,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2004/7/1,5836.91,-2.53,59.71,692.00694,206.16345,84.9099,7.329094,-0.221718,9.635603,30.95333,...,140.86500,129.43400,113.29500,97.81588,86.88667,88.20947,92.0720,-238.52617,-103.34,0
2004/7/2,5746.70,-90.21,58.99,531.13441,206.57564,78.7084,-13.862259,-16.050376,-82.260722,-60.98333,...,49.82429,47.27800,36.77250,21.72706,7.18333,-3.14842,-1.9005,-313.30017,-18.92,0
2004/7/5,5659.78,-86.92,79.36,440.94036,205.87461,72.0990,-14.960335,-19.800116,-16.629336,-88.01667,...,-43.21714,-34.62267,-37.16438,-47.19765,-61.57111,-75.54000,-85.5650,-384.21050,53.61,1
2004/7/6,5733.57,73.79,76.65,414.49502,205.70078,75.2901,3.310531,-7.703426,-0.348274,20.22000,...,24.35286,28.53467,36.71937,34.47118,25.11500,11.57579,-1.6625,-293.01700,44.15,1
2004/7/7,5727.78,-5.79,101.45,503.79043,205.75792,71.5637,-4.501554,6.698410,-1.273157,20.73667,...,6.59000,17.32533,21.32312,29.11000,27.08778,18.30789,5.4965,-281.03117,30.96,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2020/8/14,12795.46,32.33,122.12,2145.85977,72.41246,68.0235,4.332400,6.888281,39.489178,52.48000,...,67.87643,77.16200,103.05312,119.48647,130.74778,144.80895,168.6145,863.02700,-16.82,0
2020/8/17,12956.11,160.65,155.42,2228.58664,72.94443,68.3645,12.277874,7.280122,96.455224,117.87667,...,202.14214,213.29133,222.94875,248.19118,264.57333,276.06105,290.1860,991.21367,-593.47,0
2020/8/18,12872.14,-83.97,141.33,2504.51420,72.98284,68.5825,-5.175571,-0.132520,-29.954802,-2.43000,...,94.51714,110.29400,121.23875,130.80353,155.09778,171.09789,182.4865,872.89383,-264.30,0
2020/8/19,12778.64,-93.50,171.47,2555.84456,73.11886,68.6890,-20.282008,-1.413083,-5.197372,-90.32333,...,-2.96286,0.94933,15.74438,26.10706,35.23111,58.35579,73.7180,747.60283,-131.51,0


In [3]:
def data_preprocess(df, time_frame):
    data_value = df.to_numpy()  # 將 data_frame 轉為  numpy array
    
    result = []
    for index in range(len(data_value) - time_frame):
        result.append(data_value[index: index + time_frame])
    
    result = np.array(result)
    number_train = round(0.9 * result.shape[0]) # 90% 資料用來訓練
    
    # 訓練資料
    x_train = result[:int(number_train), :, :-2]
    y_train = result[:int(number_train), -1, -1]
    # 測試資料
    x_test = result[int(number_train):, :, :-2]
    y_test = result[int(number_train):, -1, -1]
    
    return [x_train, y_train, x_test, y_test]

In [4]:
x_train, y_train, x_test, y_test = data_preprocess(source_data, 20)

In [5]:
print("x_train.shape: ", x_train.shape)
print("y_train.shape: ", y_train.shape)
print("x_test.shape: ", x_test.shape)
print("y_test.shape: ", y_test.shape)

x_train.shape:  (3571, 20, 28)
y_train.shape:  (3571,)
x_test.shape:  (397, 20, 28)
y_test.shape:  (397,)


In [6]:
import tensorflow as tf

def get_angles(pos, i, d_model):
  angle_rates = 1 / np.power(10000, (2 * (i//2)) / np.float32(d_model))
  return pos * angle_rates

# positional encoding 的產出需要句長與詞深度這兩個參數。
def positional_encoding(position, d_model):
  angle_rads = get_angles(np.arange(position)[:, np.newaxis],
                          np.arange(d_model)[np.newaxis, :],
                          d_model)
  # apply sin to even indices in the array; 2i
  sines = np.sin(angle_rads[:, 0::2])
  # apply cos to odd indices in the array; 2i+1
  cosines = np.cos(angle_rads[:, 1::2])
  pos_encoding = np.concatenate([sines, cosines], axis=-1)
  pos_encoding = pos_encoding[np.newaxis, ...]
  return tf.cast(pos_encoding, dtype=tf.float32)

seq_len = 20 # 20 天資料
d_model = 28 # 28 維度資料
pos_encoding = positional_encoding(seq_len, d_model)
print(pos_encoding)

tf.Tensor(
[[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
    0.00000000e+00  0.00000000e+00  1.00000000e+00  1.00000000e+00
    1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
    1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00
    1.00000000e+00  1.00000000e+00  1.00000000e+00  1.00000000e+00]
  [ 8.41470957e-01  4.95097876e-01  2.65063316e-01  1.38502866e-01
    7.19064549e-02  3.72673050e-02  1.93057787e-02  9.99983307e-03
    5.17945131e-03  2.68269260e-03  1.38949510e-03  7.19685631e-04
    3.72759358e-04  1.93069776e-04  5.40302277e-01  8.68837237e-01
    9.64231014e-01  9.90362048e-01  9.97411370e-01  9.99305308e-01
    9.99813616e-01  9.99949992e-01  9.99986589e-01  9.99996424e-01
    9.99999046e-01  9.99999762e-01  9.99999940e-01  1.00000000e+00]
  [ 9.09297407e-01  8.60318899e-01  5.11164546e-0

In [7]:
# look ahead mask
# 建立一個 2 維矩陣，維度為 (size, size)，
def create_look_ahead_mask(size):
  mask = 1 - tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return mask  # (seq_len, seq_len)

In [8]:
seq_len = x_train.shape[1]
look_ahead_mask = create_look_ahead_mask(seq_len)
print("look_ahead_mask", look_ahead_mask)

look_ahead_mask tf.Tensor(
[[0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0.

In [9]:
def scaled_dot_product_attention(q, k, v, mask):
    """計算注意權重.
    mask 可以根據不同的 mask 型態有不同的 shape (padding or look ahead)，
    但必須具有 broadcasting 特性.
  
    Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable 
          to (..., seq_len_q, seq_len_k). Defaults to None.
    
    Returns:
    output, attention_weights
    """
    # 將 `q`、 `k` 做點積再 scale，(transpose_b=True 意思是讓第二個參數 k 先轉置)
    # 轉置後相乘我想不必多說，就是 dot product 的觀念。
    matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)
  
    dk = tf.cast(tf.shape(k)[-1], tf.float32)  # 取得 seq_k 的序列長度
    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)  # scale by sqrt(dk)

    # 將遮罩「加」到被丟入 softmax 前的 logits
    if mask is not None:
        # 經過此步之後，scaled_attention_logits 的 padding 部分會變成一個很大的負數
        scaled_attention_logits += (mask * -1e9) 

    # 取 softmax 是為了得到總和為 1 的比例之後對 `v` 做加權平均
    attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)
  
    # 以注意權重對 v 做加權平均（weighted average）
    output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

    return output, attention_weights

In [10]:
# 實作一個執行多頭注意力機制的 keras layer
class MultiHeadAttention(tf.keras.layers.Layer):

    # 初始化，建立參數 d_model(詞向量深度)，nm_heads(頭的數量)。
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        # 驗證 d_model 是否可以被 num_heads 整除
        assert d_model % self.num_heads == 0  
        # 每個頭的新詞向量深度
        self.depth = d_model // self.num_heads  
        
        # 提供的 q k v 三個參數的線性轉換。
        self.wq = tf.keras.layers.Dense(d_model)  
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)
        
        # 多個 heads 串接之後通過的一次線性轉換
        self.dense = tf.keras.layers.Dense(d_model)
  
    # 這個就是包裝過後的切頭公式
    def split_heads(self, x, batch_size):
        """把 x 最後一維切割成 (num_heads, depth).
        傳置加重塑後 x 變這樣: (batch_size, num_heads, seq_len, depth)
        """
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(x, perm=[0, 2, 1, 3])
  
    # 定義這個方法之後主程式宣告完 class 之後就可以直接用呼叫了，這個晚一點會說明。
    def __call__(self, v, k, q, mask):
        batch_size = tf.shape(q)[0]
        
        # 把 q k v 必須都分別做一次線性轉換，老師的影片裡面有說明 q*w, k*w, v*w。
        # w 是 AI 可以學習出來的
        q = self.wq(q)  # (batch_size, seq_len, d_model)
        k = self.wk(k)  # (batch_size, seq_len, d_model)
        v = self.wv(v)  # (batch_size, seq_len, d_model)
        
        # q, k, v 分別切頭
        q = self.split_heads(q, batch_size)  # (batch_size, num_heads, seq_len_q, depth)
        k = self.split_heads(k, batch_size)  # (batch_size, num_heads, seq_len_k, depth)
        v = self.split_heads(v, batch_size)  # (batch_size, num_heads, seq_len_v, depth)
        
        # 我們前面有實作一個 self-attention 的 func，這個時候就賣上用場了。
        # 利用 broadcasting 讓每個句子的每個 head 的 qi, ki, vi 都各自進行注意力機制。
        # 還記得前面 padding 產出的 shape 嗎? 一樣是四維的，就是為了呼應這邊切的多頭機制。
        scaled_attention, attention_weights = scaled_dot_product_attention(
            q, k, v, mask)
        # scaled_attention.shape == (batch_size, num_heads, seq_len_q, depth)
        # attention_weights.shape == (batch_size, num_heads, seq_len_q, seq_len_k)
        
        # 多頭算完之後記得要合併回來呀 ! 很重要!先 Transpose， seq 又變回第二維度了。
        scaled_attention = tf.transpose(scaled_attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(scaled_attention, 
                                    (batch_size, -1, self.d_model)) 


        # 合併好後通過最後一個線性轉換
        output = self.dense(concat_attention)  # (batch_size, seq_len_q, d_model)
        
        return output, attention_weights

In [14]:

d_model = 28
num_heads = 2  # 再次提醒，num_heads 必須要可以整除 d_model

print(f"d_model: {d_model}")
print(f"num_heads: {num_heads}\n")

# 初始化一個 multi-head attention layer
mha = MultiHeadAttention(d_model, num_heads)

# q, k, v 都設定維 emb_inp。
v = k = q = x_train
look_ahead_mask = create_look_ahead_mask(20)
print("q.shape: ", q.shape)
print("k.shape: ", k.shape)
print("v.shape: ", v.shape)
print("look_ahead_mask.shape: ", look_ahead_mask.shape)  # 注意這邊 mask 是一個 4 維張量。

d_model: 28
num_heads: 2

q.shape:  (3571, 20, 28)
k.shape:  (3571, 20, 28)
v.shape:  (3571, 20, 28)
look_ahead_mask.shape:  (20, 20)


In [15]:
output, attention_weights = mha(v, k, q, look_ahead_mask)
print("output.shape: ", output.shape)
# attention_weights.shape 仍然保有多頭的樣子
print("attention_weights.shape: ", attention_weights.shape)  
print("---"*20)
print("---"*20)
print("output: ", output)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

output.shape:  (3571, 20, 28)
attention_weights.shape:  (3571, 2, 20, 20)
-----------------------

In [16]:
# 建立 Transformer 裡 Encoder / Decoder layer 都有使用到的 Feed Forward 元件
def point_wise_feed_forward_network(d_model, dff):
  
  # 此 Model 對輸入做兩次線性轉換，中間加了一個 ReLU activation func
  return tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),  # (batch_size, seq_len, dff)
      tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  ])

In [18]:
class TheLayer(tf.keras.layers.Layer):
    def __init__(self, d_model, num_heads, dff, rate=0.1):
        super(TheLayer, self).__init__()

        # 建立 2 個 Sub-Layer mha丶ffn
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # layer norm 很常在 RNN-based 的模型被使用。一個 sub-layer 配一個 layer norm
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6) # 給一個小的 float 避免算標準差時除以 0。
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        # 一樣，一個 sub-layer 一個 dropout layer
        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        
        x = self.layernorm1(x)  
        
        attn_output, attn = self.mha(x, x, x, mask)  
        attn_output = self.dropout1(attn_output, training=training) 
        out1 = self.layernorm2(x + attn_output)  

        # sub-layer 2: FFN
        # 強調資料關鍵特徵
        ffn_output = self.ffn(out1) 
        ffn_output = self.dropout2(ffn_output, training=training)
        out2 = self.layernorm3(out1 + ffn_output)

        return out2

In [19]:
the_layer = TheLayer(28, 2, 112)
out = the_layer(x_train, True, look_ahead_mask)
out



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.



<tf.Tensor: id=770, shape=(3571, 20, 28), dtype=float32, numpy=
array([[[ 3.4103537 ,  0.01985091,  0.74918485, ...,  0.58627474,
          0.5234207 , -1.314049  ],
        [ 3.2908432 , -0.07954286, -0.60534906, ..., -0.8994587 ,
          0.9736384 , -0.7727401 ],
        [ 3.0684516 , -0.09027258, -0.5664243 , ...,  0.22996233,
          0.83399105, -0.9939292 ],
        ...,
        [ 2.9768589 , -0.03298631, -0.44867444, ...,  0.14257766,
          0.6116736 , -0.99784887],
        [ 2.9365754 ,  0.14070904, -1.2753508 , ...,  0.23160835,
          0.85259664, -1.032717  ],
        [ 3.1422853 ,  1.0041803 , -0.6628128 , ...,  0.12204027,
          0.7118294 , -1.1834507 ]],

       [[ 3.1329968 ,  0.05732667, -0.83186316, ...,  0.08261821,
          0.89034736, -0.6229142 ],
        [ 3.1012864 , -0.05822286, -0.5505539 , ...,  0.23435576,
          0.85639966, -0.81267583],
        [ 3.1120343 ,  0.05693801, -0.5416526 , ...,  0.32935756,
          0.5624053 , -1.0190328 ],
   

In [21]:
class ParentLayer(tf.keras.layers.Layer):
  # 初始化參數 : 
    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
        super(ParentLayer, self).__init__()

        self.d_model = d_model
        self.pos_encoding = positional_encoding(20, self.d_model)

        # 建立 `num_layers` 個 EncoderLayers
        self.enc_layers = [TheLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        # 建立一個 Dropout
        self.dropout = tf.keras.layers.Dropout(rate)
    
    def call(self, x, training, mask):
        # 輸入的 x.shape == (batch_size, input_seq_len)
        input_seq_len = tf.shape(x)[1]

        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))  # 並依照論文乘上 sqrt(d_model)
        x += self.pos_encoding[:, :input_seq_len, :] # 再加上對應長度的位置編碼

        # 對 embedding 跟位置編碼的總合做 regularization
        x = self.dropout(x, training=training)

        for i, enc_layer in enumerate(self.enc_layers):
          x = enc_layer(x, training, mask)

        return x 


In [22]:
num_layers = 1 # 2 層的 Encoder
d_model = 28 # 詞向量深度
num_heads = 2 # 切 2 頭
dff = 112  # FFN 神經元個數

# 初始化一個 Encoder
parent_layer = ParentLayer(num_layers, d_model, num_heads, dff)

# 將 2 維的索引序列丟入 Encoder 做編碼
parent_out = parent_layer(x_train, training=True, mask=look_ahead_mask) # 這邊關閉 dropout，不使用 mask
print("parent_out:", parent_out)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

parent_out: tf.Tensor(
[[[ 2.8746946   1.633085    0.6185652  ...  0.8669781   0.7489447
   -1.7548804 ]
  [ 2.7052305   0.91807085  0.77326584 ...  0.88077307  0.6397235
   -1.604751  ]
  [ 2.6815305   0.9202407   0.78430384 ...  0.91225845  0.6332881
   -1.5919785 ]
  ...
  [ 3.20774     1.0407766   0.85505474 ...  0.39874393  0.21156798
   -1.4060303 ]
  [ 3.2708867   0.92264533  0.9258672  ...  0.81493974  0.09914461
   -1.540654  ]
  [ 3.209761    0.9063616   0.8277502  ...  0.97886366  0.15030946
   -1.3752272 ]]

 [[ 2.6172798   0.84637177  0.7646651  ...  0.9550586   0.50934225
   -1.546025  ]
  [ 2.3273442   1.041599    0.797919   ...  1.0015316   0.64235353
   -1.6591523 ]
  [ 2.6

In [65]:
class BigMama(tf.keras.Model):
    
    def __init__(self, num_layers, d_model, num_heads, dff, rate=0.1):
        super(BigMama, self).__init__()

        self.parent_layer = ParentLayer(num_layers, d_model, num_heads, dff, rate)

        self.final_layer = tf.keras.layers.Dense(1, activation='sigmoid')
  
  
    def call(self, inp, training, mask):

        output = self.parent_layer(inp, training, mask)  

        # final_output.shape == (batch_size, tar_seq_len, 1)
        final_output = self.final_layer(output)  
        
        final_output = final_output[:, -1, :]
        
        return final_output, attention_weights

In [66]:
# 超參數
num_layers = 1
d_model = 28
num_heads = 2
dff = 112


# 建立 transformer
mama = BigMama(num_layers, d_model, num_heads, dff)

# 將英文、中文序列丟入取得 Transformer 預測下個中文字的結果
predictions, attn_weights = mama(x_train, True, look_ahead_mask)

print("predictions:", predictions)



To change all layers to have dtype float64 by default, call `tf.keras.backend.set_floatx('float64')`. To change just this layer, pass dtype='float64' to the layer constructor. If you are the author of this layer, you can disable autocasting by passing autocast=False to the base Layer constructor.

predictions: tf.Tensor(
[[0.53099674]
 [0.4591116 ]
 [0.6237141 ]
 ...
 [0.73760617]
 [0.50679874]
 [0.6166745 ]], shape=(3571, 1), dtype=float32)


In [67]:
def loss_function(real, pred):
  # 這次的 mask 將 real 序列中不等於 0 的位置視為 1，其餘為 0 
  mask = tf.math.logical_not(tf.math.equal(real, 0))
  # 照樣計算所有位置的 cross entropy 但不加總
  loss_ = loss_object(real, pred)
  mask = tf.cast(mask, dtype=loss_.dtype) # 統一型別
  loss_ *= mask  # 只計算非 <pad> 位置的損失 
  
  return tf.reduce_mean(loss_) # 計算 loss 張量所有數字的平均值

In [68]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  # 論文預設 `warmup_steps` = 4000，4000 個 step 之後開始降低 learning_rates
  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()
    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps
    
  def __call__(self, step):
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)
    
    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
  
# 將客製化 learning rate schdeule 丟入 Adam opt.
# Adam opt. 的參數都跟論文相同
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [69]:
BATCH_SIZE = 10  # 64 筆資料綁成一批
BUFFER_SIZE = 100

num_layers = 1  # 幾層
d_model =28  # 詞向量深度
dff = 112  # FFN 神經元個數
num_heads = 4  # multi-head 個數
dropout_rate = 0.1
EPOCHS = 20 # 訓練週期

In [70]:
import os

# 讓 numpy 不要顯示科學記號
np.set_printoptions(suppress=True)

# GPU 設定
print("GPU Available: ", tf.test.is_gpu_available())
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

# 定義一些之後在儲存檔案時會用到的路徑變數
output_dir = "nmt"
checkpoint_path = os.path.join(output_dir, "checkpoints")
log_dir = os.path.join(output_dir, 'logs')

if not os.path.exists(output_dir):
    os.makedirs(output_dir)

GPU Available:  True


In [76]:
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=True, reduction='none')
train_loss = tf.keras.metrics.Mean(name='train_loss') # 看 log 需要
train_accuracy = tf.keras.metrics.BinaryAccuracy(name='train_accuracy') # 看 log 需要

def loss_function(real, pred):
    loss_ = loss_object(real, pred)
    return tf.reduce_mean(loss_)

In [77]:
# 將客製化 learning rate schdeule 丟入 Adam opt.
# Adam opt. 的參數都跟論文相同
learning_rate = CustomSchedule(d_model)
optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

# 實際訓練以及及時存檔
mama = BigMama(num_layers, d_model, num_heads, dff)

print(f"""這個 mama 有 {num_layers} 層
d_model: {d_model}
num_heads: {num_heads}
dff: {dff}
dropout_rate: {dropout_rate}
""")

這個 mama 有 1 層
d_model: 28
num_heads: 4
dff: 112
dropout_rate: 0.1



In [78]:
train_perc = 90  # 用 90% 資料訓練

# 方便比較不同實驗/ 不同超參數設定的結果
run_id = f"{num_layers}layers_{d_model}d_{num_heads}heads_{dff}dff_{train_perc}train_perc"
checkpoint_path = os.path.join(checkpoint_path, run_id)
log_dir = os.path.join(log_dir, run_id)

# tf.train.Checkpoint 可以幫我們把想要存下來的東西整合起來，方便儲存與讀取
# 一般來說會存 transformer 以及 optimizer 的狀態
ckpt = tf.train.Checkpoint(model=mama, optimizer=optimizer)

# ckpt_manager 會去 checkpoint_path 看有沒有符合 ckpt 裡頭定義的東西
# 存檔的時候只保留最近 5 次 checkpoints，其他自動刪除
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)


# 如果在 checkpoint 路徑上有發現檔案就讀進來
if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)

    # 用來確認之前訓練多少 epochs 了
    last_epoch = int(ckpt_manager.latest_checkpoint.split("-")[-1])
    print(f'已讀取最新的 checkpoint，模型已訓練 {last_epoch} epochs。')
else:
    last_epoch = 0
    print("沒找到 checkpoint，從頭訓練。")

沒找到 checkpoint，從頭訓練。


In [79]:
train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.float32),
    tf.TensorSpec(shape=(None), dtype=tf.float32),
]
@tf.function(input_signature=train_step_signature)
def train_step(inp, label):

    with tf.GradientTape() as tape:  # GradientTape 紀錄數據轉換計算 loss
        predictions, _ = mama(inp, True, look_ahead_mask)
        print("predictions: ", predictions, " label: ", label)
        loss = loss_function(label, predictions)

        gradients = tape.gradient(loss, mama.trainable_variables)
        optimizer.apply_gradients(zip(gradients, mama.trainable_variables))  # 將梯度取出並用 optimizer 隊訓練權重做升降

        train_loss(loss)
        train_accuracy(label, predictions)

In [None]:
import time

print(f"此超參數組合的 Transformer 已經訓練 {last_epoch} epochs。")
print(f"剩餘 epochs：{min(0, last_epoch - EPOCHS)}")
# 用來寫資訊到 TensorBoard，非必要。
summary_writer = tf.summary.create_file_writer(log_dir)


# 比對設定的 `EPOCHS` 以及已訓練的 `last_epoch` 來決定還要訓練多少 epochs
for epoch in range(last_epoch, EPOCHS):
    start = time.time()

    # 重置紀錄 TensorBoard 的 metrics
    train_loss.reset_states()
    train_accuracy.reset_states()

    # 一個 epoch 就是把我們定義的訓練資料集一個一個 batch 拿出來處理，直到看完整個數據集
    for (step_idx, inp) in enumerate(x_train):
        # 每次 step 就是將數據丟入 Transformer，讓它生預測結果並計算梯度最小化 loss
        
        train_step(inp, y_train[step_idx])

        # 每個 epoch 完成就存一次檔
    if (epoch + 1) % 1 == 0:
        ckpt_save_path = ckpt_manager.save()
        print('Saving checkpoint for epoch {} at {}'.format(epoch + 1, ckpt_save_path))

    # 將 loss 以及 accuracy 寫到 TensorBoard 上
    with summary_writer.as_default():
        tf.summary.scalar("train_loss", train_loss.result(), step=epoch + 1)
        tf.summary.scalar("train_acc", train_accuracy.result(), step=epoch + 1)

    print('Epoch {} Loss {:.4f} Accuracy {:.4f}'.format(epoch + 1, train_loss.result(), train_accuracy.result()))
    print('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

print("訓練已完成，可以進行測試。")

此超參數組合的 Transformer 已經訓練 0 epochs。
剩餘 epochs：-20
predictions:  Tensor("big_mama_7/strided_slice:0", shape=(None, 1), dtype=float32)  label:  Tensor("label:0", dtype=float32)
predictions:  Tensor("big_mama_7/strided_slice:0", shape=(None, 1), dtype=float32)  label:  Tensor("label:0", dtype=float32)
Saving checkpoint for epoch 1 at nmt\checkpoints\1layers_28d_4heads_112dff_90train_perc\1layers_28d_4heads_112dff_90train_perc\ckpt-1
Epoch 1 Loss 0.6965 Accuracy 0.4562
Time taken for 1 epoch: 18.641513109207153 secs

Saving checkpoint for epoch 2 at nmt\checkpoints\1layers_28d_4heads_112dff_90train_perc\1layers_28d_4heads_112dff_90train_perc\ckpt-2
Epoch 2 Loss 0.6930 Accuracy 0.4598
Time taken for 1 epoch: 17.354198455810547 secs

Saving checkpoint for epoch 3 at nmt\checkpoints\1layers_28d_4heads_112dff_90train_perc\1layers_28d_4heads_112dff_90train_perc\ckpt-3
Epoch 3 Loss 0.6917 Accuracy 0.4584
Time taken for 1 epoch: 17.06368350982666 secs

Saving checkpoint for epoch 4 at nmt\checkpoi