In [82]:
import ipdb
#ipdb.set_trace()

In [83]:
import tensorflow as tf
import tensorflow.keras as K
import tensorflow.keras.layers as L
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from tensorflow.keras.preprocessing.text import Tokenizer 

In [84]:
class TrainingConfig:
    """Static training hyper-parameters read by the model wrapper in this notebook."""

    # Number of passes over the training split.
    epochs = 5
    # Number of samples per mini-batch.
    batch_size = 128
    # Intended optimizer learning rate.
    learningRate = 0.001
    # Fraction of the training data reserved for validation.
    valRate = 0.1
    # Loss identifiers (Keras string names); kept as a list.
    loss = ['sparse_categorical_crossentropy']
    # Metric identifiers handed to compile().
    metrics = ['accuracy']
    # Optimizer identifier handed to compile().
    optimizer = 'adam'
    
class WordEmbedding:
    """Static vocabulary / embedding dimensions shared by the model code."""

    # Length of each (padded) token sequence; also used as the positional-encoding table length.
    sequenceLength = 256
    # Vocabulary size passed to the embedding layer.
    wordCount = 10000
    # Embedding width per token.
    wordDim = 48

class DataSourceConfig:
    """Locations of the .npy files consumed by the data loader.

    The order matters: the loading cell unpacks the result as
    (train_data, test_data, train_labels, test_labels).
    """

    source = [
        "./dataset/imdb_train_data.npy",
        "./dataset/imdb_test_data.npy",
        "./dataset/imdb_train_labels.npy",
        "./dataset/imdb_test_labels.npy",
    ]

$$\Large{PE_{(pos, 2i)} = \sin\left(pos / 10000^{2i / d_{model}}\right)} $$
$$\Large{PE_{(pos, 2i+1)} = \cos\left(pos / 10000^{2i / d_{model}}\right)} $$

In [85]:
def get_angles(pos, i, d_model):
    """Return the positional-encoding angles pos / 10000^(2*(i//2)/d_model).

    Args:
        pos: Position index (scalar or array broadcastable against ``i``).
        i: Embedding-dimension index (scalar or array). Each even/odd pair
            (2k, 2k+1) shares the same rate because of the ``i // 2``.
        d_model: Embedding width used to normalise the exponent.

    Returns:
        ``pos`` multiplied element-wise by the per-dimension angle rate.
    """
    exponent = (2 * (i // 2)) / np.float32(d_model)
    rate = 1 / np.power(10000, exponent)
    return pos * rate
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: Number of positions (rows) to generate.
        d_model: Embedding width (columns).

    Returns:
        A float32 tensor of shape (1, position, d_model); even columns hold
        the sine of the angle, odd columns the cosine.
    """
    row_index = np.arange(position)[:, np.newaxis]
    col_index = np.arange(d_model)[np.newaxis, :]
    table = get_angles(row_index, col_index, d_model)

    even_cols = slice(0, None, 2)
    odd_cols = slice(1, None, 2)
    # Overwrite the raw angles in place: sine on even columns, cosine on odd ones.
    table[:, even_cols] = np.sin(table[:, even_cols])
    table[:, odd_cols] = np.cos(table[:, odd_cols])

    # Leading axis of size 1 so the table broadcasts over the batch dimension.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

In [86]:
def create_padding_mask(seq):
    """Mark padding tokens (id 0) with 1.0 and every other token with 0.0.

    Args:
        seq: Tensor of token ids; indexed below as a rank-2 tensor.

    Returns:
        A float32 mask with two singleton axes inserted after the first
        axis, i.e. ``mask[:, newaxis, newaxis, :]``.
    """
    is_padding = tf.math.equal(seq, 0)
    mask = tf.cast(is_padding, tf.float32)
    # Add the extra dimensions so the mask can be added to the
    # attention logits.
    return mask[:, tf.newaxis, tf.newaxis, :]

In [87]:
def create_look_ahead_mask(size):
  """Return a (size, size) mask that is 1 strictly above the diagonal, else 0."""
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

In [88]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute softmax(q @ k^T / sqrt(dk)) @ v.

    Args:
        q: Query tensor; last axis is the depth.
        k: Key tensor; its last-axis size is used as the scaling factor dk.
        v: Value tensor.
        mask: Tensor broadcastable to the logits, or None. Entries equal to 1
            push the corresponding logit down by 1e9 before the softmax.

    Returns:
        Tuple ``(output, attention_weights)``.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    # Scale the raw q.k scores by sqrt(dk).
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        logits += (mask * -1e9)

    # Normalise over the key axis.
    weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(weights, v), weights

In [89]:
def print_out(q, k, v):
    """Run unmasked scaled dot-product attention and print weights, then output."""
    attended, weights = scaled_dot_product_attention(q, k, v, None)
    for heading, value in (('Attention weights are:', weights),
                           ('Output is:', attended)):
        print(heading)
        print(value)

In [90]:
class MultiHeadAttention(tf.keras.layers.Layer):
  """Multi-head attention built on scaled_dot_product_attention.

  The model width ``d_model`` is split evenly across ``num_heads`` heads,
  each of width ``depth = d_model // num_heads``.
  """

  def __init__(self, d_model, num_heads):
    """Create the q/k/v projections and the output projection.

    Raises:
        AssertionError: if ``d_model`` is not divisible by ``num_heads``.
    """
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Linear projections for queries, keys and values.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied after the heads are concatenated.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Split the last dimension into (num_heads, depth).

    The result is transposed to shape (batch_size, num_heads, seq_len, depth).
    """
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Apply multi-head attention.

    Note the argument order: values, keys, queries, then the mask.

    Returns:
        Tuple ``(output, attention_weights)`` where output has shape
        (batch_size, seq_len_q, d_model) and attention_weights has shape
        (batch_size, num_heads, seq_len_q, seq_len_k).
    """
    batch_size = tf.shape(q)[0]

    # Project to d_model, then reshape to (batch_size, num_heads, seq_len, depth).
    q = self.split_heads(self.wq(q), batch_size)
    k = self.split_heads(self.wk(k), batch_size)
    v = self.split_heads(self.wv(v), batch_size)

    # per_head.shape == (batch_size, num_heads, seq_len_q, depth)
    per_head, attention_weights = scaled_dot_product_attention(q, k, v, mask)

    # Move the head axis next to depth: (batch_size, seq_len_q, num_heads, depth).
    per_head = tf.transpose(per_head, perm=[0, 2, 1, 3])

    # Merge heads back into one axis: (batch_size, seq_len_q, d_model).
    merged = tf.reshape(per_head, (batch_size, -1, self.d_model))

    return self.dense(merged), attention_weights

In [91]:
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer feed-forward block: Dense(dff, relu) -> Dense(d_model)."""
  expand = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  project = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([expand, project])

In [92]:
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention and a feed-forward network.

  Each sub-block is followed by dropout, a residual connection and layer
  normalisation.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    """Create the sub-layers.

    Args:
        d_model: Model width.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward network.
        rate: Dropout rate used by both dropout layers.
    """
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Transform ``x``; the output keeps shape (batch_size, input_seq_len, d_model)."""
    # Self-attention: the same tensor is used as values, keys and queries.
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    normed_attention = self.layernorm1(x + attended)

    # Position-wise feed-forward network with its own residual connection.
    transformed = self.dropout2(self.ffn(normed_attention), training=training)
    return self.layernorm2(normed_attention + transformed)

In [118]:
def fc_layer(dff):
  """Return the classification head: Dense(dff, relu) -> Flatten -> Dense(2, softmax)."""
  hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  flatten = tf.keras.layers.Flatten()  # collapses every non-batch axis into one
  classifier = tf.keras.layers.Dense(2, activation='softmax')  # (batch_size, 2) class probabilities
  return tf.keras.Sequential([hidden, flatten, classifier])

In [120]:
class Encoder(tf.keras.layers.Layer):
  """Token embedding + positional encoding + encoder stack + 2-class softmax head."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               maximum_position_encoding, rate=0.1):
    """Create the sub-layers.

    Args:
        num_layers: Number of stacked EncoderLayer blocks.
        d_model: Embedding / model width.
        num_heads: Attention heads per block.
        dff: Hidden width of each feed-forward network and of the head.
        input_vocab_size: Vocabulary size for the embedding layer.
        maximum_position_encoding: Number of rows in the positional table.
        rate: Dropout rate.
    """
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding,
                                            self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.fc_layer = fc_layer(dff)

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training=True, mask=None):
    """Map token ids to the output of the classification head.

    Note that ``training`` defaults to True, so dropout is active unless
    the caller passes ``training=False``.
    """
    seq_len = tf.shape(x)[1]

    # Scale the embeddings by sqrt(d_model), then add the positional encoding.
    scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x = self.embedding(x) * scale  # (batch_size, input_seq_len, d_model)
    x = x + self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for encoder_block in self.enc_layers:
      x = encoder_block(x, training, mask)

    # The head flattens the sequence axis and ends in Dense(2, softmax).
    return self.fc_layer(x)

In [121]:
class Transformer():
    """Wrapper that builds, trains and plots the encoder classifier.

    Attributes:
        model: The network returned by design_model(), or None before it runs.
        history: Result of the Keras fit() call in train_model(), or None.
        config: Training configuration (epochs, batch_size, learningRate,
            valRate, loss, metrics, optimizer).
        emb: Embedding configuration (sequenceLength, wordCount, wordDim).
        name: Free-form label.
    """

    def __init__(self,config,wordEmbedding):
        self.model = None
        self.history = None
        self.config = config
        self.emb = wordEmbedding
        # NOTE(review): this label looks copied from a TextCNN notebook. It is
        # left unchanged because code outside this view may read it.
        self.name = "TextCNN"

    def _model_missing(self):
        """Print the usage hint and return True when no model has been built yet."""
        if self.model is None:
            print("Call design_modelXX() to build the model first.")
            return True
        return False

    def design_model(self,hiden_size=64):
        """Build the encoder classifier, store it on ``self.model`` and return it.

        Args:
            hiden_size: Hidden width (dff) of the feed-forward networks and head.
        """
        # The model width comes from the embedding config rather than a literal,
        # so changing WordEmbedding.wordDim is enough to resize the network.
        self.model = Encoder(num_layers=2, d_model=self.emb.wordDim, num_heads=8,
                             dff=hiden_size, input_vocab_size=self.emb.wordCount,
                             maximum_position_encoding=self.emb.sequenceLength)
        return self.model

    def compile_model(self):
        """Compile ``self.model`` with the optimizer, loss and metrics from the config."""
        if self._model_missing():
            return
        # NOTE(review): design_model() returns a keras Layer subclass; compile()
        # is a keras Model API, so this path likely fails. Confirm, or wrap the
        # encoder in a Model before relying on it.
        self.model.compile(optimizer=self.config.optimizer,loss=self.config.loss,metrics=self.config.metrics)

    def train_model(self,x,y):
        """Compile and train with Keras fit(), storing the result in ``self.history``."""
        if self._model_missing():
            return
        self.compile_model()
        self.history = self.model.fit(x=x,y=y,batch_size=self.config.batch_size,epochs=self.config.epochs,validation_split=self.config.valRate)

    def train_model_custom(self,x,y):
        """Train with a hand-written loop and print progress.

        Each epoch prints the training accuracy and then the validation
        accuracy. Returns None; ``self.history`` is not populated by this path.

        Args:
            x: Input samples, split by train_test_split into train/validation.
            y: Labels aligned with ``x``.
        """
        # Stop here when there is no model instead of failing further down.
        if self._model_missing():
            return

        # Validation share follows the config (same setting train_model uses).
        x_train,x_val,y_train,y_val = train_test_split(x,y,test_size=self.config.valRate)
        print(x_train.shape)
        print(x_val.shape)
        train_dataset = tf.data.Dataset.from_tensor_slices((x_train,y_train)).batch(self.config.batch_size)
        val_dataset = tf.data.Dataset.from_tensor_slices((x_val,y_val)).batch(self.config.batch_size)

        # Loss
        loss_fn = K.losses.get(self.config.loss[0])
        # Optimizer: Adam with the configured learning rate.
        optimizer = K.optimizers.Adam(learning_rate=self.config.learningRate)
        # Metrics
        train_acc_metric = K.metrics.SparseCategoricalAccuracy()
        val_acc_metric = K.metrics.SparseCategoricalAccuracy()

        for times in range(self.config.epochs):
            # Training pass
            for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
                with tf.GradientTape() as tp:
                    # The model ends in a softmax, so these are class probabilities.
                    probs = self.model(x_batch_train, training=True)
                    loss = loss_fn(y_batch_train,probs)
                grads = tp.gradient(loss,self.model.trainable_weights)
                optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
                train_acc_metric.update_state(y_batch_train,probs)

                if step % 50 == 0:
                    print('\r','Training loss (for one batch) at step %s: %s' % (step, float(tf.reduce_mean(loss))),end="",flush=True)
            print()
            # Report once at the end of every epoch.
            train_acc = train_acc_metric.result()
            print('Training     acc over epoch %s: %s' % (times,float(train_acc)))
            # Reset training metrics at the end of each epoch
            train_acc_metric.reset_states()

            # Validation pass: training=False turns dropout off, so the reported
            # accuracy is not perturbed by random unit dropping.
            for x_batch_val, y_batch_val in val_dataset:
                val_probs = self.model(x_batch_val, training=False)
                val_acc_metric.update_state(y_batch_val,val_probs)

            val_acc = val_acc_metric.result()
            print('Training val_acc over epoch %s: %s '%(times,float(val_acc)))
            val_acc_metric.reset_states()

    def save_model(self):
        """Placeholder: only prints the usage hint when no model exists."""
        self._model_missing()

    def eval_model(self):
        """Placeholder: only prints the usage hint when no model exists."""
        self._model_missing()

    def view_train(self):
        """Plot training and validation loss per epoch from ``self.history``."""
        if self.history is None:
            print("Model has not been trained, train it first")
            return

        train = self.history.history["loss"]
        valid = self.history.history["val_loss"]
        name = 'loss'
        plt.title('The %s with epoch runs'%name,fontsize=30)
        plt.xlabel('epoch',fontsize=20)
        plt.ylabel(name,fontsize=20)
        plt.xticks(fontsize=20)
        plt.yticks(fontsize=20)
        plt.plot(train,label=name)
        plt.plot(valid,label="val_"+name)
        plt.legend()
        plt.gcf().set_size_inches(15,4)
        plt.show()

In [122]:
# Instantiate both configuration holders and hand them to the model wrapper.
train_config = TrainingConfig()
word_embedding = WordEmbedding()
model = Transformer(config=train_config, wordEmbedding=word_embedding)

In [123]:
class Dataset:
    """Loads every .npy file listed in a config object's ``source`` attribute."""

    def __init__(self, config):
        # Any object exposing an iterable ``source`` of file paths.
        self.config = config

    def load(self):
        """Return the loaded arrays as a list, in the same order as ``config.source``."""
        # NOTE(review): allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code. Only point this at trusted files.
        return [
            np.load(path, allow_pickle=True, fix_imports=True)
            for path in self.config.source
        ]

In [124]:
# Load the four arrays; the unpacking order mirrors DataSourceConfig.source.
data = Dataset(config=DataSourceConfig)
(train_data, test_data,
 train_labels, test_labels) = data.load()

In [125]:
# Build the encoder with the default hidden size; the cell echoes the returned object.
model.design_model()

<__main__.Encoder at 0x153c60450>

In [None]:
# Run the hand-written training loop on the training split.
model.train_model_custom(x=train_data, y=train_labels)

(22500, 256)
(2500, 256)
 Training loss (for one batch) at step 50: 0.7049741744995117