In [1]:
import os, datetime, random, json, warnings
import shutil
from collections import Counter
from math import sqrt

import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score


  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Silence every warning for the rest of the notebook.
warnings.filterwarnings("ignore")
# The data folder is a sibling of the current working directory. os.path does the
# parent lookup, so the result no longer depends on the path containing "\\"
# (on Windows the resulting string is the same as before).
directory_path = os.path.join(os.path.dirname(os.getcwd()), "data")
os.path.exists(directory_path)

True

In [3]:
class TrainingConfig:
    """Settings read by the training loop."""

    # step size handed to the Adam optimizer
    learningRate = 0.001
    # number of passes over the training split
    epochs = 10
    # evaluate / write a checkpoint whenever the global step is a multiple of these
    evaluateEvery = 100
    checkpointEvery = 100


class ModelConfig:
    """Architecture hyper-parameters of the transformer classifier."""

    # width of the word vectors
    embeddingSize = 200

    # Number of filters of the inner 1-D convolution of the feed-forward block.
    # (Original note: the outer convolution's filter count should equal embeddingSize.)
    filters = 128

    # number of attention heads
    numHeads = 8
    # number of stacked transformer blocks
    numBlocks = 1
    # small constant added to the variance inside layer normalisation
    epsilon = 1e-8

    # keep probability (not drop rate) of the dropout applied to the attention output
    keepProb = 0.9

    # keep probability fed through the model's dropoutKeepProb placeholder while training
    dropoutKeepProb = 0.5
    # weight of the L2 penalty added to the loss
    l2RegLambda = 0.0
    
    
class Config:
    """Top-level configuration: data locations, batching and the nested sub-configs."""

    # every review is cut or padded to this many tokens
    sequenceLength = 200
    batchSize = 128
    # fraction of the samples that goes into the training split
    rate = 0.8

    # 1 selects the binary (single-logit) setup; any larger value selects multi-class
    numClasses = 1

    dataSource = directory_path + "\\preProcess\\labeledTrain.csv"
    stopWordSource = directory_path + "\\english"

    model = ModelConfig()
    training = TrainingConfig()


# Single shared configuration object used by the dataset, the model and the training loop.
config = Config()

In [6]:
class Dataset(object):
    """Reads the labelled reviews and converts them into fixed-length index arrays.

    After ``dataGen()`` has run, the instance exposes:
      * ``trainReviews`` / ``evalReviews`` - int64 arrays of word indices,
        one row per review, each row ``sequenceLength`` long
      * ``trainLabels`` / ``evalLabels``   - float32 arrays of label indices
      * ``wordEmbedding``                  - embedding matrix aligned with the vocabulary
      * ``labelList``                      - list of label indices ``0 .. n-1``
    """

    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        self._stopWordDict = dict()
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        self.wordEmbedding = None
        self.labelList = []

    @property
    def stopWordDict(self):
        """Stop-word lookup; public alias of ``_stopWordDict`` kept for existing callers."""
        return self._stopWordDict

    @stopWordDict.setter
    def stopWordDict(self, value):
        self._stopWordDict = value

    def _readData(self, file_path):
        """Read the CSV and return ``(reviews, labels)``.

        ``reviews`` is a list of whitespace-tokenised reviews. ``labels`` comes from
        the "sentiment" column when ``numClasses == 1`` and from "rate" otherwise.

        Raises:
            ValueError: if ``config.numClasses`` is smaller than 1.
        """
        numClasses = self.config.numClasses
        if numClasses < 1:
            # Previously this surfaced as an UnboundLocalError on `labels`.
            raise ValueError("numClasses must be >= 1, got {}".format(numClasses))

        df = pd.read_csv(file_path)
        if numClasses == 1:
            labels = df["sentiment"].tolist()
        else:
            labels = df["rate"].tolist()

        reviews = [line.strip().split() for line in df["review"].tolist()]

        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """Map every raw label to its index in ``label2idx``."""
        # The mapped ids are returned; the original computed them and then
        # returned the untouched input list.
        labelIds = [label2idx[label] for label in labels]
        return labelIds

    def _wordToIndex(self, reviews, word2idx):
        """Map every token to its vocabulary index, falling back to the "UNK" index."""
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """Truncate/pad every review to ``sequenceLength`` and split into train and eval.

        The first ``int(len(x) * rate)`` samples form the training split, the rest
        the evaluation split (no shuffling happens here).
        """
        seqLen = self._sequenceLength
        reviews = []
        for review in x:
            if len(review) > seqLen:
                reviews.append(review[:seqLen])
            else:
                reviews.append(review + [word2idx["PAD"]] * (seqLen - len(review)))

        trainIndex = int(len(x) * rate)
        trainReviews = np.asarray(reviews[: trainIndex], dtype="int64")
        trainLabels = np.array(y[: trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex :], dtype="int64")
        evalLabels = np.array(y[trainIndex :], dtype="float32")

        return trainReviews, trainLabels, evalReviews, evalLabels

    def _getWordEmbedding(self, words):
        """Return ``(vocab, embeddingMatrix)`` for the words known to the word2vec model.

        Index 0 is "PAD" (all zeros) and index 1 is "UNK" (random normal). Words that
        are missing from the pretrained vectors are reported and skipped.
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(directory_path + "\\word2vec\\word2Vec.bin",
                                                                 binary=True)
        vocab = ["PAD", "UNK"]
        wordEmbedding = [np.zeros(self._embeddingSize), np.random.randn(self._embeddingSize)]

        for word in words:
            try:
                # Index the KeyedVectors object directly: the `.wv` indirection is
                # deprecated, and combined with the former bare `except` its absence
                # would silently drop every word.
                vector = wordVec[word]
            except KeyError:
                print("{} is not exist in vocab...".format(word))
                continue
            vocab.append(word)
            wordEmbedding.append(vector)

        return vocab, np.array(wordEmbedding)

    def _readStopWord(self, stopWordPath):
        """Load the stop-word file (one word per line) into ``_stopWordDict``."""
        with open(stopWordPath, "r") as f:
            stopWordList = f.read().splitlines()
        self._stopWordDict = dict(zip(stopWordList, range(len(stopWordList))))

    def _genVocabulary(self, reviews, labels):
        """Build ``word2idx`` / ``label2idx``, store the embedding matrix and dump both maps.

        Only non-stop-words that occur at least 5 times are candidates for the
        vocabulary. Both mappings are written as JSON under ``wordJson``.
        """
        allWords = [word for review in reviews for word in review]
        subWords = [word for word in allWords if word not in self._stopWordDict]
        wordCount = Counter(subWords)
        sortedWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortedWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding
        word2idx = dict(zip(vocab, range(len(vocab))))

        # Sorted so that the label -> index mapping is the same on every run
        # (plain set iteration order is not guaranteed).
        uniqueLabel = sorted(set(labels))
        label2idx = dict(zip(uniqueLabel, range(len(uniqueLabel))))
        self.labelList = list(range(len(uniqueLabel)))

        with open(directory_path + "\\wordJson\\word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)
        with open(directory_path + "\\wordJson\\label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def dataGen(self):
        """Run the whole pipeline and populate the train/eval attributes."""
        self._readStopWord(self._stopWordSource)
        reviews, labels = self._readData(self._dataSource)
        word2idx, label2idx = self._genVocabulary(reviews, labels)

        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)

        (self.trainReviews, self.trainLabels,
         self.evalReviews, self.evalLabels) = self._genTrainEvalData(reviewIds, labelIds, word2idx, self._rate)

# Build the dataset wrapper and run the full preprocessing pipeline
# (stop words -> CSV -> vocabulary/embeddings -> padded train/eval arrays).
data = Dataset(config)
data.dataGen()

In [27]:
def nextBatch(x, y, batchSize):
    """Yield shuffled ``(batchX, batchY)`` pairs of exactly ``batchSize`` rows.

    ``x`` and ``y`` are shuffled with the same random permutation. Rows left over
    after the last full batch are not yielded. ``batchX`` is int64 and ``batchY``
    is float32.
    """
    order = np.arange(len(x))
    np.random.shuffle(order)
    shuffledX, shuffledY = x[order], y[order]

    fullBatches = len(x) // batchSize
    for start in range(0, fullBatches * batchSize, batchSize):
        stop = start + batchSize
        yield (np.array(shuffledX[start:stop], dtype="int64"),
               np.array(shuffledY[start:stop], dtype="float32"))

In [28]:
def fixedPositionEmbedding(batchSize, sequenceLen):
    """Build a one-hot position embedding for every sample of a batch.

    :param batchSize: number of samples in the batch
    :param sequenceLen: number of positions per sample
    :return: float32 array of shape [batchSize, sequenceLen, sequenceLen] in which
             every sample is the identity matrix, i.e. position ``p`` is encoded
             by a one-hot vector with a 1 at index ``p``
    """
    # Allocate the result once and set the diagonals, instead of building
    # batchSize * sequenceLen small arrays in nested Python loops. This also keeps
    # the result rank-3 when batchSize or sequenceLen is 0.
    embeddedPosition = np.zeros((batchSize, sequenceLen, sequenceLen), dtype="float32")
    steps = np.arange(sequenceLen)
    embeddedPosition[:, steps, steps] = 1

    return embeddedPosition

In [29]:
# Pull the prepared arrays out of the dataset object so later cells can use short names.
trainReviews, trainLabels = data.trainReviews, data.trainLabels
evalReviews, evalLabels = data.evalReviews, data.evalLabels
wordEmbedding, labelList = data.wordEmbedding, data.labelList

# One-hot position embedding sized for one full batch; it is fed unchanged at every step.
embeddedPosition = fixedPositionEmbedding(config.batchSize, config.sequenceLength)

In [30]:
# Sanity check: a batch of 4 samples with 6 positions each; the cell displays the shape.
a = fixedPositionEmbedding(4, 6)
a.shape

(4, 6, 6)

In [31]:
def mean(item: list) -> float:
    """
    Arithmetic mean of the elements of a list.
    :param item: list of numbers
    :return: the mean, or 0 when the list is empty
    """
    if len(item) > 0:
        return sum(item) / len(item)
    return 0


def accuracy(pred_y, true_y):
    """
    Accuracy for binary and multi-class predictions.
    :param pred_y: predicted labels; a list of one-element lists is flattened first
    :param true_y: ground-truth labels, indexable in parallel with pred_y
    :return: fraction of positions where prediction and truth are equal,
             or 0 when there are no predictions
    """
    # Check for empty input before touching pred_y[0]; the original raised
    # IndexError here, which made its own empty-input fallback unreachable.
    if len(pred_y) == 0:
        return 0
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    corr = sum(1 for i in range(len(pred_y)) if pred_y[i] == true_y[i])
    return corr / len(pred_y)


def binary_precision(pred_y, true_y, positive=1):
    """
    Precision of the positive class.
    :param pred_y: predicted labels
    :param true_y: ground-truth labels
    :param positive: label value that counts as the positive class
    :return: correct positive predictions / all positive predictions,
             or 0 when nothing was predicted positive
    """
    # positions that were predicted as positive
    predicted_pos = [i for i in range(len(pred_y)) if pred_y[i] == positive]
    # ... and how many of those agree with the ground truth
    hits = sum(1 for i in predicted_pos if pred_y[i] == true_y[i])

    if predicted_pos:
        return hits / len(predicted_pos)
    return 0


def binary_recall(pred_y, true_y, positive=1):
    """
    Recall of the positive class.
    :param pred_y: predicted labels
    :param true_y: ground-truth labels
    :param positive: label value that counts as the positive class
    :return: correctly predicted positives / all true positives,
             or 0 when the ground truth contains no positive
    """
    # positions whose ground truth is positive
    actual_pos = [i for i in range(len(pred_y)) if true_y[i] == positive]
    # ... and how many of those were predicted correctly
    hits = sum(1 for i in actual_pos if pred_y[i] == true_y[i])

    if actual_pos:
        return hits / len(actual_pos)
    return 0



def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
    """
    F-beta score of the positive class.
    :param pred_y: predicted labels
    :param true_y: ground-truth labels
    :param beta: weight of recall relative to precision
    :param positive: label value that counts as the positive class
    :return: (1 + beta^2) * P * R / (beta^2 * P + R), or 0 when the denominator is 0
    """
    precision = binary_precision(pred_y, true_y, positive)
    recall = binary_recall(pred_y, true_y, positive)

    # Handle the only expected failure (zero denominator) explicitly instead of
    # hiding every possible error behind a bare `except`.
    denominator = beta * beta * precision + recall
    if denominator == 0:
        return 0
    return (1 + beta * beta) * precision * recall / denominator

def multi_precision(pred_y, true_y, labels):
    """
    Macro-averaged precision over several classes.
    :param pred_y: predicted labels; a list of one-element lists is flattened first
    :param true_y: ground-truth labels
    :param labels: list of class labels to average over
    :return: mean of the per-class precisions
    """
    # Guard the pred_y[0] probe so that empty input does not raise IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    precisions = [binary_precision(pred_y, true_y, label) for label in labels]
    return mean(precisions)




def multi_recall(pred_y, true_y, labels):
    """
    Macro-averaged recall over several classes.
    :param pred_y: predicted labels; a list of one-element lists is flattened first
    :param true_y: ground-truth labels
    :param labels: list of class labels to average over
    :return: mean of the per-class recalls
    """
    # Guard the pred_y[0] probe so that empty input does not raise IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    recalls = [binary_recall(pred_y, true_y, label) for label in labels]
    return mean(recalls)


def multi_f_beta(pred_y, true_y, labels, beta=1.0):
    """
    Macro-averaged F-beta score over several classes.
    :param pred_y: predicted labels; a list of one-element lists is flattened first
    :param true_y: ground-truth labels
    :param labels: list of class labels to average over
    :param beta: weight of recall relative to precision
    :return: mean of the per-class F-beta scores
    """
    # Guard the pred_y[0] probe so that empty input does not raise IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
    return mean(f_betas)




def get_binary_metrics(pred_y, true_y, f_beta=1.0):
    """
    Collect the binary-classification metrics in one call.
    :param pred_y: predicted labels
    :param true_y: ground-truth labels
    :param f_beta: beta used for the F-beta score
    :return: tuple (accuracy, recall, precision, f_beta score)
    """
    return (
        accuracy(pred_y, true_y),
        binary_recall(pred_y, true_y),
        binary_precision(pred_y, true_y),
        binary_f_beta(pred_y, true_y, f_beta),
    )


def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
    """
    Collect the multi-class metrics in one call.
    :param pred_y: predicted labels
    :param true_y: ground-truth labels
    :param labels: list of class labels to average over
    :param f_beta: beta used for the F-beta score
    :return: tuple (accuracy, recall, precision, f_beta score)
    """
    return (
        accuracy(pred_y, true_y),
        multi_recall(pred_y, true_y, labels),
        multi_precision(pred_y, true_y, labels),
        multi_f_beta(pred_y, true_y, labels, f_beta),
    )


In [32]:
class Transformer(object):
    """Transformer-encoder text classifier built with the TensorFlow 1.x graph API.

    Word vectors are concatenated with a one-hot position embedding, passed through
    ``config.model.numBlocks`` blocks of multi-head self-attention plus a
    convolutional feed-forward layer, flattened, and projected to
    ``config.numClasses`` logits.

    Graph endpoints created by the constructor:
      * placeholders ``inputX``, ``inputY``, ``dropoutKeepProb``, ``embeddedPosition``
      * ``logits``, ``predictions`` and the scalar ``loss``
    """

    def __init__(self, config, wordEmbedding):
        """
        :param config: configuration object (sequenceLength, numClasses, batchSize, model.*)
        :param wordEmbedding: initial embedding matrix, one row per vocabulary entry
        """
        self.config = config
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        # Defaults to 1.0 (no dropout) when it is not fed. The tensor name keeps its
        # original spelling so lookups by name continue to work.
        self.dropoutKeepProb = tf.placeholder_with_default(tf.constant(1.0), shape=[], name="dropoutKeppProb")
        self.embeddedPosition = tf.placeholder(tf.float32, [None, config.sequenceLength, config.sequenceLength],
                                              name="embeddedPosition")

        l2Loss = tf.constant(0.0)

        # The position embedding of the word-embedding layer can be defined in two ways:
        #   a) feed a fixed one-hot encoding and concatenate it with the word vectors
        #   b) use the sinusoidal scheme from "Attention Is All You Need"
        with tf.name_scope("embedding"):
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            self.embedded = tf.nn.embedding_lookup(self.W, self.inputX)

            # Variant a) is used here; it performs well.
            self.embeddedWords = tf.concat([self.embedded, self.embeddedPosition], -1)

        with tf.name_scope("transformer"):
            for index in range(config.model.numBlocks):
                with tf.name_scope("transformer-{}".format(index + 1)):
                    # [batch_size, sequence_length, embeddingSize + sequenceLength]
                    multiHeadAtt = self._multiheadAttention(rawKeys=self.inputX,
                                                           queries=self.embeddedWords,
                                                           keys=self.embeddedWords)

                    # [batch_size, sequence_length, embeddingSize + sequenceLength]
                    self.embeddedWords = self._feedForward(multiHeadAtt,
                                                          [config.model.filters, config.model.embeddingSize + config.sequenceLength])

            outputs = tf.reshape(self.embeddedWords, [-1, config.sequenceLength * (config.model.embeddingSize + config.sequenceLength)])

        outputSize = outputs.get_shape()[-1].value

        with tf.name_scope("dropout"):
            # Apply the dropout that the fed dropoutKeepProb is meant to control;
            # previously the placeholder was fed but never used.
            outputs = tf.nn.dropout(outputs, keep_prob=self.dropoutKeepProb)

            outputW = tf.get_variable("outputW", shape=[outputSize, config.numClasses],
                                     initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")

            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(outputs, outputW, outputB, name="logits")

            if config.numClasses == 1:
                # single logit: predict 1.0 when the logit is non-negative
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")

        with tf.name_scope("loss"):
            if config.numClasses == 1:
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits,
                                                                labels=tf.cast(tf.reshape(self.inputY, [-1, 1]),
                                                                              dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                       labels=self.inputY)

            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss

    def _layerNormalization(self, inputs, scope="layerNorm"):
        """Layer normalisation over the last axis with learnable scale and shift.

        ``scope`` is accepted but not used.
        """
        epsilon = self.config.model.epsilon

        inputsShape = inputs.get_shape()
        paramsShape = inputsShape[-1:]

        # Unlike batch norm, the statistics are computed over the last axis only;
        # keep_dims=True keeps that axis with size 1 so they broadcast against `inputs`.
        mean, variance = tf.nn.moments(inputs, [-1], keep_dims=True)

        beta = tf.Variable(tf.zeros(paramsShape))
        gamma = tf.Variable(tf.ones(paramsShape))

        normalized = (inputs - mean) / ((variance + epsilon) ** .5)
        outputs = gamma * normalized + beta

        return outputs

    def _multiheadAttention(self, rawKeys, queries, keys, numUnits=None,
                            causality=False, scope="multiheadAttention"):
        """Multi-head scaled dot-product attention with residual connection and layer norm.

        :param rawKeys: raw token indices, used to build the padding mask; the keys
                        themselves already carry the position embedding, so padded
                        positions are no longer all-zero there
        :param queries: query tensor
        :param keys: key tensor (also used as values)
        :param numUnits: projection width; defaults to the last dimension of ``queries``
        :param causality: when True, positions may only attend to themselves and
                          earlier positions
        :param scope: accepted but not used
        :return: tensor with the same shape as ``queries``
        """
        numHeads = self.config.model.numHeads
        keepProb = self.config.model.keepProb
        if numUnits is None:
            # No width given: use the last dimension of the queries.
            numUnits = queries.get_shape().as_list()[-1]

        # Project first, split into heads afterwards (in principle equivalent to the
        # per-head projections of the paper). Q, K, V have numUnits as last dimension.
        Q = tf.layers.dense(queries, numUnits, activation=tf.nn.relu)
        K = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)
        V = tf.layers.dense(keys, numUnits, activation=tf.nn.relu)

        # Split the last axis into numHeads pieces and stack them along the first axis:
        # [batch_size * numHeads, sequenceLength, numUnits / numHeads]
        Q_ = tf.concat(tf.split(Q, numHeads, axis=-1), axis=0)
        K_ = tf.concat(tf.split(K, numHeads, axis=-1), axis=0)
        V_ = tf.concat(tf.split(V, numHeads, axis=-1), axis=0)

        # Dot product between queries and keys:
        # [batch_size * numHeads, queries_len, keys_len]
        similary = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))

        # Scale by the square root of the per-head width.
        scaledSimilary = similary / (K_.get_shape().as_list()[-1] ** 0.5)

        # Padding tokens carry no information and should get (almost) zero attention
        # weight. Their positions are taken from the raw token indices, where index 0
        # marks padding.

        # [batch_size * numHeads, keys_len]
        keyMasks = tf.tile(rawKeys, [numHeads, 1])
        # [batch_size * numHeads, queries_len, keys_len]
        keyMasks = tf.tile(tf.expand_dims(keyMasks, 1), [1, tf.shape(queries)[1], 1])

        # Very large negative score so that masked positions vanish in the softmax.
        # (The former `ones ** (-2 ** (32 + 1))` always evaluated to 1, i.e. it masked nothing.)
        paddings = tf.ones_like(scaledSimilary) * (-2 ** 32 + 1)

        # [batch_size * numHeads, queries_len, keys_len]
        maskedSimilary = tf.where(tf.equal(keyMasks, 0), paddings, scaledSimilary)

        # Causal masking: attend to preceding context only. Text classification only
        # needs the encoder, so this stays off by default.
        if causality:
            # [queries_len, keys_len]
            diagVals = tf.ones_like(maskedSimilary[0, :, :])

            # lower-triangular matrix, [queries_len, keys_len]
            tril = tf.contrib.linalg.LinearOperatorTril(diagVals).to_dense()

            # [batch_size * numHeads, queries_len, keys_len]
            masks = tf.tile(tf.expand_dims(tril, 0), [tf.shape(maskedSimilary)[0], 1, 1])
            paddings = tf.ones_like(masks) * (-2 ** 32 + 1)

            # [batch_size * numHeads, queries_len, keys_len]
            maskedSimilary = tf.where(tf.equal(masks, 0), paddings, maskedSimilary)

        weights = tf.nn.softmax(maskedSimilary)
        # weighted sum of the values
        outputs = tf.matmul(weights, V_)
        # Undo the head split: back to [batch_size, sequence_length, numUnits]
        outputs = tf.concat(tf.split(outputs, numHeads, axis=0), axis=2)
        # NOTE(review): keepProb is a graph constant, so this dropout is also active
        # during evaluation - confirm whether that is intended.
        outputs = tf.nn.dropout(outputs, keep_prob=keepProb)

        # residual connection: H(x) = F(x) + x
        outputs += queries
        # normalization layer
        outputs = self._layerNormalization(outputs)

        return outputs

    def _feedForward(self, inputs, filters, scope="multiheadAttention"):
        """Position-wise feed-forward block implemented with two kernel-size-1 convolutions.

        :param inputs: input tensor
        :param filters: ``[inner_filters, outer_filters]``; ``outer_filters`` must equal
                        the last dimension of ``inputs`` for the residual addition
        :param scope: accepted but not used
        :return: tensor with the same shape as ``inputs``
        """
        # inner layer
        params = {
            "inputs": inputs,
            "filters": filters[0],
            "kernel_size": 1,
            "activation": tf.nn.relu,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        # outer layer
        params = {
            "inputs": outputs,
            "filters": filters[1],
            "kernel_size": 1,
            "activation": None,
            "use_bias": True
        }
        outputs = tf.layers.conv1d(**params)

        # residual connection
        outputs += inputs
        # layer normalisation
        outputs = self._layerNormalization(outputs)

        return outputs

    def _positionEmbedding(self, scope="positionEmbedding"):
        """Build the fixed sinusoidal position embedding (variant b of the constructor note).

        Not called by the constructor. ``scope`` is accepted but not used.

        :return: float32 tensor of shape [batchSize, sequenceLen, embeddingSize]
        """
        batchSize = self.config.batchSize
        sequenceLen = self.config.sequenceLength
        embeddingSize = self.config.model.embeddingSize

        # position indices, repeated for every sample of the batch
        positionIndex = tf.tile(tf.expand_dims(tf.range(sequenceLen), 0), [batchSize, 1])

        # argument of the sine / cosine for every (position, dimension) pair
        positionEmbedding = np.array([[pos / np.power(10000, (i - i % 2) / embeddingSize) for i in range(embeddingSize)]
                                      for pos in range(sequenceLen)])

        # even dimensions use sin(), odd dimensions use cos()
        positionEmbedding[:, 0::2] = np.sin(positionEmbedding[:, 0::2])
        positionEmbedding[:, 1::2] = np.cos(positionEmbedding[:, 1::2])

        # tf.cast turns the NumPy table into a float32 tensor
        # (the former `np.cast(..., dtype=tf.float32)` is not a valid call).
        positionEmbedding_ = tf.cast(positionEmbedding, dtype=tf.float32)

        # [batchSize, sequenceLen, embeddingSize]
        positionEmbedded = tf.nn.embedding_lookup(positionEmbedding_, positionIndex)

        return positionEmbedded
    
    

In [None]:
# Build the graph, then train the model with periodic evaluation and checkpointing.
with tf.Graph().as_default():
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=sess_config)
    with sess.as_default():
        transformer = Transformer(config, wordEmbedding)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)

        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        gradsAndVars = optimizer.compute_gradients(transformer.loss)
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # Histogram and sparsity summaries for every gradient that exists.
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("writing to {}".format(outDir))

        lossSummary = tf.summary.scalar("loss", transformer.loss)
        summaryOp = tf.summary.merge_all()
        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)
        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        savedModelPath = "\\".join(directory_path.split("\\")[:-1]) + "\\model\\transformer\\savedModel"
        if os.path.exists(savedModelPath):
            # rmtree also removes a directory that still has content;
            # os.rmdir raises OSError in that case.
            shutil.rmtree(savedModelPath)
        # The checkpoint prefix does not change during training, so build it once.
        checkpointPrefix = "\\".join(savedModelPath.split("\\")[:-1]) + "\\model\\transformer"

        sess.run(tf.global_variables_initializer())

        def trainStep(batchX, batchY):
            """Run one optimisation step and return (loss, acc, recall, prec, f_beta)."""
            feed_dict = {
                transformer.inputX: batchX,
                transformer.inputY: batchY,
                transformer.dropoutKeepProb: config.model.dropoutKeepProb,
                transformer.embeddedPosition: embeddedPosition
            }
            _, summary, step, loss, predictions = sess.run(
                [trainOp, summaryOp, globalStep, transformer.loss, transformer.predictions], feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)

            trainSummaryWriter.add_summary(summary, step)

            return loss, acc, recall, prec, f_beta

        def evalStep(batchX, batchY):
            """Evaluate one batch without updating weights; same return tuple as trainStep."""
            feed_dict = {
                transformer.inputX: batchX,
                transformer.inputY: batchY,
                transformer.dropoutKeepProb: 1.0,
                transformer.embeddedPosition: embeddedPosition
            }
            summary, step, loss, predictions = sess.run(
                [summaryOp, globalStep, transformer.loss, transformer.predictions], feed_dict)

            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)

            evalSummaryWriter.add_summary(summary, step)

            return loss, acc, recall, prec, f_beta

        print("{} start to train model...".format(datetime.datetime.now().isoformat()))
        for index in range(config.training.epochs):
            print("the {}-th/{} epoch training...".format(str(index), str(config.training.epochs)))
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, recall, prec, f_beta = trainStep(batchTrain[0], batchTrain[1])
                currentStep = tf.train.global_step(sess, globalStep)
                print("train: {}, step: {}, loss: {}, acc: {}, recall: {}, prec: {}, f_beta: {}".format(
                    datetime.datetime.now().isoformat(), currentStep, loss, acc, recall, prec, f_beta))
                if currentStep % config.training.evaluateEvery == 0:
                    print("start to evaluate...")
                    losses, accs, recalls, precs, f_betas = [], [], [], [], []
                    for evalBatch in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, recall, prec, f_beta = evalStep(evalBatch[0], evalBatch[1])
                        losses.append(loss)
                        accs.append(acc)
                        recalls.append(recall)
                        precs.append(prec)
                        f_betas.append(f_beta)

                    time_str = datetime.datetime.now().isoformat()
                    print("evaluate: \n")
                    print("{}, step: {}, loss: {}, acc: {}, recall: {}, prec: {}, f_beta: {}".format(
                        time_str, currentStep, mean(losses), mean(accs), mean(recalls), mean(precs), mean(f_betas)))

                if currentStep % config.training.checkpointEvery == 0:
                    path = saver.save(sess, checkpointPrefix, global_step=currentStep)
                    print("save model checkpoint to {} \n".format(path))

        # Make sure the last summaries reach disk once training has finished.
        trainSummaryWriter.flush()
        evalSummaryWriter.flush()
        
        