In [1]:
import os, csv, time, datetime, random, json, warnings
import gensim
from collections import Counter
from math import sqrt
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Install a global "ignore" filter so warnings raised from here on are not shown in cell output.
warnings.filterwarnings("ignore")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Data directory: the "data" folder that sits next to the notebook's working directory
# (i.e. <parent of cwd>/data). Built with os.path so it works with any path separator,
# not only Windows back-slashes.
directory_path = os.path.join(os.path.dirname(os.getcwd()), "data")
os.path.exists(directory_path)

True

In [3]:
class TrainingConfig(object):
    """Hyper-parameters of the optimisation loop."""

    # Step size handed to the optimizer.
    learningRate = 0.001
    # Number of full passes over the training split.
    epochs = 4
    # Run an evaluation pass every this many global steps.
    evaluateEvery = 100
    # Checkpoint interval in steps (not referenced by the training loop in this notebook).
    checkpointEvery = 100

    
class ModelConfig(object):
    """Architecture hyper-parameters of the Bi-LSTM + attention model."""

    # Dimensionality of each word vector.
    embeddingSize = 200
    # Units per direction for each stacked Bi-LSTM layer, in stacking order.
    hiddenSizes = [256, 128]
    # Keep probability fed to the dropout layers during training.
    dropoutKeepProb = 0.5
    # Weight of the L2 penalty added to the loss (0.0 disables it).
    l2RegLambda = 0.0

    
class Config(object):
    """Top-level configuration: data locations, batching and nested train/model settings."""

    # Every review is truncated / padded to this many tokens.
    sequenceLength = 200
    batchSize = 128

    # Input files live under the data directory; os.path.join keeps the paths valid
    # on every platform instead of hard-coding Windows separators.
    dataSource = os.path.join(directory_path, "preProcess", "labeledTrain.csv")
    stopWordSource = os.path.join(directory_path, "english")

    # 1 means binary classification; use any larger number for multi-class.
    numClasses = 1
    # Fraction of the samples used for training; the remainder is used for evaluation.
    rate = 0.8
    training = TrainingConfig()
    model = ModelConfig()

    
# Single shared configuration instance used by the dataset, the model and the training loop.
config = Config()

In [12]:
class Dataset(object):
    """
    Loads the labelled review CSV, builds the vocabulary / embedding matrix and
    produces fixed-length, index-encoded train and evaluation splits.

    After ``dataGen()`` the results are available as ``trainReviews``,
    ``trainLabels``, ``evalReviews``, ``evalLabels``, ``wordEmbedding`` and
    ``labelList``.
    """

    def __init__(self, config):
        """
        :param config: object exposing dataSource, stopWordSource, sequenceLength,
                       batchSize, rate, numClasses and model.embeddingSize
        """
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate

        # The rest of the class reads/writes `stopWordDict` and `wordEmbedding`;
        # the older `_stopWordDict` / `embedding` names are kept as aliases so both
        # spellings exist from construction time onwards.
        self.stopWordDict = dict()
        self._stopWordDict = self.stopWordDict
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        self.wordEmbedding = None
        self.embedding = None
        self.labelList = []

    def _readData(self, file_path):
        """
        Read the CSV and return (tokenised reviews, raw labels).

        The label column is "sentiment" for binary classification (numClasses == 1)
        and "rate" for multi-class (numClasses > 1).

        :raises ValueError: if numClasses is smaller than 1
        """
        df = pd.read_csv(file_path)
        if self.config.numClasses == 1:
            labels = df["sentiment"].tolist()
        elif self.config.numClasses > 1:
            labels = df["rate"].tolist()
        else:
            # Previously this fell through and failed later with an UnboundLocalError.
            raise ValueError("numClasses must be >= 1, got {}".format(self.config.numClasses))

        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]

        return reviews, labels

    def _labelToIndex(self, labels, label2idx):
        """Map every raw label to its integer id."""
        return [label2idx[label] for label in labels]

    def _wordToIndex(self, reviews, word2idx):
        """Map every token to its vocabulary id; unknown tokens map to the "UNK" id."""
        unkId = word2idx["UNK"]
        return [[word2idx.get(item, unkId) for item in review] for review in reviews]

    def _genTrainEvalData(self, x, y, word2idx, rate):
        """
        Truncate / right-pad every review to the configured sequence length and
        split the data, in its existing order, into train and evaluation parts.

        :param x: list of index-encoded reviews
        :param y: list of label ids
        :param word2idx: vocabulary mapping (provides the "PAD" id)
        :param rate: fraction of samples that goes to the training split
        :return: trainReviews, trainLabels, evalReviews, evalLabels as numpy arrays
        """
        padId = word2idx["PAD"]
        reviews = []
        for review in x:
            if len(review) > self._sequenceLength:
                reviews.append(review[:self._sequenceLength])
            else:
                reviews.append(review + [padId] * (self._sequenceLength - len(review)))

        trainIndex = int(len(x) * rate)
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.array(y[:trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.array(y[trainIndex:], dtype="float32")

        return trainReviews, trainLabels, evalReviews, evalLabels

    def _genVocabulary(self, reviews, labels):
        """
        Build word2idx / label2idx, store the embedding matrix on the instance and
        dump both mappings as JSON under <directory_path>/wordJson.
        """
        all_words = [word for review in reviews for word in review]
        subWords = [word for word in all_words if word not in self.stopWordDict]

        # Keep only words that occur at least 5 times, most frequent first.
        wordCount = Counter(subWords)
        sortedWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortedWordCount if item[1] >= 5]

        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding
        self.embedding = wordEmbedding
        word2idx = dict(zip(vocab, list(range(len(vocab)))))

        # Sort the labels so the label -> id mapping does not depend on set iteration
        # order (the binary metrics assume the positive class has id 1).
        try:
            uniqueLabel = sorted(set(labels))
        except TypeError:
            # Labels that cannot be ordered keep the previous behaviour.
            uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))

        jsonDir = os.path.join(directory_path, "wordJson")
        # Make sure the output folder exists so the dumps below cannot fail on a fresh checkout.
        os.makedirs(jsonDir, exist_ok=True)

        with open(os.path.join(jsonDir, "word2idx.json"), "w", encoding="utf-8") as f:
            json.dump(word2idx, f)

        with open(os.path.join(jsonDir, "label2idx.json"), "w", encoding="utf-8") as f:
            json.dump(label2idx, f)

        return word2idx, label2idx

    def _getWordEmbedding(self, words):
        """
        Look up the pre-trained vector of every word.

        "PAD" (all zeros) and "UNK" (random normal) are always the first two
        vocabulary entries. Words without a pre-trained vector are reported and skipped.

        :return: (vocab list, embedding matrix as numpy array)
        """
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(
            os.path.join(directory_path, "word2vec", "word2Vec.bin"), binary=True)

        vocab = ["PAD", "UNK"]
        wordEmbedding = [np.zeros(self._embeddingSize), np.random.randn(self._embeddingSize)]

        for word in words:
            try:
                # Index the KeyedVectors directly: the `.wv` indirection does not exist
                # on KeyedVectors in gensim 4, and the former bare `except` turned that
                # AttributeError into "every word is missing".
                vector = wordVec[word]
            except KeyError:
                print("{} is not exist...".format(word))
                continue
            vocab.append(word)
            wordEmbedding.append(vector)

        return vocab, np.array(wordEmbedding)

    def _readStopWord(self, stopWordPath):
        """Read one stop word per line and store them as a {word: line_index} dict."""
        with open(stopWordPath) as f:
            stopWordList = f.read().splitlines()
        self.stopWordDict = dict(zip(stopWordList, list(range(len(stopWordList)))))
        self._stopWordDict = self.stopWordDict

    def dataGen(self):
        """Run the whole pipeline and populate the train / eval arrays on the instance."""
        self._readStopWord(self._stopWordSource)
        reviews, labels = self._readData(self._dataSource)
        word2idx, label2idx = self._genVocabulary(reviews, labels)
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)

        (self.trainReviews, self.trainLabels,
         self.evalReviews, self.evalLabels) = self._genTrainEvalData(reviewIds, labelIds, word2idx, self._rate)
        
# Build the dataset from the shared config and run the full preprocessing pipeline
# (stop words -> CSV -> vocabulary/embedding -> index encoding -> train/eval split).
data = Dataset(config)
data.dataGen()

In [16]:
def nextBatch(x, y, batchSize):
    """
    Yield shuffled mini-batches of (x, y).

    The samples are shuffled once per call with numpy's global RNG; x and y are
    permuted with the same order so rows stay aligned. Only full batches are
    produced: the trailing ``len(x) % batchSize`` samples are dropped.

    :param x: numpy array of inputs, indexed along axis 0
    :param y: numpy array of labels, same length as x
    :param batchSize: number of samples per batch
    :return: generator of (int64 batchX, float32 batchY) pairs
    """
    order = np.arange(len(x))
    np.random.shuffle(order)
    shuffledX, shuffledY = x[order], y[order]

    fullBatches = len(shuffledX) // batchSize
    for start in range(0, fullBatches * batchSize, batchSize):
        stop = start + batchSize
        yield (np.array(shuffledX[start:stop], dtype="int64"),
               np.array(shuffledY[start:stop], dtype="float32"))

In [18]:
def mean(item: list) -> float:
    """
    Arithmetic mean of the elements of a list.

    :param item: list of numbers
    :return: the average, or 0 when the list is empty
    """
    count = len(item)
    if count > 0:
        return sum(item) / count
    return 0


def accuracy(pred_y, true_y):
    """
    Accuracy for binary and multi-class predictions.

    :param pred_y: predictions; either flat values or a list of single-element lists
    :param true_y: ground-truth labels, aligned with pred_y
    :return: fraction of positions where prediction equals truth, 0 for empty input
    """
    total = len(pred_y)
    # Check emptiness first: peeking at pred_y[0] on an empty input raised IndexError
    # before the zero-length guard could ever apply.
    if total == 0:
        return 0

    # Predictions wrapped as [[p], [p], ...] are flattened to their first element.
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    corr = 0
    for i in range(total):
        if pred_y[i] == true_y[i]:
            corr += 1
    return corr / total


def binary_precision(pred_y, true_y, positive=1):
    """
    Precision of the positive class.

    :param pred_y: predictions
    :param true_y: ground-truth labels, aligned with pred_y
    :param positive: value that denotes the positive class
    :return: (correct positive predictions) / (positive predictions), 0 if none were predicted
    """
    predictedPositive = 0
    truePositive = 0
    for idx in range(len(pred_y)):
        # Only positions predicted as positive contribute to precision.
        if not (pred_y[idx] == positive):
            continue
        predictedPositive += 1
        if pred_y[idx] == true_y[idx]:
            truePositive += 1

    if predictedPositive > 0:
        return truePositive / predictedPositive
    return 0


def binary_recall(pred_y, true_y, positive=1):
    """
    Recall of the positive class.

    :param pred_y: predictions
    :param true_y: ground-truth labels, aligned with pred_y
    :param positive: value that denotes the positive class
    :return: (correctly predicted positives) / (actual positives), 0 if there are no positives
    """
    actualPositive = 0
    truePositive = 0
    for idx in range(len(pred_y)):
        # Only positions whose true label is positive contribute to recall.
        if not (true_y[idx] == positive):
            continue
        actualPositive += 1
        if pred_y[idx] == true_y[idx]:
            truePositive += 1

    if actualPositive > 0:
        return truePositive / actualPositive
    return 0


def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
    """
    F-beta score of the positive class.

    :param pred_y: predictions
    :param true_y: ground-truth labels, aligned with pred_y
    :param beta: weight of recall relative to precision
    :param positive: value that denotes the positive class
    :return: (1 + beta^2) * P * R / (beta^2 * P + R), or 0 when the denominator is 0
    """
    precision = binary_precision(pred_y, true_y, positive)
    recall = binary_recall(pred_y, true_y, positive)

    betaSquared = beta * beta
    denominator = betaSquared * precision + recall
    # Handle the only expected failure (zero denominator) explicitly instead of a bare
    # `except`, so unrelated errors such as a non-numeric beta are no longer swallowed.
    if denominator == 0:
        return 0
    return (1 + betaSquared) * precision * recall / denominator

def multi_precision(pred_y, true_y, labels):
    """
    Macro-averaged precision for multi-class predictions.

    :param pred_y: predictions; either flat values or a list of single-element lists
    :param true_y: ground-truth labels, aligned with pred_y
    :param labels: list of class labels to average over
    :return: mean of the per-label precision values
    """
    # Guard the peek at pred_y[0] so an empty prediction list yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    precisions = [binary_precision(pred_y, true_y, label) for label in labels]
    return mean(precisions)


def multi_recall(pred_y, true_y, labels):
    """
    Macro-averaged recall for multi-class predictions.

    :param pred_y: predictions; either flat values or a list of single-element lists
    :param true_y: ground-truth labels, aligned with pred_y
    :param labels: list of class labels to average over
    :return: mean of the per-label recall values
    """
    # Guard the peek at pred_y[0] so an empty prediction list yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    recalls = [binary_recall(pred_y, true_y, label) for label in labels]
    return mean(recalls)


def multi_f_beta(pred_y, true_y, labels, beta=1.0):
    """
    Macro-averaged F-beta score for multi-class predictions.

    :param pred_y: predictions; either flat values or a list of single-element lists
    :param true_y: ground-truth labels, aligned with pred_y
    :param labels: list of class labels to average over
    :param beta: weight of recall relative to precision
    :return: mean of the per-label F-beta values
    """
    # Guard the peek at pred_y[0] so an empty prediction list yields 0 instead of IndexError.
    if len(pred_y) > 0 and isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]

    f_betas = [binary_f_beta(pred_y, true_y, beta, label) for label in labels]
    return mean(f_betas)


def get_binary_metrics(pred_y, true_y, f_beta=1.0):
    """
    Compute all binary-classification metrics in one call.

    :param pred_y: predictions
    :param true_y: ground-truth labels, aligned with pred_y
    :param f_beta: beta value passed to the F-beta score
    :return: tuple (accuracy, recall, precision, f_beta score)
    """
    return (
        accuracy(pred_y, true_y),
        binary_recall(pred_y, true_y),
        binary_precision(pred_y, true_y),
        binary_f_beta(pred_y, true_y, f_beta),
    )


def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
    """
    Compute all multi-class metrics in one call.

    :param pred_y: predictions
    :param true_y: ground-truth labels, aligned with pred_y
    :param labels: list of class labels to average over
    :param f_beta: beta value passed to the F-beta score
    :return: tuple (accuracy, macro recall, macro precision, macro f_beta score)
    """
    return (
        accuracy(pred_y, true_y),
        multi_recall(pred_y, true_y, labels),
        multi_precision(pred_y, true_y, labels),
        multi_f_beta(pred_y, true_y, labels, f_beta),
    )

In [39]:
class BiLSTMAttention(object):
    """
    Stacked Bi-LSTM encoder followed by an attention pooling layer and a linear
    output layer (TensorFlow 1.x graph-mode API).

    Graph tensors exposed on the instance: ``inputX``, ``inputY``,
    ``dropoutKeepProb`` (placeholders), ``logits``, ``predictions``, ``loss`` and
    the attention weights ``alpha``.
    """

    def __init__(self, config, wordEmbedding):
        """
        :param config: configuration object (sequenceLength, numClasses, model.*)
        :param wordEmbedding: initial embedding matrix, one row per vocabulary entry
        """
        # Keep the config on the instance so attention() is built from the same
        # settings as the rest of the graph, not from a module-level `config`.
        self.config = config

        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        # Accumulator for the L2 penalty of the output layer.
        l2Loss = tf.constant(0.0)

        with tf.name_scope("embedding"):
            # Embedding table initialised from the pre-trained vectors.
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)

        with tf.name_scope("Bi-LSTM"):
            # One bidirectional layer per entry in hiddenSizes; each layer consumes the
            # concatenated forward/backward output of the previous one.
            for index, hiddenSize in enumerate(config.model.hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(index)):
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                    outputs_, self.current_state = tf.nn.bidirectional_dynamic_rnn(
                        lstmFwCell, lstmBwCell,
                        self.embeddedWords,
                        dtype=tf.float32,
                        scope="bi-lstm" + str(index))

                    print("before concat, self.embeddedWords shape: ", self.embeddedWords.shape)
                    print("the shape of outputs_: ", outputs_[0].shape, outputs_[1].shape)
                    self.embeddedWords = tf.concat(outputs_, 2)

                    print("after concat, self.embeddedWords shape: ", self.embeddedWords.shape)

        # Split the last layer's output back into its forward and backward halves.
        outputs = tf.split(self.embeddedWords, 2, -1)
        print("the item of outputs shape: ", outputs[0].shape, outputs[1].shape)

        # In the Bi-LSTM + Attention paper the forward and backward outputs are summed.
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]
            print("the internal hidden layer shape is: ", H.shape)

            # Attention-pooled sentence representation.
            output = self.attention(H)
            outputSize = config.model.hiddenSizes[-1]

        # Fully connected output layer.
        with tf.name_scope("output"):
            outputW = tf.get_variable("outputW", shape=[outputSize, config.numClasses],
                                      initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.0, shape=[config.numClasses]), name="outputB")

            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(output, outputW, outputB, name="logits")

            if config.numClasses == 1:
                # Binary case: a logit >= 0 is predicted as class 1.
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.float32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axis=-1, name="predictions")

        with tf.name_scope("loss"):
            if config.numClasses == 1:
                # Labels are reshaped to [batch, 1] to match the logits.
                losses = tf.nn.sigmoid_cross_entropy_with_logits(
                    logits=self.logits,
                    labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.logits,
                                                                        labels=self.inputY)
            self.loss = tf.reduce_mean(losses) + l2Loss * config.model.l2RegLambda

    def attention(self, H):
        """
        Turn the per-time-step hidden states into one sentence vector using attention.

        :param H: summed forward/backward outputs of the last Bi-LSTM layer; the
                  reshapes below treat it as [batch, sequenceLength, hiddenSizes[-1]]
        :return: sentence representation of shape [batch, hiddenSizes[-1]] after dropout
        """
        cfg = self.config
        hiddenSize = cfg.model.hiddenSizes[-1]
        seqLen = cfg.sequenceLength

        # Trainable attention vector.
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

        # Non-linear transform of the Bi-LSTM output.
        M = tf.tanh(H)

        # Flatten M to [batch * time, hidden] and project every time step onto W, which
        # gives one scalar score per time step; then restore the [batch, time] layout.
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))
        restoreM = tf.reshape(newM, [-1, seqLen])
        self.alpha = tf.nn.softmax(restoreM)

        # Weighted sum of H over time using the attention weights alpha.
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, [-1, seqLen, 1]))
        sequeezeR = tf.reshape(r, [-1, hiddenSize])
        sentence_presentation = tf.tanh(sequeezeR)
        output = tf.nn.dropout(sentence_presentation, self.dropoutKeepProb)

        return output
    

In [40]:
# Pull the prepared arrays, the embedding matrix and the label ids out of the dataset
# so the training cell can refer to them by short module-level names.
trainReviews, trainLabels = data.trainReviews, data.trainLabels
evalReviews, evalLabels = data.evalReviews, data.evalLabels
wordEmbedding, labelList = data.wordEmbedding, data.labelList

In [41]:
# Build the model in a fresh graph, then train it and periodically evaluate on the eval split.
# NOTE(review): `sess` is never closed in the visible code — confirm whether that is intended.
with tf.Graph().as_default():
    sess_config = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=sess_config)
    with sess.as_default():
        lstm = BiLSTMAttention(config, wordEmbedding)
        # Step counter incremented by the optimizer on every apply_gradients call.
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        gradsAndVars = optimizer.compute_gradients(lstm.loss)
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)
        
    sess.run(tf.global_variables_initializer())
    
    def trainStep(batchX, batchY):
        """
        Run one optimisation step on a batch with the configured dropout keep probability.

        :return: loss, accuracy, precision, recall, f_beta for this batch
        """
        feed_dict = {
            lstm.inputX: batchX, 
            lstm.inputY: batchY, 
            lstm.dropoutKeepProb: config.model.dropoutKeepProb
        }
        _, step, loss, predictions = sess.run([trainOp, globalStep, lstm.loss, lstm.predictions], feed_dict)
        # timeStr and step are computed but not used further in this function.
        timeStr = datetime.datetime.now().isoformat()
        
        if config.numClasses == 1:
            acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
        elif config.numClasses > 1:
            acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)
        
        return loss, acc, prec, recall, f_beta
    
    
    def devStep(batchX, batchY):
        """
        Evaluate a batch without updating weights; dropout is disabled (keep probability 1.0).

        :return: loss, accuracy, precision, recall, f_beta for this batch
        """
        feed_dict = {
            lstm.inputX: batchX, 
            lstm.inputY: batchY, 
            lstm.dropoutKeepProb: 1.0
        }
        step, loss, predictions = sess.run([globalStep, lstm.loss, lstm.predictions], feed_dict)
        if config.numClasses == 1:
            acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
        elif config.numClasses > 1:
            acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)
        
        return loss, acc, prec, recall, f_beta
    
    
    for index in range(config.training.epochs):
        print("start to train model, {} epoch / epochs {}".format(str(index), str(config.training.epochs)))
        for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
            loss, acc, prec, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])
            currentStep = tf.train.global_step(sess, globalStep)
            print("train: step: {}, loss: {}, acc: {}, recall: {}, prec: {}, f_beta: {}".format(
                currentStep, loss, acc, recall, prec, f_beta))
            
            # Every `evaluateEvery` steps, average the metrics over all eval batches.
            if currentStep % config.training.evaluateEvery == 0:
                print("start to evaluate...")
                losses, accs, precisions, recalls, f_betas = [], [], [], [], []
                for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                    # These assignments overwrite the training-step values of loss/acc/recall/f_beta.
                    loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                    losses.append(loss)
                    accs.append(acc)
                    precisions.append(precision)
                    recalls.append(recall)
                    f_betas.append(f_beta)
                
                time_str = datetime.datetime.now().isoformat()
                print("{}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}, f_beta: {}".format(
                    time_str, currentStep, mean(losses), mean(accs), mean(precisions), mean(recalls), mean(f_betas)))

before concat, self.embeddedWords shape:  (?, 200, 200)
the shape of outputs_:  (?, 200, 256) (?, 200, 256)
after concat, self.embeddedWords shape:  (?, 200, 512)
before concat, self.embeddedWords shape:  (?, 200, 512)
the shape of outputs_:  (?, 200, 128) (?, 200, 128)
after concat, self.embeddedWords shape:  (?, 200, 256)
the item of outputs shape:  (?, 200, 128) (?, 200, 128)
the internal hidden layer shape is:  (?, 200, 128)
start to train model, 0 epoch / epochs 4
train: step: 1, loss: 0.7066668272018433, acc: 0.4609375, recall: 0.46296296296296297, prec: 0.38461538461538464, f_beta: 0.4201680672268908
train: step: 2, loss: 1.3528417348861694, acc: 0.546875, recall: 0.0, prec: 0, f_beta: 0
train: step: 3, loss: 0.7422524094581604, acc: 0.484375, recall: 0.03278688524590164, prec: 0.2222222222222222, f_beta: 0.05714285714285715
train: step: 4, loss: 0.8137862682342529, acc: 0.5625, recall: 1.0, prec: 0.5625, f_beta: 0.72
train: step: 5, loss: 0.8296093940734863, acc: 0.5, recall: 1