In [1]:
import os
import time
import datetime
import json
import csv
from math import sqrt
import warnings
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

# Silence every Python warning from this point on.
# NOTE(review): the filter is installed after the imports above have already run,
# so it cannot hide warnings raised while those modules are being imported.
warnings.filterwarnings("ignore")

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
# Locate the project's data directory: the "data" folder inside the parent of the
# current working directory. os.path is used (rather than splitting the path on
# "\\") so that the parent directory is resolved correctly on every platform;
# on Windows the resulting string is the same as before.
directory_path = os.path.join(os.path.dirname(os.getcwd()), "data")
os.path.exists(directory_path)

True

In [3]:
# Display the resolved data directory as the cell output.
directory_path

'C:\\Users\\123\\Documents\\python_experence\\nlp_model\\data'

In [4]:
# Hyper-parameters and settings used by the model

class TrainingConfig:
    """Settings that drive the training loop."""

    # Step size handed to the optimizer.
    learningRate = 0.001

    # Number of passes over the training set.
    epochs = 10

    # Run an evaluation pass every this many training steps.
    evaluateEvery = 100

    # Checkpoint interval in steps (not read by any code visible in this notebook).
    checkpointEvery = 100


class ModelConfig:
    """Architecture hyper-parameters of the character-level CNN."""

    # One [number of filters, kernel size, pooling size] triple per convolution layer.
    convLayers = [
        [256, 7, 4],
        [256, 7, 4],
        [256, 3, 4],
    ]

    # Number of units in each fully connected layer.
    fcLayers = [512]

    # Keep probability fed to dropout during training.
    dropoutKeepProb = 0.5

    # Batch-normalisation constants: the variance epsilon and the decay of the
    # moving averages of mean / variance.
    epsilon = 0.001
    decay = 0.999

    
class Config(object):
    """Top-level configuration: data settings plus the nested training / model configs.

    The class body runs at definition time, so a missing data file is reported
    as soon as this cell is executed.
    """

    # Characters that get their own one-hot vector; everything else maps to "UNK".
    alphabet = "abcdefghijklmnopqrstuvwxyz0123456789-,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"

    # Fixed number of characters every review is truncated / padded to.
    sequenceLength = 1014
    batchSize = 128

    # Fraction of the samples used for training; the rest is used for evaluation.
    rate = 0.8

    # Built with os.path.join instead of hard-coded "\\" separators; on Windows the
    # resulting path is identical to the previous string concatenation.
    dataSource = os.path.join(directory_path, "preProcess", "labeledCharTrain.csv")

    assert os.path.exists(dataSource), "training data not found: {}".format(dataSource)
    training = TrainingConfig()
    model = ModelConfig()


config = Config()

In [5]:
# Data preparation: builds the training and evaluation sets

class Dataset(object):
    """Reads the labelled reviews and converts them to index-encoded arrays.

    After ``dataGen()`` has run, the instance exposes:
        trainReviews / evalReviews: int64 arrays of shape (n, sequenceLength)
            holding one character index per position (0 = "pad").
        trainLabels / evalLabels: float32 arrays of shape (n, 1).
        charEmbedding: float32 matrix with one row per vocabulary entry
            ("pad" row is all zeros, every other row is one-hot).
    """

    def __init__(self, config):
        """Copy the settings that are needed from ``config``.

        Reads ``dataSource``, ``sequenceLength``, ``rate`` and ``alphabet``.
        """
        self._dataSource = config.dataSource
        self._sequenceLength = config.sequenceLength
        self._rate = config.rate

        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []

        self._alphabet = config.alphabet
        self.charEmbedding = None
        self._charToIndex = dict()
        self._indexToChar = dict()

    def _readData(self, filepath):
        """Load the CSV at ``filepath`` and split every review into characters.

        The file must provide a "review" and a "sentiment" column. Space
        characters are dropped from the reviews.

        Returns:
            (reviews, labels): ``reviews`` is a list of character lists,
            ``labels`` the list of values from the "sentiment" column.
        """
        df = pd.read_csv(filepath)
        labels = df["sentiment"].tolist()
        review = df["review"].tolist()
        reviews = [[char for char in line if char != " "] for line in review]

        return reviews, labels

    def _reviewProcess(self, review, sequenceLength, charToIndex):
        """Encode one review as a fixed-length vector of character indices.

        Reviews longer than ``sequenceLength`` are truncated; shorter ones are
        right-padded with 0, the index of "pad". Characters missing from
        ``charToIndex`` are replaced by the index of "UNK".

        Returns:
            int64 array of shape (sequenceLength,). Integer storage is used
            directly because the values are indices and are consumed as int64.
        """
        reviewVec = np.zeros(sequenceLength, dtype="int64")

        for index, char in enumerate(review[:sequenceLength]):
            if char in charToIndex:
                reviewVec[index] = charToIndex[char]
            else:
                reviewVec[index] = charToIndex["UNK"]

        return reviewVec

    def _genTrainEvalData(self, x, y, rate):
        """Encode all reviews and split them into a training and an evaluation part.

        The first ``int(len(x) * rate)`` samples form the training set, the
        remaining ones the evaluation set.
        NOTE(review): the split is positional and nothing is shuffled first;
        confirm that the rows of the source file are not ordered by label.

        Returns:
            (trainReviews, trainLabels, evalReviews, evalLabels): reviews as
            int64 arrays, labels as float32 arrays with one column.
        """
        reviews = [
            self._reviewProcess(x[index], self._sequenceLength, self._charToIndex)
            for index in range(len(x))
        ]
        labels = [[y[index]] for index in range(len(x))]

        trainIndex = int(len(x) * rate)
        trainReviews = np.asarray(reviews[:trainIndex], dtype="int64")
        trainLabels = np.asarray(labels[:trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex:], dtype="int64")
        evalLabels = np.asarray(labels[trainIndex:], dtype="float32")

        return trainReviews, trainLabels, evalReviews, evalLabels

    def _getCharEmbedding(self, chars):
        """Build the vocabulary and its one-hot embedding matrix.

        The vocabulary is ``["pad", "UNK"]`` followed by every character of the
        configured alphabet. ``chars`` is accepted for interface compatibility
        but is not used; the alphabet stored on the instance is the source.

        Returns:
            (vocab, charEmbedding): ``charEmbedding`` is a float32 array of
            shape (len(vocab), len(vocab) - 1); row 0 ("pad") is all zeros and
            row ``i + 1`` is the one-hot vector with a 1 at position ``i``.
        """
        alphabet = ["UNK"] + list(self._alphabet)
        vocab = ["pad"] + alphabet

        padRow = np.zeros((1, len(alphabet)), dtype="float32")
        oneHotRows = np.eye(len(alphabet), dtype="float32")

        return vocab, np.vstack([padRow, oneHotRows])

    def _genVocabulary(self, reviews):
        """Create the embedding matrix and the char <-> index lookup tables.

        Both lookup tables are also written as JSON into the "charJson" folder
        under the module-level ``directory_path``; the folder is created when it
        does not exist yet. ``reviews`` is not used.
        """
        chars = list(self._alphabet)
        vocab, charEmbedding = self._getCharEmbedding(chars)
        self.charEmbedding = charEmbedding

        # A character that occurs more than once in the alphabet keeps the index
        # of its last occurrence in charToIndex.
        self._charToIndex = dict(zip(vocab, range(len(vocab))))
        self._indexToChar = dict(enumerate(vocab))

        # Persist both mappings. os.path.join / os.makedirs replace the
        # hard-coded "\\" separators and the assumption that the folder exists.
        jsonDir = os.path.join(directory_path, "charJson")
        os.makedirs(jsonDir, exist_ok=True)

        with open(os.path.join(jsonDir, "charToIndex.json"), "w", encoding="utf-8") as f:
            json.dump(self._charToIndex, f)

        with open(os.path.join(jsonDir, "indexToChar.json"), "w", encoding="utf-8") as f:
            json.dump(self._indexToChar, f)

    def dataGen(self):
        """Run the whole pipeline: read the CSV, build the vocabulary, encode and split."""
        reviews, labels = self._readData(self._dataSource)
        self._genVocabulary(reviews)
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviews, labels, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels

        
# Build the dataset from the configured source and populate its train / eval arrays.
data = Dataset(config)
data.dataGen()

In [6]:
# Report the shapes of the arrays produced by the train / eval split.
print("the training data shape: ", data.trainReviews.shape)
print("the training label shape: ", data.trainLabels.shape)
print("the evaluate data shape: ", data.evalReviews.shape)
print("the evaluate label shape: ", data.evalLabels.shape)

the training data shape:  (20000, 1014)
the training label shape:  (20000, 1)
the evaluate data shape:  (5000, 1014)
the evaluate label shape:  (5000, 1)


In [7]:
# Shape of the embedding matrix: (vocabulary size incl. "pad", one-hot vector length).
data.charEmbedding.shape

(71, 70)

In [8]:
# Mini-batch generator
def nextBatch(x, y, batchSize, keepRemainder=False):
    """Yield shuffled mini-batches of ``(x, y)``.

    A fresh random permutation (from NumPy's global random state) is applied to
    both arrays on every call, so samples and labels stay aligned.

    Args:
        x: NumPy array of samples, indexed along the first axis.
        y: NumPy array of labels with the same length as ``x``.
        batchSize: number of samples per batch.
        keepRemainder: when False (the default, and the previous behaviour) the
            trailing samples that do not fill a whole batch are dropped; when
            True they are yielded as one final, smaller batch so that every
            sample is visited exactly once.

    Yields:
        (batchX, batchY): ``batchX`` as int64, ``batchY`` as float32.
    """
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]

    numSamples = len(x)
    if keepRemainder:
        stop = numSamples
    else:
        # Only whole batches: stop at the last multiple of batchSize.
        stop = (numSamples // batchSize) * batchSize

    for start in range(0, stop, batchSize):
        end = start + batchSize
        batchX = np.array(x[start: end], dtype="int64")
        batchY = np.array(y[start: end], dtype="float32")

        yield batchX, batchY

In [9]:
# Character-level CNN classifier

class CharCNN(object):
    """Character-level convolutional network for binary classification (TF1 graph API).

    Creating an instance adds the complete graph to the current default graph:
    embedding lookup -> stacked convolution / max-pool layers -> fully connected
    layers with dropout -> a single-logit output layer -> loss.

    Attributes:
        inputX: int32 placeholder [batch, config.sequenceLength] of character indices.
        inputY: float32 placeholder [batch, 1] of labels.
        dropoutKeepProb: float32 placeholder, keep probability for dropout.
        isTraining: bool placeholder; only read by ``_batchNorm``, which
            ``__init__`` does not call.
        predictions: the raw logits of the output layer.
        binaryPreds: 1.0 where the logit is >= 0, else 0.0.
        loss: sigmoid cross-entropy summed (not averaged) over the batch.
    """

    def __init__(self, config, charEmbedding):
        """Build the graph.

        Args:
            config: object providing ``sequenceLength`` and ``model`` (with
                ``convLayers``, ``fcLayers``, ``epsilon`` and ``decay``).
            charEmbedding: matrix used as the initial value of the embedding
                variable, one row per character index.
        """
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.float32, [None, 1], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")

        self.isTraining = tf.placeholder(tf.bool, name="isTraining")
        self.epsilon = config.model.epsilon
        self.decay = config.model.decay

        # Character embedding
        with tf.name_scope("embedding"):
            # The one-hot character vectors are the initial embedding matrix.
            self.W = tf.Variable(tf.cast(charEmbedding, dtype=tf.float32, name="charEmbedding"), name="W")

            # Look up the vector of every character in the input.
            self.embededChars = tf.nn.embedding_lookup(self.W, self.inputX)

            # Append a channel dimension so the tensor can be fed to conv2d.
            self.embededCharsExpand = tf.expand_dims(self.embededChars, -1)

        for index, current_layer in enumerate(config.model.convLayers):
            print("the {0} conv layer to process".format(index + 1))
            with tf.name_scope("convLayer-%s"%(index + 1)):
                # Size of the third axis of the current input. The kernel spans
                # this axis completely, so the convolution only slides along
                # the sequence axis.
                filterWidth = self.embededCharsExpand.get_shape()[2].value

                # filterShape = [height, width, in_channels, out_channels]
                filterShape = [current_layer[1], filterWidth, 1, current_layer[0]]
                stdv = 1 / sqrt(current_layer[0] * current_layer[1])

                # Weights and bias are drawn uniformly from [-stdv, stdv).
                wConv = tf.Variable(tf.random_uniform(filterShape, minval=-stdv, maxval=stdv), dtype="float32", name="w")
                bConv = tf.Variable(tf.random_uniform(shape=[current_layer[0]], minval=-stdv, maxval=stdv), name="b")

                # Convolution + bias + ReLU
                conv = tf.nn.conv2d(self.embededCharsExpand, wConv, strides=[1, 1, 1, 1], padding="VALID", name="conv")
                hConv = tf.nn.bias_add(conv, bConv)
                hConv = tf.nn.relu(hConv)

                # Optional non-overlapping max-pooling along the sequence axis.
                if current_layer[-1] is not None:
                    ksizeShape = [1, current_layer[-1], 1, 1]
                    hPool = tf.nn.max_pool(hConv, ksize=ksizeShape, strides=ksizeShape, padding="VALID", name="pool")
                else:
                    hPool = hConv

                print("the final hPool shape is: ", hPool.shape)

                # Swap the last two axes so the output channels take the place
                # of the embedding axis and the result has the layout the next
                # convolution layer expects: [batch, length, features, 1].
                self.embededCharsExpand = tf.transpose(hPool, [0, 1, 3, 2], name="transpose")

        print("the current expand embedding is: ", self.embededCharsExpand, self.embededCharsExpand.shape)
        print("the shape of embedding is: ", self.embededChars.shape)

        # Flatten the convolution output to [batch, length * features].
        with tf.name_scope("reshape"):
            fcDim = self.embededCharsExpand.get_shape()[1].value * self.embededCharsExpand.get_shape()[2].value
            self.inputReshape = tf.reshape(self.embededCharsExpand, [-1, fcDim])
            print("the reshape of input: ", self.inputReshape.shape)

        # Unit counts per stage: [flattened conv output size] + fcLayers
        # (the notebook's recorded run printed [3584, 512]).
        weights = [fcDim] + config.model.fcLayers
        print("the weights are: ", weights)

        for index, fc_layer in enumerate(config.model.fcLayers):
            with tf.name_scope("fcLayer-%s"%(index + 1)):
                print("begin process {}-th layer".format(index + 1))
                stdv = 1 / sqrt(weights[index])

                wFc = tf.Variable(tf.random_uniform([weights[index], fc_layer], minval=-stdv, maxval=stdv), dtype="float32", name="w")
                bFc = tf.Variable(tf.random_uniform(shape=[fc_layer], minval=-stdv, maxval=stdv), dtype="float32", name="b")

                self.fcInput = tf.nn.relu(tf.matmul(self.inputReshape, wFc) + bFc)

                with tf.name_scope("dropOut"):
                    self.fcInputDrop = tf.nn.dropout(self.fcInput, self.dropoutKeepProb)

            # The dropped-out activations feed the next layer.
            self.inputReshape = self.fcInputDrop

        with tf.name_scope("outputLayer"):
            stdv = 1 / sqrt(weights[-1])

            wOut = tf.Variable(tf.random_uniform([config.model.fcLayers[-1], 1], minval=-stdv, maxval=stdv), dtype="float32", name="w")
            bOut = tf.Variable(tf.random_uniform(shape=[1], minval=-stdv, maxval=stdv), name="b")

            # `predictions` are logits; a logit >= 0 corresponds to the positive class.
            self.predictions = tf.nn.xw_plus_b(self.inputReshape, wOut, bOut,name="predictions")
            self.binaryPreds = tf.cast(tf.greater_equal(self.predictions, 0.0), tf.float32, name="binaryPreds")

        with tf.name_scope("loss"):
            losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
            # Summed over the batch, so the reported value scales with the batch size.
            self.loss =tf.reduce_sum(losses)

    def _batchNorm(self, x):
        """Apply batch normalisation followed by ReLU to ``x``.

        Not called by ``__init__``. Every call creates new scale / offset
        variables and new moving-average variables, and overwrites
        ``self.popMean`` / ``self.popVariance`` with the latter.

        Args:
            x: tensor with a statically known fourth axis; statistics are
                computed over axes 0, 1 and 2.

        Returns:
            ReLU of the normalised tensor. When ``isTraining`` is fed True the
            batch statistics are used and the moving averages are updated with
            ``self.decay``; otherwise the moving averages are used.
        """
        print("the shape of x: ", x.get_shape())
        gamma = tf.Variable(tf.ones([x.get_shape()[3].value]))
        beta = tf.Variable(tf.zeros([x.get_shape()[3].value]))
        self.popMean = tf.Variable(tf.zeros([x.get_shape()[3].value]), trainable=False, name="popMean")
        self.popVariance = tf.Variable(tf.ones([x.get_shape()[3].value]), trainable=False, name="popVariance")

        def batchNormTraining():
            batchMean, batchVariance = tf.nn.moments(x, [0, 1, 2])
            trainMean = tf.assign(self.popMean, self.popMean * self.decay + batchMean * (1 - self.decay))
            trainVariance = tf.assign(self.popVariance, self.popVariance * self.decay + batchVariance * (1 - self.decay))

            # Make sure the moving averages are updated whenever the
            # normalised output is evaluated.
            with tf.control_dependencies([trainMean, trainVariance]):
                return tf.nn.batch_normalization(x, batchMean, batchVariance, beta, gamma, self.epsilon)

        def batchNormInference():
            return tf.nn.batch_normalization(x, self.popMean, self.popVariance, beta, gamma, self.epsilon)

        batchNormalizedOutput = tf.cond(self.isTraining, batchNormTraining, batchNormInference)

        return tf.nn.relu(batchNormalizedOutput)
    

In [16]:
def mean(item):
    """Return the arithmetic mean of the sized collection ``item``.

    An empty collection yields NaN instead of raising ZeroDivisionError, so that
    reporting metrics over zero batches does not abort the run.
    """
    if len(item) == 0:
        return float("nan")
    return sum(item) / len(item)

def getMetrics(trueY, predY, binaryPredY):
    """Compute accuracy, AUC, precision and recall for one set of predictions.

    Args:
        trueY: ground-truth labels.
        predY: continuous scores, used for the AUC.
        binaryPredY: hard 0/1 predictions, used for accuracy, precision and recall.

    Returns:
        (accuracy, auc, precision, recall), each rounded to 4 decimals.
        Precision and recall are macro-averaged.
    """
    aucScore = roc_auc_score(trueY, predY)
    accScore = accuracy_score(trueY, binaryPredY)
    precisionScore = precision_score(trueY, binaryPredY, average="macro")
    recallScore = recall_score(trueY, binaryPredY, average="macro")

    ordered = (accScore, aucScore, precisionScore, recallScore)
    return tuple(round(score, 4) for score in ordered)

In [18]:
# Model training: build the graph, then alternate training steps with periodic
# evaluation passes over the evaluation set.

trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

charEmbedding = data.charEmbedding

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)

    # NOTE(review): the session is not closed in this cell, so it stays usable
    # after training has finished.
    sess = tf.Session(config=session_conf)

    with sess.as_default():
        cnn = CharCNN(config, charEmbedding)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)

        optimizer = tf.train.RMSPropOptimizer(config.training.learningRate)
        gradsAndVars = optimizer.compute_gradients(cnn.loss)
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # TensorBoard summaries: a histogram and the fraction of zeros for the
        # gradient of every variable that receives one.
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))

        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("write to {} \n".format(outDir))

        lossSummary = tf.summary.scalar("trainLoss", cnn.loss)
        # merge_all() collects every summary registered above into a single op.
        summaryOp = tf.summary.merge_all()
        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)

        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)

        # Create the saver and initialise all variables.
        # NOTE(review): the saver is created but no checkpoint is written in this cell.
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        sess.run(tf.global_variables_initializer())

        def trainStep(batchX, batchY):
            """Run one optimisation step on a batch, print its metrics and log the summary."""
            feed_dict = {
                cnn.inputX: batchX,
                cnn.inputY: batchY,
                cnn.dropoutKeepProb: config.model.dropoutKeepProb,
                cnn.isTraining: True
            }

            _, summary, step, loss, predictions, binaryPreds = sess.run([trainOp,
                                                                        summaryOp,
                                                                        globalStep,
                                                                        cnn.loss,
                                                                        cnn.predictions,
                                                                        cnn.binaryPreds], feed_dict)
            timeStr = datetime.datetime.now().isoformat()
            acc, auc, precision, recall = getMetrics(batchY, predictions, binaryPreds)
            print("{}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".format(
                timeStr, step, loss, acc, auc, precision, recall))
            trainSummaryWriter.add_summary(summary, step)

        def devStep(batchX, batchY):
            """Evaluate one batch without updating the model (dropout disabled).

            Returns:
                (loss, acc, auc, precision, recall) for the batch.
            """
            feed_dict = {
                cnn.inputX: batchX,
                cnn.inputY: batchY,
                cnn.dropoutKeepProb: 1.0,
                cnn.isTraining: False
            }
            summary, step, loss, predictions, binaryPreds = sess.run([
                    summaryOp, globalStep, cnn.loss, cnn.predictions, cnn.binaryPreds
                ], feed_dict)
            acc, auc, precision, recall = getMetrics(batchY, predictions, binaryPreds)
            evalSummaryWriter.add_summary(summary, step)

            return loss, acc, auc, precision, recall

        for index in range(config.training.epochs):
            print("start to train models...")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                trainStep(batchTrain[0], batchTrain[1])
                currentStep = tf.train.global_step(sess, globalStep)

                # Every `evaluateEvery` steps: run the evaluation set batch by
                # batch and report the per-batch metrics averaged over all batches.
                if currentStep % config.training.evaluateEvery == 0:
                    print("\n Evaluation: ")
                    losses, accs, aucs, precisions, recalls = [], [], [], [], []
                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, auc, precision, recall = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        aucs.append(auc)
                        precisions.append(precision)
                        recalls.append(recall)

                    timeStr = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, auc: {}, precision: {}, recall: {}".format(
                        timeStr, currentStep, mean(losses), mean(accs), mean(aucs), mean(precisions), mean(recalls)))

        # Write any summary events that are still buffered to disk now that
        # training is done.
        trainSummaryWriter.flush()
        evalSummaryWriter.flush()
                    
                
                

the 1 conv layer to process
the final hPool shape is:  (?, 252, 1, 256)
the 2 conv layer to process
the final hPool shape is:  (?, 61, 1, 256)
the 3 conv layer to process
the final hPool shape is:  (?, 14, 1, 256)
the current expand embedding is:  Tensor("convLayer-3/transpose:0", shape=(?, 14, 256, 1), dtype=float32) (?, 14, 256, 1)
the shape of embedding is:  (?, 1014, 70)
the reshape of input:  (?, 3584)
the weights are:  [3584, 512]
begin process 1-th layer
write to C:\Users\123\Documents\python_experence\nlp_model\model_code\summarys 

start to train models...
2019-09-23T09:54:50.397980, step: 1, loss: 88.50857543945312, acc: 0.5625, auc: 0.5972, precision: 0.7795, recall: 0.5088
2019-09-23T09:54:52.275045, step: 2, loss: 88.8498306274414, acc: 0.4844, auc: 0.4321, precision: 0.2422, recall: 0.5
2019-09-23T09:54:54.071244, step: 3, loss: 88.65156555175781, acc: 0.5234, auc: 0.5243, precision: 0.2617, recall: 0.5
2019-09-23T09:54:55.871427, step: 4, loss: 88.3875732421875, acc: 0.6