In [1]:
import os, csv, time, datetime, random, json
from collections import Counter
from math import sqrt
import gensim
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [2]:
directory_path = "\\".join(os.getcwd().split("\\")[:-1]) + "\\data"
os.path.exists(directory_path)

True

In [3]:
# 超参配置
class TrainingConfig(object):
    epochs = 5
    evaluateEvery = 100
    checkpointEvery = 100
    learningRate= 0.001


class ModelConfig(object):
    embeddingSize = 200
    numFilters = 128
    filterSizes = [2, 3, 4, 5]
    dropoutKeepProb = 0.5
    l2RegLambda = 0.0

    
class Config(object):
    sequenceLength = 200
    batchSize = 128
    dataSource = directory_path + "\\preProcess\\labeledTrain.csv"
    
    stopWordSource = directory_path + "\\english"
    
    # 二分类时设置为 1 ，对分类时设置为其他数字
    numClasses = 1
    rate = 0.8         # 训练集占比
    training = TrainingConfig()
    model = ModelConfig()


config = Config()

In [4]:
config

<__main__.Config at 0x1eeef8b5358>

In [20]:
class Dataset(object):
    def __init__(self, config):
        self.config = config
        self._dataSource = config.dataSource
        self._stopWordSource = config.stopWordSource
        self._sequenceLength = config.sequenceLength
        
        
        self._embeddingSize = config.model.embeddingSize
        self._batchSize = config.batchSize
        self._rate = config.rate
        
        
        self._stopWordDict = dict()
        self.trainReviews = []
        self.trainLabels = []
        self.evalReviews = []
        self.evalLabels = []
        self.wordEmbedding = None
        self.labelList = []
        
    def _readData(self, file_path):
        df = pd.read_csv(file_path)
        if self.config.numClasses == 1:
            labels = df["sentiment"].tolist()
        elif self.config.numClasses > 1:
            labels = df["rate"].tolist()
        
        review = df["review"].tolist()
        reviews = [line.strip().split() for line in review]
        
        return reviews, labels
    
    def _labelToIndex(self, labels, label2idx):
        """
            本函数旨在将类别标签转换成索引号表示
        """
        labelIds = [label2idx[label] for label in labels]
        
        return labelIds
    
    def _wordToIndex(self, reviews, word2idx):
        """
            将词汇转换成为索引
        """
        reviewIds = [[word2idx.get(item, word2idx["UNK"]) for item in review] for review in reviews]
        return reviewIds
    
    
    def _genTrainEvalData(self, x, y, word2idx, rate):
        reviews = []
        for review in x:
            if len(review) >= self._sequenceLength:
                # contruncate
                reviews.append(review[: self._sequenceLength])
            else:
                # PADDING
                reviews.append(review + [word2idx["PAD"]] * (self._sequenceLength - len(review)))
        
        trainIndex = int(len(x) * rate)

        
        trainReviews = np.asarray(reviews[: trainIndex], dtype="int64")
        trainLabels = np.array(y[: trainIndex], dtype="float32")
        evalReviews = np.asarray(reviews[trainIndex :], dtype="int64")
        evalLabels = np.array(y[trainIndex : ], dtype="float32")
        
        
        return trainReviews, trainLabels, evalReviews, evalLabels
    
    
    def _readStopWord(self, stop_word_path):
        with open(stop_word_path, "r") as f:
            stop_words = f.read()
            stop_word_list = stop_words.splitlines()
            self.stopWordDict = dict(zip(stop_word_list, list(range(len(stop_word_list)))))
    
    
    
    def _getWordEmbedding(self, words):
        wordVec = gensim.models.KeyedVectors.load_word2vec_format(directory_path + "\\word2vec\\word2Vec.bin", binary=True)
        vocab = []
        wordEmbedding = []
        
        # 新增 "UNK" 和 "PAD"
        vocab.append("PAD")
        vocab.append("UNK")
        
        # “PAD”为全0，"UNK"为随机初始化值
        wordEmbedding.append(np.zeros(self._embeddingSize))
        wordEmbedding.append(np.random.randn(self._embeddingSize))
        for word in words:
            try:
                vector = wordVec.wv[word]
                vocab.append(word)
                wordEmbedding.append(vector)
            except:
                print("{} is not exist in vocabulary...".format(word))
        
        
        return vocab, np.array(wordEmbedding)
    
    
    def _genVocabulary(self, reviews, labels):
        """
            生成词汇向量、以及建立 词汇-索引 映射字典
        """
        # 去掉停用词哦
        all_words = [word for review in reviews for word in review]
        subWords = [word for word in all_words if word not in self.stopWordDict]
        
        wordCount = Counter(subWords)
        sortWordCount = sorted(wordCount.items(), key=lambda x: x[1], reverse=True)
        words = [item[0] for item in sortWordCount if item[1] >= 5]     # 过滤低频词汇
        
        vocab, wordEmbedding = self._getWordEmbedding(words)
        self.wordEmbedding = wordEmbedding
        word2idx = dict(zip(vocab, list(range(len(vocab)))))
        
        uniqueLabel = list(set(labels))
        label2idx = dict(zip(uniqueLabel, list(range(len(uniqueLabel)))))
        self.labelList = list(range(len(uniqueLabel)))
        
        with open(directory_path + "\\wordJson\\word2idx.json", "w", encoding="utf-8") as f:
            json.dump(word2idx, f)
        
        with open(directory_path + "\\wordJson\\label2idx.json", "w", encoding="utf-8") as f:
            json.dump(label2idx, f)
        
        return word2idx, label2idx
    
    
    def dataGen(self):
        self._readStopWord(self._stopWordSource)
        reviews, labels = self._readData(self._dataSource)
        
        word2idx, label2idx = self._genVocabulary(reviews, labels)
        
        # 将标签与句子数据化
        labelIds = self._labelToIndex(labels, label2idx)
        reviewIds = self._wordToIndex(reviews, word2idx)
        
        trainReviews, trainLabels, evalReviews, evalLabels = self._genTrainEvalData(reviewIds, labelIds, word2idx, self._rate)
        self.trainReviews = trainReviews
        self.trainLabels = trainLabels
        self.evalReviews = evalReviews
        self.evalLabels = evalLabels
        
    
data = Dataset(config)
data.dataGen()



In [21]:
print("the training data shape: ", data.trainReviews.shape)
print("the training label shape: ", data.trainLabels.shape)
print("the evaluate data shape: ", data.evalReviews.shape)
print("the evaluate label shape: ", data.evalLabels.shape)

the training data shape:  (20000, 200)
the training label shape:  (20000,)
the evaluate data shape:  (5000, 200)
the evaluate label shape:  (5000,)


In [25]:
def nextBatch(x, y, batchSize):
    perm = np.arange(len(x))
    np.random.shuffle(perm)
    x = x[perm]
    y = y[perm]
    
    num_batches = len(x) // batchSize
    for index in range(num_batches):
        start = index * batchSize
        end = start + batchSize
        batchX = np.array(x[start : end], dtype="int64")
        batchY = np.array(y[start : end], dtype="float32")
    
        yield batchX, batchY


In [41]:
class TextCNN(object):
    def __init__(self, config, wordEmbedding):
        self.inputX = tf.placeholder(tf.int32, [None, config.sequenceLength], name="inputX")
        self.inputY = tf.placeholder(tf.int32, [None], name="inputY")
        self.dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")
        
        l2Loss = tf.constant(0.0)
        with tf.name_scope("embedding"):
            self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            self.embeddedWords = tf.nn.embedding_lookup(self.W, self.inputX)
            self.embeddedWordsExpanded = tf.expand_dims(self.embeddedWords, -1)
        
        pooledOutputs = []
        # 卷积池化层：有三种 size 大小的filter，分别为3、4、5；TextCNN是个多通道的单层卷积model，可当作是三个单层的卷积model的融合
        for index, filterSize in enumerate(config.model.filterSizes):
            with tf.name_scope("conv-maxpool-%s" % filterSize):
                # 卷积层，size为filterSize * embeddingSize
                filterShape = [filterSize, config.model.embeddingSize, 1, config.model.numFilters]
                W = tf.Variable(tf.truncated_normal(filterShape, stddev=0.1), name="W")
                b = tf.Variable(tf.constant(0.1, shape=[config.model.numFilters]), name="b")
                
                conv = tf.nn.conv2d(self.embeddedWordsExpanded, 
                                   W, 
                                   strides=[1, 1, 1, 1], 
                                   padding="VALID", 
                                   name="conv")
                h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
                
                pooled = tf.nn.max_pool(h, 
                                       ksize=[1, config.sequenceLength - filterSize + 1, 1, 1], 
                                       strides=[1, 1, 1, 1], 
                                       padding="VALID", 
                                       name="pool")
                pooledOutputs.append(pooled)
        
        #获取到CNN model的输出长度
        numFiltersTotal= config.model.numFilters * len(config.model.filterSizes)
        # 池化后维度不变、按照最后的维度进行concat
        self.hPool = tf.concat(pooledOutputs, 3)
        # 展开成二维的，然后送入全连接层
        self.hPoolFlat = tf.reshape(self.hPool, [-1, numFiltersTotal])
        
        with tf.name_scope("dropout"):
            self.hDrop = tf.nn.dropout(self.hPoolFlat, self.dropoutKeepProb)
        
        with tf.name_scope("output"):
            outputW = tf.get_variable("outputW", shape=[numFiltersTotal, config.numClasses], 
                                     initializer=tf.contrib.layers.xavier_initializer())
            outputB = tf.Variable(tf.constant(0.1, shape=[config.numClasses]), name="outputB")
            
            l2Loss += tf.nn.l2_loss(outputW)
            l2Loss += tf.nn.l2_loss(outputB)
            self.logits = tf.nn.xw_plus_b(self.hDrop, outputW, outputB, name="logits")
            if config.numClasses == 1:
                self.predictions = tf.cast(tf.greater_equal(self.logits, 0.0), tf.int32, name="predictions")
            elif config.numClasses > 1:
                self.predictions = tf.argmax(self.logits, axsi=-1, name="predictions")

#             print("the current predictions are: ", self.predictions)
        
        with tf.name_scope("loss"):
            if config.numClasses == 1:
                losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.logits, 
                                                                labels=tf.cast(tf.reshape(self.inputY, [-1, 1]), 
                                                                              dtype=tf.float32))
            elif config.numClasses > 1:
                losses = tf.nn.sparse_sotfmax_cross_entropy_with_logits(logits=self.logits, 
                                                                        labels=self.inputY)

            self.loss = tf.reduce_mean(losses) + config.model.l2RegLambda * l2Loss

In [42]:
def mean(item):
    res = sum(item) / len(item) if len(item) > 0 else 0
    return res

def accuracy(pred_y, true_y):
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    
    corr = 0
    for index in range(len(pred_y)):
        if pred_y[index] == true_y[index]:
            corr += 1
    
    acc = corr / len(pred_y) if len(pred_y) > 0 else 0
    
    return acc


def binary_precision(pred_y, true_y, positive=1):
    """
    :param positive: 正例的索引表示
    """
    corr = 0
    pred_corr = 0
    for index in range(len(pred_y)):
        if pred_y[index] == positive:
            pred_corr += 1
            if pred_y[index] == true_y[index]:
                corr += 1
    
    prec = corr / pred_corr if pred_corr > 0 else 0
    
    return prec


def binary_recall(pred_y, true_y, positive=1):
    corr = 0
    true_corr = 0
    for index in range(len(pred_y)):
        if true_y[index] == positive:
            true_corr += 1
            if true_y[index] == pred_y[index]:
                corr += 1
    
    rec = corr / true_corr if true_corr > 0 else 0
    
    return rec


def binary_f_beta(pred_y, true_y, beta=1.0, positive=1):
    precision = binary_precision(pred_y, true_y, positive)
    recall = binary_recall(pred_y, true_y, positive)
    try:
        f_b = (1 + beta * beta) * precision * recall / (beta * beta * precision + recall)
    except:
        f_b = 0
    
    return f_b



def multi_precision(pred_y, true_y, labels):
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    
    precisions = [binary_precision(pred_y, true_y, label) for label in labels]
    prec = mean(precisions)
    return prec


def multi_recall(pred_y, true_y, labels):
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    
    recalls = [binary_recall(pred_y, true_y, label) for label in labels]
    rec = mean(recalls)
    return rec


def multi_f_beta(pred_y, true_y, labels, beta=1.0):
    if isinstance(pred_y[0], list):
        pred_y = [item[0] for item in pred_y]
    
    f_beats =[binary_f_bate(pred_y, true_y, beta, label) for label in labels]
    f_beta = mean(f_betas)
    return f_beta



def get_binary_metrics(pred_y, true_y, f_beta=1.0):
    acc = accuracy(pred_y, true_y)
    recall = binary_recall(pred_y, true_y)
    precision = binary_precision(pred_y, true_y)
    f_beta = binary_f_beta(pred_y, true_y, f_beta)
    
    return acc, recall, precision, f_beta



def get_multi_metrics(pred_y, true_y, labels, f_beta=1.0):
    acc = accuracy(pred_y, true_y)
    recall = mulyi_recall(pred_y, true_y, labels)
    precision = multi_precision(pred_y, true_y, labels)
    f_beta = multi_f_beta(pred_y, true_y, labels, f_beta)
    
    return acc, recall, precision, f_beta



In [50]:
trainReviews = data.trainReviews
trainLabels = data.trainLabels
evalReviews = data.evalReviews
evalLabels = data.evalLabels

wordEmbedding = data.wordEmbedding
labelList = data.labelList

with tf.Graph().as_default():
    session_conf = tf.ConfigProto(allow_soft_placement=True, log_device_placement=False)
    sess = tf.Session(config=session_conf)
    
    with sess.as_default():
        cnn = TextCNN(config, wordEmbedding)
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        optimizer = tf.train.AdamOptimizer(config.training.learningRate)
        gradsAndVars = optimizer.compute_gradients(cnn.loss)
        trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)
        
        gradSummaries = []
        for g, v in gradsAndVars:
            if g is not None:
                tf.summary.histogram("{}/grad/hist".format(v.name), g)
                tf.summary.scalar("{}/grad/sparsity".format(v.name), tf.nn.zero_fraction(g))
        
        outDir = os.path.abspath(os.path.join(os.path.curdir, "summarys"))
        print("writing to {} \n".format(outDir))
        
        lossSummary = tf.summary.scalar("loss", cnn.loss)
        summaryOp = tf.summary.merge_all()
        trainSummaryDir = os.path.join(outDir, "train")
        trainSummaryWriter = tf.summary.FileWriter(trainSummaryDir, sess.graph)
        evalSummaryDir = os.path.join(outDir, "eval")
        evalSummaryWriter = tf.summary.FileWriter(evalSummaryDir, sess.graph)
        
        saver = tf.train.Saver(tf.global_variables(), max_to_keep=5)
        savedModelPath = "\\".join(directory_path.split("\\")[:-1]) + "\\model\\textCNN\\savedModel"
        if os.path.exists(savedModelPath):
            os.rmdir(savedModelPath)
        
        builder = tf.saved_model.builder.SavedModelBuilder(savedModelPath)
        sess.run(tf.global_variables_initializer())
        
        def trainStep(batchX, batchY):
            feed_dict = {
                cnn.inputX: batchX, 
                cnn.inputY: batchY, 
                cnn.dropoutKeepProb: config.model.dropoutKeepProb
            }
            _, summary, step, loss, predictions = sess.run([trainOp, summaryOp, globalStep, cnn.loss, cnn.predictions], feed_dict)
            
            timeStr = datetime.datetime.now().isoformat()
            if config.numClasses == 1:
                acc, recall, prec, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, prec, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)
            
            trainSummaryWriter.add_summary(summary, step)
            return loss, acc, prec, recall, f_beta
        
        
        def devStep(batchX, batchY):
            feed_dict = {
                cnn.inputX: batchX, 
                cnn.inputY: batchY, 
                cnn.dropoutKeepProb: 1.0
            }
            summary, step, loss, predictions = sess.run([summaryOp, globalStep, cnn.loss, cnn.predictions], feed_dict)
            if config.numClasses == 1:
                acc, recall, precision, f_beta = get_binary_metrics(pred_y=predictions, true_y=batchY)
            elif config.numClasses > 1:
                acc, recall, precision, f_beta = get_multi_metrics(pred_y=predictions, true_y=batchY, labels=labelList)
            
            evalSummaryWriter.add_summary(summary, step)
            return loss, acc, precision, recall, f_beta
        
        
        for index in range(config.training.epochs):
            print("start to train models...")
            for batchTrain in nextBatch(trainReviews, trainLabels, config.batchSize):
                loss, acc, precision, recall, f_beta = trainStep(batchTrain[0], batchTrain[1])
                currentStep = tf.train.global_step(sess, globalStep)
                print("train: step: {}, loss: {}, acc: {}, recall: {}, precision: {}, f_beta: {}".format(
                    currentStep, loss, acc, recall, precision, f_beta))
                
                if currentStep % config.training.evaluateEvery == 0:
                    print("\n evaluate model...")
                    losses, accs, f_betas, precisions, recalls = [], [], [], [], []
                    for batchEval in nextBatch(evalReviews, evalLabels, config.batchSize):
                        loss, acc, precision, recall, f_beta = devStep(batchEval[0], batchEval[1])
                        losses.append(loss)
                        accs.append(acc)
                        precisions.append(precision)
                        recalls.append(recall)
                        f_betas.append(f_beta)
                    
                    time_str = datetime.datetime.now().isoformat()
                    print("{}, step: {}, loss: {}, acc: {}, precision: {}, recall: {}, f_beta: {}".format(
                        time_str, currentStep, mean(losses), mean(accs), mean(precisions), mean(recalls), mean(f_betas)))
            
                if currentStep % config.training.checkpointEvery == 0:
                    path = saver.save(sess, "\\".join(savedModelPath.split("\\")[:-1]) + "\\my_model", global_step=currentStep)
                    print("saved mdoel checkpoint tp {}\n".format(path))
        
        
        inputs = {
            "inputX": tf.saved_model.utils.build_tensor_info(cnn.inputX), 
            "keepProb": tf.saved_model.utils.build_tensor_info(cnn.dropoutKeepProb)
        }
        outputs = {
            "precisions": tf.saved_model.utils.build_tensor_info(cnn.predictions)
        }
        prediction_signature = tf.saved_model.signature_def_utils.build_signature_def(inputs=inputs, 
                                                                                      outputs=outputs, 
                                                                                      method_name=tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
        
        legacy_init_op = tf.group(tf.tables_initializer(), name="legacy_init_op")
        builder.add_meta_graph_and_variables(sess, [tf.saved_model.tag_constants.SERVING], 
                                            signature_def_map={"predict": prediction_signature}, 
                                            legacy_init_op=legacy_init_op)
        
        builder.save()
        
        

the current predictions are: self.predictions
writing to C:\Users\123\Documents\python_experence\nlp_model\model_code\summarys 

start to train models...
train: step: 1, loss: 3.054974317550659, acc: 0.453125, recall: 0.0945945945945946, precision: 0.7, f_beta: 0.16666666666666669
train: step: 2, loss: 2.5258941650390625, acc: 0.390625, recall: 0.7, precision: 0.35714285714285715, f_beta: 0.47297297297297297
train: step: 3, loss: 2.1744542121887207, acc: 0.4921875, recall: 0.828125, precision: 0.4953271028037383, f_beta: 0.6198830409356726
train: step: 4, loss: 1.8002183437347412, acc: 0.515625, recall: 0.6440677966101694, precision: 0.4810126582278481, f_beta: 0.5507246376811594
train: step: 5, loss: 1.6515357494354248, acc: 0.5390625, recall: 0.5483870967741935, precision: 0.5230769230769231, f_beta: 0.5354330708661418
train: step: 6, loss: 1.6065058708190918, acc: 0.5078125, recall: 0.2037037037037037, precision: 0.3548387096774194, f_beta: 0.2588235294117647
train: step: 7, loss: 2

W0924 12:06:11.657923  9780 deprecation.py:506] From <ipython-input-50-354640a71ea9>:120: calling SavedModelBuilder.add_meta_graph_and_variables (from tensorflow.python.saved_model.builder_impl) with legacy_init_op is deprecated and will be removed in a future version.
Instructions for updating:
Pass your op to the equivalent parameter main_op instead.


train: step: 780, loss: 0.14485587179660797, acc: 0.9609375, recall: 0.967741935483871, precision: 0.9523809523809523, f_beta: 0.96


In [24]:
config.training.epochs

5