# LSTM NLP 

In [1]:
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
import gensim
from gensim.models import word2vec
import jieba
import tensorflow.compat.v1 as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
import time
from random import randint
from random import shuffle

# Utils

In [2]:
def makeStopWord():
    """Load the stop-word vocabulary from ``stopword.txt``.

    Every line of the file is segmented with ``jieba.lcut`` (accurate mode,
    ``cut_all=False``) and all resulting tokens are collected in file order.
    Lines are handed to jieba unstripped, i.e. including their line ending,
    and duplicates are kept.

    Returns:
        list of str: the flat list of tokens produced from the whole file.
    """
    with open('stopword.txt', 'r', encoding='utf-8') as handle:
        raw_lines = handle.readlines()
    return [token
            for raw in raw_lines
            for token in jieba.lcut(raw, cut_all=False)]



def getWords(file):
    """Tokenize a UTF-8 text file line by line and drop stop words.

    Each line has its newline characters removed, is segmented with
    ``jieba.lcut`` (accurate mode) and is filtered against the module-level
    ``stopWord`` collection.

    Args:
        file: path of the text file to read; one sample per line.

    Returns:
        list of list of str: one token list per line of the file, in file
        order. A line whose tokens are all stop words (or a blank line)
        yields an empty list, so the result has as many entries as the file
        has lines.
    """
    # Snapshot the stop words as a set once per call: every token is tested
    # for membership, and a hash lookup avoids rescanning the whole list.
    stop_words = set(stopWord)
    lineList = []
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            tokens = jieba.lcut(line.replace('\n', ''), cut_all=False)
            lineList.append([word for word in tokens if word not in stop_words])
    return lineList

In [3]:
def words2Array(lineList):
    """Turn tokenized lines into a fixed-size tensor of word vectors.

    Reads three module-level names at call time: ``model`` (the loaded
    word2vec model, accessed through ``model.wv``), ``MAX_SIZE`` (number of
    time steps per sample) and ``dimsh`` (vector dimensionality).
    NOTE: the notebook later deletes ``model`` and rebinds the name to the
    network-building function, so this must be called before those cells run.

    Only the first ``MAX_SIZE`` tokens of a line are considered. Tokens that
    are missing from the vocabulary are skipped, the vectors that were found
    are packed at the front of the sample, and the remainder is zero padding.

    Args:
        lineList: list of token lists, e.g. ``[[w1, w2, ...], ...]``.

    Returns:
        tuple ``(linesArray, steps)`` where ``linesArray`` has shape
        ``(len(lineList), MAX_SIZE, dimsh)`` and ``steps[i]`` is the number of
        real (non-padding) vectors stored for sample ``i``.
    """
    # Preallocate the zero-padded result instead of stacking thousands of
    # small per-word arrays afterwards; unfilled slots are the padding.
    linesArray = np.zeros((len(lineList), MAX_SIZE, dimsh))
    steps = []
    keyed_vectors = model.wv
    for row, line in enumerate(lineList):
        found = 0
        for word in line[:MAX_SIZE]:
            try:
                # Index lookup replaces the deprecated ``word_vec`` call and
                # raises KeyError for out-of-vocabulary tokens just the same.
                vector = keyed_vectors[word]
            except KeyError:
                continue
            linesArray[row, found] = vector
            found += 1
        steps.append(found)
    steps = np.array(steps)
    return linesArray, steps

In [4]:
def convert2Data(posArray, negArray, posStep, negStep):
    """Merge positive and negative samples into one shuffled, labeled data set.

    Positive samples get the one-hot label ``[1, 0]`` and negative samples
    ``[0, 1]``. Samples, their step counts and their labels are shuffled
    together so the three outputs stay aligned.

    Args:
        posArray: vectorized positive samples.
        negArray: vectorized negative samples.
        posStep: number of real time steps for each positive sample.
        negStep: number of real time steps for each negative sample.

    Returns:
        tuple ``(data, steps, labels)``: ``data`` and ``steps`` are numpy
        arrays, ``labels`` is a plain list of two-element lists.
    """
    samples = [(posArray[k], posStep[k], [1, 0]) for k in range(len(posArray))]
    samples += [(negArray[k], negStep[k], [0, 1]) for k in range(len(negArray))]
    shuffle(samples)
    data = np.array([sample[0] for sample in samples])
    steps = np.array([sample[1] for sample in samples])
    labels = [sample[2] for sample in samples]
    return data, steps, labels



def makeData(posPath,negPath):
    """Build a shuffled, labeled data set from a positive and a negative file.

    Both files are tokenized with ``getWords`` (their line counts are
    printed), vectorized with ``words2Array`` and finally merged and shuffled
    by ``convert2Data``.

    Args:
        posPath: path of the file holding the positive samples.
        negPath: path of the file holding the negative samples.

    Returns:
        tuple ``(Data, Steps, Labels)`` exactly as produced by
        ``convert2Data``.
    """
    # Tokenize: each result looks like [[word1, word2, ...], [word1, ...], ...]
    positive_lines = getWords(posPath)
    print("The positive data's length is :",len(positive_lines))
    negative_lines = getWords(negPath)
    print("The negative data's length is :",len(negative_lines))
    # Vectorize each class separately so the labels can be attached afterwards.
    pos_vectors, pos_lengths = words2Array(positive_lines)
    neg_vectors, neg_lengths = words2Array(negative_lines)
    # Mix both classes and shuffle them into the final data set.
    return convert2Data(pos_vectors, neg_vectors, pos_lengths, neg_lengths)

In [5]:
def makeTestData(dataPath):
    """Build the unlabeled data set that the trained model will classify.

    The file is tokenized with ``getWords`` (its line count is printed),
    vectorized with ``words2Array`` and shuffled by ``testConvert2Data``, so
    the returned rows are not in file order.

    Args:
        dataPath: path of the file holding one sample per line.

    Returns:
        tuple ``(Data, Steps)`` exactly as produced by ``testConvert2Data``.
    """
    # Tokenize: the result looks like [[word1, word2, ...], [word1, ...], ...]
    token_lines = getWords(dataPath)
    print("The data's length is :",len(token_lines))
    # Vectorize, then shuffle samples together with their step counts.
    vectors, lengths = words2Array(token_lines)
    return testConvert2Data(vectors, lengths)



def testConvert2Data(dataArray, dataStep):
    randIt = []
    data = []
    steps = []
    labels = []
    for i in range(len(dataArray)):
        randIt.append([dataArray[i], dataStep[i]])
    shuffle(randIt)
    for i in range(len(randIt)):
        data.append(randIt[i][0])
        steps.append(randIt[i][1])
    data = np.array(data)
    steps = np.array(steps)
    return data, steps

# 设置停用词并创建词向量

In [6]:
# Wall-clock start; read again in the last cell to report the total run time.
timeA=time.time()
# Location of the pre-trained word2vec model loaded below.
word2vec_path = './word2vec/200/word2vec.model'
model=gensim.models.Word2Vec.load(word2vec_path)
# Word-vector dimensionality, taken from the loaded model.
dimsh=model.vector_size
# Number of time steps every sample is truncated / zero-padded to in words2Array.
MAX_SIZE=25
# Module-level stop-word list that getWords filters against.
stopWord = makeStopWord()

Building prefix dict from the default dictionary ...
Loading model from cache /var/folders/fw/lzdssnb90018371gm3r0tgph0000gn/T/jieba.cache
Loading model cost 0.521 seconds.
Prefix dict has been built successfully.


# 加载并向量化数据，统计评论数

In [7]:
# Build the shuffled, labeled training set from the positive / negative files.
print("In train data:")
trainData, trainSteps, trainLabels = makeData('data/train/Pos-train.txt',
                                              'data/train/Neg-train.txt')
# Build the shuffled, labeled test set the same way.
print("In test data:")
testData, testSteps, testLabels = makeData('data/test/Pos-test.txt', 'data/test/Neg-test.txt')

# Vectorize the unlabeled posts that are classified once training has finished.
print("In real data:")
realData, realSteps = makeTestData('data/real/all_posts.txt')

# Training labels are sliced per batch and fed to a placeholder, so keep them
# as an array. testLabels deliberately stays a list of lists: the evaluation
# loop calls .index(1) on each label.
trainLabels = np.array(trainLabels)

# Everything is vectorized, so the word2vec model is no longer needed; drop the
# reference (the name `model` is reused for the network function further down).
del model

In train data:
The positive data's length is : 7840
The negative data's length is : 10853


  wordsArray.append(model.wv.word_vec(line[i]))


In test data:
The positive data's length is : 1961
The negative data's length is : 2714
In real data:
The data's length is : 39199


# 模型准备

In [8]:
# Number of units in the LSTM cell (also the input width of the first dense layer).
num_nodes = 128
# Samples per training step; also fixes the first dimension of the placeholders.
batch_size = 16
# Number of target classes (labels are one-hot: [1,0] positive, [0,1] negative).
output_size = 2

In [9]:
def model(dataset, steps):
    """Build the classifier sub-graph: an LSTM followed by two dense layers.

    Uses the graph-level objects ``lstm_cell``, ``w1``, ``b1``, ``w2`` and
    ``b2``, which must already exist when this function is called.

    Args:
        dataset: tensor of word vectors, passed to the RNN as ``inputs``.
        steps: tensor passed to the RNN as ``sequence_length`` (number of
            real time steps per sample).

    Returns:
        Tensor of unnormalized class scores (logits); no softmax is applied.
    """
    _, final_state = tf.nn.dynamic_rnn(cell=lstm_cell,
                                       inputs=dataset,
                                       sequence_length=steps,
                                       dtype=tf.float32)
    # The last entry of the LSTM state tuple (the `h` output for an
    # LSTMStateTuple) serves as the fixed-size summary of the sequence.
    sequence_summary = final_state[-1]

    # Two affine layers with no activation in between.
    projected = tf.matmul(sequence_summary, w1) + b1
    return tf.matmul(projected, w2) + b2

In [10]:
graph = tf.Graph()

with graph.as_default():

    # Mini-batch inputs that are fed on every training step.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, MAX_SIZE, dimsh))
    tf_train_steps = tf.placeholder(tf.int32, shape=(batch_size,))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, output_size))

    # The test set and the unlabeled posts are embedded in the graph as
    # constants, so evaluating them needs no feed_dict.
    tf_test_dataset = tf.constant(testData, tf.float32)
    tf_test_steps = tf.constant(testSteps, tf.int32)

    tf_real_dataset = tf.constant(realData, tf.float32)
    tf_real_steps = tf.constant(realSteps, tf.int32)

    # One cell object shared by all three model() calls below.
    lstm_cell = tf.nn.rnn_cell.BasicLSTMCell(num_units=num_nodes,
                                             state_is_tuple=True)

    # Dense layer 1: num_nodes -> num_nodes // 2
    w1 = tf.Variable(tf.truncated_normal([num_nodes, num_nodes // 2], stddev=0.1))
    b1 = tf.Variable(tf.truncated_normal([num_nodes // 2], stddev=0.1))

    # Dense layer 2: num_nodes // 2 -> output_size. The width comes from
    # `output_size` so it always matches the label placeholder.
    w2 = tf.Variable(tf.truncated_normal([num_nodes // 2, output_size], stddev=0.1))
    b2 = tf.Variable(tf.truncated_normal([output_size], stddev=0.1))

    # ----------------------- training and evaluation ops -----------------------
    # Training logits for the fed mini-batch.
    train_logits = model(tf_train_dataset, tf_train_steps)
    # Loss: mean softmax cross-entropy over the batch. The _v2 op replaces the
    # deprecated one; stop_gradient on the labels keeps the original behavior
    # of never back-propagating into the labels.
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=tf.stop_gradient(tf_train_labels), logits=train_logits))
    # Optimizer: plain SGD with a fixed learning rate.
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    # Class probabilities for the whole test set.
    test_prediction = tf.nn.softmax(model(tf_test_dataset, tf_test_steps))
    # Class probabilities for the unlabeled posts.
    real_prediction = tf.nn.softmax(model(tf_real_dataset, tf_real_steps))

Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.





# 训练/测试模型

In [11]:
# Total number of training iterations (one mini-batch each).
num_steps = 20001
# Report the running training loss and the test accuracy every this many steps.
summary_frequency = 500

In [12]:
with tf.Session(graph = graph) as session:
    tf.global_variables_initializer().run()
    print('Initialized')
    # Running sum of the batch losses since the last report, plus the number
    # of batches it covers. Dividing by the real count keeps the reported mean
    # exact for every window (the first one spans steps 0..summary_frequency,
    # i.e. one batch more than the later ones).
    mean_loss = 0
    loss_count = 0
    for step in range(num_steps):
        # Slide a batch-sized window through the training set, wrapping around.
        offset = (step * batch_size) % (len(trainLabels)-batch_size)
        feed_dict={tf_train_dataset:trainData[offset:offset + batch_size],
                   tf_train_labels:trainLabels[offset:offset + batch_size],
                   tf_train_steps:trainSteps[offset:offset + batch_size]}

        # One optimization step on the current mini-batch.
        _, l = session.run([optimizer,loss],
                           feed_dict = feed_dict)
        mean_loss += l
        loss_count += 1
        if step >0 and step % summary_frequency == 0:
            print("The step is: %d"%(step))
            print("In train data,the loss is:%.4f"%(mean_loss / loss_count))
            mean_loss = 0
            loss_count = 0

            # Evaluate on the whole test set: a sample counts as correct when
            # the probability assigned to its true class exceeds 0.5.
            prediction = session.run(test_prediction)
            acrc = 0
            for i in range(len(prediction)):
                if prediction[i][testLabels[i].index(1)] > 0.5:
                    acrc = acrc + 1
            print("In test data,the accuracy is:%.2f%%"%((acrc/len(testLabels))*100))

    #----------------------------------- classification -----------------------------------
    # Runs once, after the last training step, whatever num_steps is set to.
    # The counters are initialized here so they never depend on the summary
    # branch above having run.
    prediction = session.run(real_prediction)
    pos = 0
    neg = 0
    for i in range(len(prediction)):
        if i == 0:
            print("\nStart object classification\n")
        if i%1000 == 0 and i != 0:
            print(i, " posts have been classified.\n")
        # Column 0 is the positive class (one-hot label [1,0]).
        if prediction[i][0] > 0.5:
            pos += 1
        else:
            neg += 1
    print("Positive: ",pos)
    print("Negative: ",neg)
    print("Done!")

Initialized
The step is: 500
In train data,the loss is:0.6055
In test data,the accuracy is:72.90%
The step is: 1000
In train data,the loss is:0.5389
In test data,the accuracy is:74.65%
The step is: 1500
In train data,the loss is:0.5084
In test data,the accuracy is:75.66%
The step is: 2000
In train data,the loss is:0.4923
In test data,the accuracy is:76.43%
The step is: 2500
In train data,the loss is:0.4747
In test data,the accuracy is:77.13%
The step is: 3000
In train data,the loss is:0.4655
In test data,the accuracy is:77.01%
The step is: 3500
In train data,the loss is:0.4511
In test data,the accuracy is:77.16%
The step is: 4000
In train data,the loss is:0.4406
In test data,the accuracy is:77.78%
The step is: 4500
In train data,the loss is:0.4350
In test data,the accuracy is:77.50%
The step is: 5000
In train data,the loss is:0.4211
In test data,the accuracy is:78.03%
The step is: 5500
In train data,the loss is:0.4094
In test data,the accuracy is:78.42%
The step is: 6000
In train data,

# 算时间

In [13]:
# Elapsed wall-clock time, in whole seconds, since timeA was recorded in the setup cell.
timeB=time.time()
print("time cost:",int(timeB-timeA))

time cost: 272
