In [1]:
import jieba
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
import logging
 
import tensorflow
from tensorflow.keras.layers import Dense, LSTM, Dropout, GlobalAveragePooling1D, Embedding
from tensorflow.keras.layers import Bidirectional, GlobalMaxPool1D
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential


In [2]:
stoplist = [i.strip() for i in open('停用词.txt', encoding = 'utf-8').readlines()]
def m_cut(intxt):
    word = [w for w in jieba.cut(intxt) if w not in stoplist and len(w) > 1 and not re.match('^[a-z|A-Z|0-9|.]*$',w)]
    strword = " ".join(word)
    return strword

In [3]:
raw_data = pd.read_excel('review_data.xlsx', sheet_name = 0)
raw_data.head(10)

Unnamed: 0,评论文本,class
0,西安进过一个月半终于低风险了[允悲]#西安今天起恢复正常出行#,1
1,上高速看核酸不,1
2,[哈哈],1
3,终于等来了全面恢复正常出行的好消息[心][good]，大家终于可以赶上过个快乐年了！,1
4,什么时候可以出市,1
5,莫名其妙 低风险地区过来为啥还要48小时核酸，脱了裤子放屁。有绿码还不够吗,1
6,有人知道坐高铁在西安转站有啥要求不？,1
7,不知道上下防疫政策不不同步的难度在哪里？,1
8,[吃瓜],1
9,[赞][赞][赞],1


In [5]:
#预处理
data = raw_data.copy()

#去除空值
data['评论文本'] = data['评论文本'].dropna(how='any')
data['评论文本'] = data['评论文本'].astype(str)

#去停词，分词
data['评论文本'] = data['评论文本'].apply(m_cut)
data['评论文本'].head(10)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\u1128714\AppData\Local\Temp\jieba.cache
Loading model cost 2.460 seconds.
Prefix dict has been built successfully.


0       西安 一个月 终于 风险 允悲 西安 恢复正常 出行
1                            高速 核酸
2                                 
3       终于 恢复正常 出行 好消息 终于 赶上 过个 快乐
4                               出市
5    莫名其妙 风险 地区 为啥 小时 核酸 裤子 放屁 有绿码
6                      有人 高铁 西安 转站
7                      防疫 政策 同步 难度
8                               吃瓜
9                                 
Name: 评论文本, dtype: object

In [6]:
lines=data['评论文本'].values.tolist()
content = "".join(lines)

In [7]:


logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 

model = Word2Vec(content,     # 上文处理过的全部语料
                 min_count=1,  # 词频阈值 词出现的频率 小于这个频率的词 将不予保存
                 window=5  # 窗口大小 表示当前词与预测词在一个句子中的最大距离是多少
                 )
model.save('./models/Word2vec_v1.pkl')  # 保存模型


2022-04-02 11:42:16,361 : INFO : collecting all words and their counts
2022-04-02 11:42:16,363 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2022-04-02 11:42:16,381 : INFO : PROGRESS: at sentence #10000, processed 10000 words, keeping 1006 word types
2022-04-02 11:42:16,414 : INFO : PROGRESS: at sentence #20000, processed 20000 words, keeping 1364 word types
2022-04-02 11:42:16,439 : INFO : PROGRESS: at sentence #30000, processed 30000 words, keeping 1679 word types
2022-04-02 11:42:16,478 : INFO : PROGRESS: at sentence #40000, processed 40000 words, keeping 1836 word types
2022-04-02 11:42:16,498 : INFO : PROGRESS: at sentence #50000, processed 50000 words, keeping 1937 word types
2022-04-02 11:42:16,529 : INFO : PROGRESS: at sentence #60000, processed 60000 words, keeping 2045 word types
2022-04-02 11:42:16,545 : INFO : PROGRESS: at sentence #70000, processed 70000 words, keeping 2166 word types
2022-04-02 11:42:16,581 : INFO : PROGRESS: at sentence #8000

In [8]:
#创建词语字典，并返回word2vec模型中词语的索引，词向量

def create_dictionaries(model):

    gensim_dict = Dictionary()    # 创建词语词典
    gensim_dict.doc2bow(model.wv.key_to_index.keys(), allow_update=True)

    w2indx = {v: k + 1 for k, v in gensim_dict.items()}  # 词语的索引，从1开始编号
    w2vec = {word: model.wv[word] for word in w2indx.keys()}  # 词语的词向量
    return w2indx, w2vec

model = Word2Vec.load('./models/Word2vec_v1.pkl')         # 加载模型
index_dict, word_vectors= create_dictionaries(model)  # 索引字典、词向量字典


2022-04-02 11:43:04,939 : INFO : loading Word2Vec object from ./models/Word2vec_v1.pkl
2022-04-02 11:43:04,962 : INFO : loading wv recursively from ./models/Word2vec_v1.pkl.wv.* with mmap=None
2022-04-02 11:43:04,963 : INFO : setting ignored attribute cum_table to None
2022-04-02 11:43:05,118 : INFO : Word2Vec lifecycle event {'fname': './models/Word2vec_v1.pkl', 'datetime': '2022-04-02T11:43:05.118155', 'gensim': '4.1.2', 'python': '3.7.13 (default, Mar 28 2022, 08:03:21) [MSC v.1916 64 bit (AMD64)]', 'platform': 'Windows-10-10.0.19041-SP0', 'event': 'loaded'}


In [9]:
# 参数设置
vocab_dim = 150 # 向量维度
maxlen = 150 # 文本保留的最大长度
batch_size = 100 # 训练过程中 每次传入模型的特征数量
n_epoch = 1   # 迭代次数

n_symbols = len(index_dict) + 1  # 索引数字的个数，因为有的词语索引为0，所以+1
embedding_weights = np.zeros((n_symbols, 100))  # 创建一个n_symbols * 100的0矩阵

for w, index in index_dict.items():  # 从索引为1的词语开始，用词向量填充矩阵
    embedding_weights[index, :] = word_vectors[w]  # 词向量矩阵，第一行是0向量（没有索引为0的词语，未被填充）


In [11]:
def text_to_index_array(p_new_dic, p_sen): 
    """
    文本或列表转换为索引数字
    :param p_new_dic:
    :param p_sen:
    :return:
    """
    if type(p_sen) == list:
        new_sentences = []
        for sen in p_sen:
            new_sen = []
            for word in sen:
                try:
                    new_sen.append(p_new_dic[word])  # 单词转索引数字
                except:
                    new_sen.append(0)  # 索引字典里没有的词转为数字0
            new_sentences.append(new_sen)
        return np.array(new_sentences)   # 转numpy数组
    else:
        new_sentences = []
        sentences = []
        p_sen = p_sen.split(" ")
        for word in p_sen:
            try:
                sentences.append(p_new_dic[word])  # 单词转索引数字
            except:
                sentences.append(0)  # 索引字典里没有的词转为数字0
        new_sentences.append(sentences)
        return new_sentences


In [12]:
from sklearn.model_selection import train_test_split
'''
with open("./原始语料/neg.txt", "r", encoding='UTF-8') as f:
            neg_data1 = f.readlines()

with open("./原始语料/pos.txt", "r", encoding='UTF-8') as g:
    pos_data1 = g.readlines()

neg_data = sorted(set(neg_data1), key=neg_data1.index)  #列表去重 保持原来的顺序
pos_data = sorted(set(pos_data1), key=pos_data1.index)

neg_data = [process_txt(data) for data in neg_data]
pos_data = [process_txt(data) for data in pos_data]
data = neg_data + pos_data


# 读取语料类别标签
label_list = ([0] * len(neg_data) + [1] * len(pos_data))
'''

labels, vocabulary = list(data['class']), list(data['评论文本'])

# 划分训练集和测试集，此时都是list列表
X_train_l, X_test_l, y_train_l, y_test_l = train_test_split(vocabulary, labels, test_size=0.2)

# 转为数字索引形式

# token = Tokenizer(num_words=3000)   #字典数量
# token.fit_on_texts(train_text)

X_train = text_to_index_array(index_dict, X_train_l)
X_test = text_to_index_array(index_dict, X_test_l)

y_train = np.array(y_train_l)  # 转numpy数组
y_test = np.array(y_test_l)

print("训练集shape： ", X_train.shape)
print("测试集shape： ", X_test.shape)




训练集shape：  (82447,)
测试集shape：  (20612,)


In [21]:
# 参数设置
vocab_dim = 150 # 向量维度
maxlen = 150 # 文本保留的最大长度
batch_size = 100 # 训练过程中 每次传入模型的特征数量
n_epoch = 1   # 迭代次数

X_train = pad_sequences(X_train, maxlen=maxlen)
X_test = pad_sequences(X_test, maxlen=maxlen)

n_symbols = len(index_dict) + 1  # 索引数字的个数，因为有的词语索引为0，所以+1
embedding_weights = np.zeros((n_symbols, 100))  # 创建一个n_symbols * 100的0矩阵

for w, index in index_dict.items():  # 从索引为1的词语开始，用词向量填充矩阵
    embedding_weights[index, :] = word_vectors[w]  # 词向量矩阵，第一行是0向量（没有索引为0的词语，未被填充）


In [22]:
def show_train_history(train_history,train, velidation):
    """
    可视化训练过程 对比
    :param train_history:
    :param train:
    :param velidation:
    :return:
    """
    plt.plot(train_history.history[train])
    plt.plot(train_history.history[velidation])
    plt.title("Train History")   #标题
    plt.xlabel('Epoch')    #x轴标题
    plt.ylabel(train)  #y轴标题
    plt.legend(['train', 'test'], loc='upper left')  #图例 左上角
    plt.show()


In [23]:
def train_lstm(p_n_symbols, p_embedding_weights, p_X_train, p_y_train, p_X_test, p_y_test, X_test_l):
    print('...CREATING MODEL...')
    model = Sequential()
    model.add(Embedding(output_dim=100,  # 输出向量维度
                        input_dim=p_n_symbols,  # 词汇表的维度(总共有多少个不相同的词), 输入向量维度
                        mask_zero=True,         # 使我们填补的0值在后续训练中不产生影响（屏蔽0值）
                        weights=[p_embedding_weights],   # 对数据加权
                        input_length=maxlen))      # 每个特征的长度

    model.add(Bidirectional(LSTM(32, return_sequences = True)))
    # using BiLSTM to represent every word and get the contextual information
    model.add(GlobalAveragePooling1D())
    # get the biggest numbers to represent a sentence
    model.add(Dense(20, activation="relu"))
    # fully connected layer
    model.add(Dropout(0.05))
    model.add(Dense(1, activation="sigmoid"))
    model.summary()

    print('...COMPILING...')
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])
    print("FINISH!!")

    print("...TRAINING...")
    train_history = model.fit(p_X_train, p_y_train, batch_size=batch_size, epochs=n_epoch,
              validation_data=(p_X_test, p_y_test))

    print("...EVALUATING...")
    score, acc = model.evaluate(p_X_test, p_y_test, batch_size=batch_size)
    label = model.predict(p_X_test)
    print('Test score:', score)
    print('Test accuracy:', 1 - acc)
    '''
    for (a, b, c) in zip(p_y_test, X_test_l, labels):
        print("原文为："+ "".join(b))
        print("预测倾向为", a)
        print("真实倾向为", c)

    show_train_history(train_history, 'acc', 'val_acc')    # 训练集准确率与验证集准确率 折线图
    show_train_history(train_history, 'loss', 'val_loss')  # 训练集误差率与验证集误差率 折线图
    '''
    """SAVE MODEL"""
    model.save('./models/emotion_model_LSTM.h5')
    print("模型保存成功")


In [24]:
train_lstm(p_n_symbols=embedding_weights.shape[0], p_embedding_weights=embedding_weights, p_X_train=X_train, p_y_train=y_train, p_X_test=X_test, p_y_test=y_test, X_test_l=X_test_l)

Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


...CREATING MODEL...


Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 150, 100)          438100    
_________________________________________________________________
bidirectional (Bidirectional (None, 150, 64)           34048     
_________________________________________________________________
global_average_pooling1d (Gl (None, 64)                0         
_________________________________________________________________
dense (Dense)                (None, 20)                1300      
_________________________________________________________________
dropout (Dropout)            (None, 20)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 21        
Total params: 473,469
Trainable params: 473,469
Non-trainable params: 0
__________________________________________________