In [1]:
from utils import load_corpus, stopwords, processing
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import os
import datetime
import pandas as pd

In [3]:
today = datetime.date.today().strftime('%Y%m%d')
if not os.path.exists('./evaluation/lstm/{}'.format(today)):
    os.makedirs('./evaluation/lstm/{}'.format(today))
writer = SummaryWriter(log_dir=os.path.join('./evaluation/lstm', today))

if not os.path.exists('./model/classification/lstm/{}'.format(today)):
    os.makedirs('./model/classification/lstm/{}'.format(today))

In [5]:
df_train=pd.read_csv('../Dataset/weibo_senti_120k_train.csv') 
df_test=pd.read_csv('../Dataset/weibo_senti_120k_test.csv') 

In [6]:
df_test['label'].value_counts()

1    19783
0    19364
Name: label, dtype: int64

In [7]:
# 预处理
df_train['text_split'] = df_train['text'].apply(lambda x: processing(x))
df_test['text_split'] = df_test['text'].apply(lambda x: processing(x))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.617 seconds.
Prefix dict has been built successfully.


In [8]:
df_train['text_split']

0                                        我 想 看 啊 求 sponsor
1                                             我 靠 这 是 什么 ！
2                                                从 烟花 中来 ？
3        和 某些 搞 艺术 的 人 聊天 上 一秒 想 骂 傻 X 下 一秒 还算赞 总之 跌倒 起...
4                                                   外国 绕口令
                               ...                        
91336    中国 有 自古 句 俗话 叫 隐恶扬善 足够 解释 王局 的 困惑 了 用 现在 官方 主流...
91337                                       解散 彻底 解散 ！ ！ ！
91338                                                   回复
91339                      这么 早 回家 真是 不可思议 这 可是 周六 呀 ！ ！ ！
91340    我 的 黑米 啊 不知道 你 想 不想 我 还 记得 我 抱 你 回家 的 时候 转眼 6 ...
Name: text_split, Length: 91341, dtype: object

In [9]:
# word2vec要求的输入格式: list(word)
wv_input = df_train['text_split'].map(lambda s: s.split(" "))   # [for w in s.split(" ") if w not in stopwords]
wv_input.head()          

0                             [我, 想, 看, 啊, 求, sponsor]
1                                  [我, 靠, 这, 是, 什么, ！]
2                                       [从, 烟花, 中来, ？]
3    [和, 某些, 搞, 艺术, 的, 人, 聊天, 上, 一秒, 想, 骂, 傻, X, 下,...
4                                            [外国, 绕口令]
Name: text_split, dtype: object

In [10]:
from gensim import models

# Word2Vec
word2vec = models.Word2Vec(wv_input, 
                           vector_size=100,   # 词向量维度
                           min_count=1,      # 最小词频, 因为数据量较小, 这里卡1
                           epochs=1000)      # 迭代轮次



In [11]:
word2vec.save('model/word/word2vec_120k_without_emo.model')

In [9]:
import time
t1 = time.time()
word2vec = models.Word2Vec.load('model/word/word2vec_120k.model')
t2 = time.time()
print(".molde load time %.4f"%(t2-t1))

.molde load time 1.1300


In [12]:
word2vec.wv.most_similar("你")

[('我', 0.9386610984802246),
 ('他', 0.853631854057312),
 ('你们', 0.8486167192459106),
 ('她', 0.8248060345649719),
 ('自己', 0.8246130347251892),
 ('他们', 0.7918105721473694),
 ('的', 0.7791113257408142),
 ('了', 0.774533212184906),
 ('我们', 0.7697728872299194),
 ('？', 0.7607975006103516)]

In [13]:
word2vec.wv.most_similar("悲伤")

[('伤心', 0.4740165174007416),
 ('伤感', 0.4664136469364166),
 ('心酸', 0.4656721353530884),
 ('事情', 0.4350576102733612),
 ('感人', 0.42465442419052124),
 ('忧伤', 0.4231449365615845),
 ('沉默', 0.4070686399936676),
 ('发生', 0.39861395955085754),
 ('世间', 0.3984120488166809),
 ('事', 0.396019846200943)]

### 情绪词典

In [14]:
with open('/root/nas/chinese-sentiment-analysis/data/negative-words.txt', encoding='utf-8') as f:
    negative_words = f.read().splitlines() 
with open('/root/nas/chinese-sentiment-analysis/data/positive-words.txt', encoding='utf-8') as f:
    positive_words = f.read().splitlines() 

In [15]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [16]:
# 超参数
learning_rate = 5e-4
input_size = 768
num_epoches = 20
batch_size = 128
embed_size = 100
hidden_size = 64
num_layers = 2

In [17]:
# 数据集
class MyDataset(Dataset):
    def __init__(self, df):
        self.data = []
        self.emo = []
        self.label = df["label"].tolist()
        # 注意这里，别取错列了！！
        for s in df["text_split"].tolist():
            vectors = []
            for w in s.split(" "):
                if w in word2vec.wv.key_to_index:
                    vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
                else:
                    vectors.append([0]*embed_size)
                
                
            vectors = torch.Tensor(vectors)
            # Lexicon Embedding
            emo_encoded = torch.Tensor([2 if word in positive_words else 1 if word in negative_words else 0 for word in s.split(" ") ])
            self.data.append(vectors)
            self.emo.append(emo_encoded)
    
    def __getitem__(self, index):
        data = self.data[index]
        emo = self.emo[index]
        label = self.label[index]
        return data, emo, label

    def __len__(self):
        return len(self.label)

def collate_fn(data):
    """
    :param data: 第0维：data，第1维：emo情绪词，第2维：label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x1 = [i[0] for i in data]
    x2 = [i[1] for i in data]
    y = [i[2] for i in data]

    data = pad_sequence(x1, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    emo = pad_sequence(x2, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, emo, torch.tensor(y, dtype=torch.float32), torch.tensor(data_length)


# 训练集
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# 测试集
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

In [18]:
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2, lengths):
        x = torch.cat((x2.unsqueeze(-1),x1),2)  
        # x = x1
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size+1, hidden_size, num_layers).to(device)   

In [19]:
from sklearn import metrics

# 在测试集效果检验
def test(epoch,num_epoches):
    y_pred, y_true = [], []

    with torch.no_grad():
        for x, emo, labels, lengths in test_loader:
            x = x.to(device)
            emo = emo.to(device)
            outputs = lstm(x,emo, lengths)         # 前向传播
            outputs = outputs.view(-1)          # 将输出展平
            y_pred.append(outputs)
            y_true.append(labels)

    y_prob = torch.cat(y_pred).cpu()
    y_true = torch.cat(y_true).cpu()
    y_pred = y_prob.clone()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_prob)
    print('Epoch {}/{}, P {:.4f}, R {:.4f}, F1 {:.4f}, AUC {:.4f}'.format(
        epoch, num_epoches, precision.mean(), recall.mean(), f1.mean(), roc_auc.mean()))
    writer.add_scalar('precision', precision.mean(), epoch)
    writer.add_scalar('recall', recall.mean(), epoch)
    writer.add_scalar('f1score', f1.mean(), epoch)
    writer.add_scalar('auc', roc_auc.mean(), epoch)

In [20]:
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [21]:
# 迭代训练
for epoch in range(num_epoches):
    total_loss = 0
    for i, (x,emo, labels, lengths) in enumerate(train_loader):
        x = x.to(device)
        emo = emo.to(device)
        labels = labels.to(device)
        outputs = lstm(x, emo, lengths)          # 前向传播
        logits = outputs.view(-1)           # 将输出展平
        loss = criterion(logits, labels)    # loss计算
        total_loss += loss
        optimizer.zero_grad()               # 梯度清零
        loss.backward(retain_graph=True)    # 反向传播，计算梯度
        optimizer.step()                    # 梯度更新
        if (i+1) % 500 == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0
    writer.add_scalar('train_loss', total_loss, epoch)
    # test
    test(epoch,num_epoches)
    
    # save model
    model_path = "./model/classification/lstm/{}/lstm_120k_{}.model".format(today,epoch+1)
    torch.save(lstm, model_path)
    print("saved model: ", model_path)
writer.close()

epoch:1, step:500, loss:28.236026763916016
Epoch 0/20, P 0.7188, R 0.7145, F1 0.7135, AUC 0.7956
saved model:  ./model/classification/lstm/20220303/lstm_120k_1.model
epoch:2, step:500, loss:26.20993995666504
Epoch 1/20, P 0.7212, R 0.7197, F1 0.7196, AUC 0.8014
saved model:  ./model/classification/lstm/20220303/lstm_120k_2.model
epoch:3, step:500, loss:25.227643966674805
Epoch 2/20, P 0.7242, R 0.7200, F1 0.7192, AUC 0.8024
saved model:  ./model/classification/lstm/20220303/lstm_120k_3.model
epoch:4, step:500, loss:23.93129539489746
Epoch 3/20, P 0.7237, R 0.7188, F1 0.7178, AUC 0.8018
saved model:  ./model/classification/lstm/20220303/lstm_120k_4.model
epoch:5, step:500, loss:22.52192497253418
Epoch 4/20, P 0.7182, R 0.7151, F1 0.7145, AUC 0.7947
saved model:  ./model/classification/lstm/20220303/lstm_120k_5.model
epoch:6, step:500, loss:20.962766647338867
Epoch 5/20, P 0.7126, R 0.7076, F1 0.7064, AUC 0.7888
saved model:  ./model/classification/lstm/20220303/lstm_120k_6.model
epoch:7

In [35]:
from utils import processing

strs = ["我想说我会爱你多一点点", "日有所思梦感伤"]
strs = ["我一声龙啸凌云志！热血燃冬扶摇起！","这么大个人了，不嫌丢人？一点素质也没有"]

data = []
for s in strs:
    vectors = []
    print(processing(s).split(" "))
    for w in processing(s).split(" "):
        if w in word2vec.wv.key_to_index:
            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
    vectors = torch.Tensor(vectors)
    data.append(vectors)

x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))
with torch.no_grad():
    x = x.to(device)
    outputs = lstm(x, lengths)       # 前向传播
    outputs = outputs.view(-1)          # 将输出展平
outputs.cpu()[0].item()

['我', '一声', '龙啸', '凌云志', '！', '热血', '燃冬', '扶摇', '起', '！']
['这么', '大', '个人', '了', '不嫌', '丢人', '？', '一点', '素质', '也', '没有']


IndexError: tuple index out of range

In [None]:
STOP

### 验证

In [None]:
import torch
import pandas as pd
from utils import processing

# 超参数
embed_size = 100
hidden_size = 64
num_layers = 2
# 需先搭建网络模型model
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x1, x2, lengths):
        x = torch.cat((x2.unsqueeze(-1),x1),2)  
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size+1, hidden_size, num_layers).to(device)   
# 然后通过下面的语句加载参数
# lstm_new = model.load_state_dict(torch.load('model/classification/lstm_10k_7.model'))
lstm=torch.load('model/classification/lstm_10k_7.model')

In [None]:
import time
from gensim import models
t1 = time.time()
word2vec = models.Word2Vec.load('model/word/word2vec_10k.model')
t2 = time.time()
print(".molde load time %.4f"%(t2-t1))

In [None]:
def collate_fn(data):
    """
    :param data: 第0维：data，第1维：label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x = [i[0] for i in data]
    y = [i[1] for i in data]

    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, torch.tensor(y, dtype=torch.float32), torch.tensor(data_length)

In [None]:
def calculate_emo_score(text):
    vectors= []
    for w in processing(text).split(" "):
        if w in word2vec.wv.key_to_index:
            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
    vectors = torch.Tensor(vectors)
    x, _, lengths = collate_fn(list(zip([vectors], [-1])))
    if lengths[0].item()<1:
        return None
    with torch.no_grad():
        x = x.to(device)
        outputs = lstm(x, lengths)       # 前向传播
        outputs = outputs.view(-1)    # 将输出展平
        result_score = outputs.cpu()[0].item()      
    return result_score

# 验证
crawl_result = pd.read_csv('../Dataset/crawl_result.csv')
crawl_result['emo_score'] = crawl_result['texts'].apply(lambda x: calculate_emo_score(x))
crawl_result.to_csv('result/crawl_result_emo.csv', index=False, encoding='utf-8-sig')