In [2]:
from utils import load_corpus, stopwords, processing
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import os
import datetime
import pandas as pd

In [3]:
today = datetime.date.today().strftime('%Y%m%d')
if not os.path.exists('./evaluation/lstm/{}'.format(today)):
    os.makedirs('./evaluation/lstm/{}'.format(today))
writer = SummaryWriter(log_dir=os.path.join('./evaluation/lstm', today))

if not os.path.exists('./model/classification/lstm/{}'.format(today)):
    os.makedirs('./model/classification/lstm/{}'.format(today))

In [4]:
df_train=pd.read_csv('../Dataset/weibo_senti_100k_train.csv') 
df_test=pd.read_csv('../Dataset/weibo_senti_100k_test.csv') 

In [5]:
df_train['text']

0                                                        怒
1              good 重 口味 的 小伙伴 有 口福 啦 哈哈 哈哈 哈哈 围观 围观 威武 威武
2        一 下班 就 赶着 回家 和 路哥 玩 难怪 路哥 经常 一幅 非 我 不娶 的 脸 哈哈 ...
3                                    原来 我 背负着 的 重任 泪 转发 微博
4        长白山 国际 度假区 即将 开业 想 泡温泉 看 林海 松涛 上次 去 天池 上冻 了 那 ...
                               ...                        
92732    火车 上 碰见 这种 占 别人 位置 还 理直气壮 的 王八蛋 真的 心累 报 了 乘警 半...
92733    倒霉 催 的 坐 上 晚点 一个多 小时 的 汽车 在 高速 上 司机 叔叔 说 他 没 听...
92734       急诊 第一天 上班 说不上 的 心累 这 漫长 的 两个 月 如何 过 啊 悲伤 悲伤 悲伤
92735    我 每个 月 供 着 爱奇艺 网易 云 快 连 芒果 TV 包图 网 这些 都 是 大 企业...
92736    其实 这种 两家 毫无 交集 因为 一个 粉丝 单方面 撩骚 就 撕成 一片 上升 成 两家...
Name: text, Length: 92737, dtype: object

In [6]:
# word2vec要求的输入格式: list(word)
wv_input = df_train['text'].map(lambda s: s.split(" "))   # [for w in s.split(" ") if w not in stopwords]
wv_input.head()          

0                                                  [怒]
1    [good, 重, 口味, 的, 小伙伴, 有, 口福, 啦, 哈哈, 哈哈, 哈哈, 围观...
2    [一, 下班, 就, 赶着, 回家, 和, 路哥, 玩, 难怪, 路哥, 经常, 一幅, 非...
3                       [原来, 我, 背负着, 的, 重任, 泪, 转发, 微博]
4    [长白山, 国际, 度假区, 即将, 开业, 想, 泡温泉, 看, 林海, 松涛, 上次, ...
Name: text, dtype: object

In [None]:
from gensim import models

# Word2Vec
word2vec = models.Word2Vec(wv_input, 
                           vector_size=100,   # 词向量维度
                           min_count=1,      # 最小词频, 因为数据量较小, 这里卡1
                           epochs=1000)      # 迭代轮次

In [8]:
word2vec.save('model/word/word2vec_100k.model')

In [9]:
import time
t1 = time.time()
word2vec = models.Word2Vec.load('model/word/word2vec_100k.model')
t2 = time.time()
print(".molde load time %.4f"%(t2-t1))

.molde load time 1.2917


In [10]:
word2vec.wv.most_similar("你")

[('我', 0.916471004486084),
 ('你们', 0.864096462726593),
 ('他', 0.8077366352081299),
 ('她', 0.787203848361969),
 ('的', 0.7812329530715942),
 ('自己', 0.7535796165466309),
 ('他们', 0.7533543705940247),
 ('了', 0.7523051500320435),
 ('我们', 0.7461169958114624),
 ('都', 0.7347392439842224)]

In [11]:
word2vec.wv.most_similar("灾难")

[('发生', 0.4801124930381775),
 ('翻身', 0.41674289107322693),
 ('招来', 0.40121641755104065),
 ('博弈', 0.3984716832637787),
 ('肆虐', 0.3806152939796448),
 ('丧是', 0.38021472096443176),
 ('我梦到', 0.37822914123535156),
 ('鹿岛', 0.371891587972641),
 ('碰到', 0.3708266615867615),
 ('绝情', 0.36689651012420654)]

In [12]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [13]:
# 超参数
learning_rate = 5e-4
input_size = 768
num_epoches = 20
batch_size = 128
embed_size = 100
hidden_size = 64
num_layers = 2

In [45]:
# 数据集
class MyDataset(Dataset):
    def __init__(self, df):
        self.data = []
        self.label = df["label"].tolist()
        for s in df["text"].tolist():
            vectors = []
            for w in s.split(" "):
                if w in word2vec.wv.key_to_index:
                    vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
                else:
                    vectors.append([0]*embed_size)
                
            vectors = torch.Tensor(vectors)
            self.data.append(vectors)
    
    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.label)

def collate_fn(data):
    """
    :param data: 第0维：data，第1维：emo情绪词，第2维：label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x = [i[0] for i in data]
    y = [i[1] for i in data]

    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, torch.tensor(y, dtype=torch.float32), torch.tensor(data_length)


# 训练集
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# 测试集
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

In [46]:
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size, hidden_size, num_layers).to(device)   

In [47]:
from sklearn import metrics

# 在测试集效果检验
def test(epoch,num_epoches):
    y_pred, y_true = [], []

    with torch.no_grad():
        for x,  labels, lengths in test_loader:
            x = x.to(device)
            outputs = lstm(x, lengths)         # 前向传播
            outputs = outputs.view(-1)          # 将输出展平
            y_pred.append(outputs)
            y_true.append(labels)

    y_prob = torch.cat(y_pred).cpu()
    y_true = torch.cat(y_true).cpu()
    y_pred = y_prob.clone()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_prob)
    print('Epoch {}/{}, P {:.4f}, R {:.4f}, F1 {:.4f}, AUC {:.4f}'.format(
        epoch, num_epoches, precision.mean(), recall.mean(), f1.mean(), roc_auc.mean()))
    writer.add_scalar('precision', precision.mean(), epoch)
    writer.add_scalar('recall', recall.mean(), epoch)
    writer.add_scalar('f1score', f1.mean(), epoch)
    writer.add_scalar('auc', roc_auc.mean(), epoch)

In [48]:
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [49]:
# 迭代训练
for epoch in range(num_epoches):
    total_loss = 0
    for i, (x, labels, lengths) in enumerate(train_loader):
        x = x.to(device)
        labels = labels.to(device)
        outputs = lstm(x, lengths)          # 前向传播
        logits = outputs.view(-1)           # 将输出展平
        loss = criterion(logits, labels)    # loss计算
        total_loss += loss
        optimizer.zero_grad()               # 梯度清零
        loss.backward(retain_graph=True)    # 反向传播，计算梯度
        optimizer.step()                    # 梯度更新
        if (i+1) % 500 == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0
    writer.add_scalar('train_loss', total_loss, epoch)
    # test
    test(epoch,num_epoches)
    
    # save model
    model_path = "./model/classification/lstm/{}/lstm_100k_{}.model".format(today,epoch+1)
    torch.save(lstm, model_path)
    print("saved model: ", model_path)
writer.close()

epoch:1, step:500, loss:34.54502487182617
Epoch 0/20, P 0.5295, R 0.5283, F1 0.5230, AUC 0.5391
saved model:  ./model/classification/lstm/20220228/lstm_100k_1.model
epoch:2, step:500, loss:34.334102630615234
Epoch 1/20, P 0.5355, R 0.5330, F1 0.5251, AUC 0.5407
saved model:  ./model/classification/lstm/20220228/lstm_100k_2.model
epoch:3, step:500, loss:34.133697509765625
Epoch 2/20, P 0.5260, R 0.5260, F1 0.5259, AUC 0.5382
saved model:  ./model/classification/lstm/20220228/lstm_100k_3.model
epoch:4, step:500, loss:33.98299789428711
Epoch 3/20, P 0.5295, R 0.5276, F1 0.5195, AUC 0.5401
saved model:  ./model/classification/lstm/20220228/lstm_100k_4.model
epoch:5, step:500, loss:33.75844192504883
Epoch 4/20, P 0.5352, R 0.5327, F1 0.5252, AUC 0.5395
saved model:  ./model/classification/lstm/20220228/lstm_100k_5.model
epoch:6, step:500, loss:33.583614349365234
Epoch 5/20, P 0.5346, R 0.5323, F1 0.5250, AUC 0.5392
saved model:  ./model/classification/lstm/20220228/lstm_100k_6.model
epoch:7

KeyboardInterrupt: 

In [None]:
from utils import processing

strs = ["我想说我会爱你多一点点", "日有所思梦感伤"]
strs = ["我一声龙啸凌云志！热血燃冬扶摇起！","这么大个人了，不嫌丢人？一点素质也没有"]

data = []
for s in strs:
    vectors = []
    print(processing(s).split(" "))
    for w in processing(s).split(" "):
        if w in word2vec.wv.key_to_index:
            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
    vectors = torch.Tensor(vectors)
    data.append(vectors)

x, _, lengths = collate_fn(list(zip(data, [-1] * len(strs))))
with torch.no_grad():
    x = x.to(device)
    outputs = lstm(x, lengths)       # 前向传播
    outputs = outputs.view(-1)          # 将输出展平
outputs.cpu()[0].item()

In [None]:
STOP

### 验证

In [41]:
import torch
import pandas as pd
from utils import processing

# 超参数
embed_size = 100
hidden_size = 64
num_layers = 2
# 需先搭建网络模型model
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size, hidden_size, num_layers).to(device)   
# 然后通过下面的语句加载参数
# lstm_new = model.load_state_dict(torch.load('model/classification/lstm_10k_7.model'))
lstm=torch.load('model/classification/lstm_10k_7.model')

In [None]:
import time
from gensim import models
t1 = time.time()
word2vec = models.Word2Vec.load('model/word/word2vec_10k.model')
t2 = time.time()
print(".molde load time %.4f"%(t2-t1))

In [None]:
def collate_fn(data):
    """
    :param data: 第0维：data，第1维：label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x = [i[0] for i in data]
    y = [i[1] for i in data]

    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, torch.tensor(y, dtype=torch.float32), torch.tensor(data_length)

In [None]:
def calculate_emo_score(text):
    vectors= []
    for w in processing(text).split(" "):
        if w in word2vec.wv.key_to_index:
            vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
    vectors = torch.Tensor(vectors)
    x, _, lengths = collate_fn(list(zip([vectors], [-1])))
    if lengths[0].item()<1:
        return None
    with torch.no_grad():
        x = x.to(device)
        outputs = lstm(x, lengths)       # 前向传播
        outputs = outputs.view(-1)    # 将输出展平
        result_score = outputs.cpu()[0].item()      
    return result_score

# 验证
crawl_result = pd.read_csv('../Dataset/crawl_result.csv')
crawl_result['emo_score'] = crawl_result['texts'].apply(lambda x: calculate_emo_score(x))
crawl_result.to_csv('result/crawl_result_emo.csv', index=False, encoding='utf-8-sig')