In [2]:
from utils import load_corpus, stopwords
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import os
import datetime

In [3]:
TRAIN_PATH = "../weibo2018/train.txt"
TEST_PATH = "../weibo2018/test.txt"

# 分别加载训练集和测试集
train_data = load_corpus(TRAIN_PATH)
test_data = load_corpus(TEST_PATH)

today = datetime.date.today().strftime('%Y%m%d')
if not os.path.exists('./evaluation'):
    os.makedirs('./evaluation')
writer = SummaryWriter(log_dir=os.path.join('./evaluation', today))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.625 seconds.
Prefix dict has been built successfully.


In [4]:
os.path.join('/evaluation', today)

'/evaluation/20220208'

In [5]:
import pandas as pd

df_train = pd.DataFrame(train_data, columns=["text", "label"])
df_test = pd.DataFrame(test_data, columns=["text", "label"])
df_train.head()

Unnamed: 0,text,label
0,书中 自有 黄金屋 书中 自有 颜如玉 沿着 岁月 的 长河 跋涉 或是 风光旖旎 或是 姹...,1
1,这是 英超 被 黑 的 最惨 的 一次 二哈 二哈 十几年来 中国 只有 孙继海 董方卓 郑...,0
2,中国 远洋 海运 集团 副总经理 俞曾 港 月 日 在 上 表示 中央 企业 走 出去 是 ...,1
3,看 流星花园 其实 也 还好 啦 现在 的 观念 以及 时尚 眼光 都 不一样 了 或许 十...,1
4,汉武帝 的 罪己 诏 的 真实性 尽管 存在 着 争议 然而 轮台 罪己 诏 作为 中国 历...,1


In [6]:
# word2vec要求的输入格式: list(word)
wv_input = df_train['text'].map(lambda s: s.split(" "))   # [for w in s.split(" ") if w not in stopwords]
wv_input.head()          

0    [书中, 自有, 黄金屋, 书中, 自有, 颜如玉, 沿着, 岁月, 的, 长河, 跋涉, ...
1    [这是, 英超, 被, 黑, 的, 最惨, 的, 一次, 二哈, 二哈, 十几年来, 中国,...
2    [中国, 远洋, 海运, 集团, 副总经理, 俞曾, 港, 月, 日, 在, 上, 表示, ...
3    [看, 流星花园, 其实, 也, 还好, 啦, 现在, 的, 观念, 以及, 时尚, 眼光,...
4    [汉武帝, 的, 罪己, 诏, 的, 真实性, 尽管, 存在, 着, 争议, 然而, 轮台,...
Name: text, dtype: object

In [7]:
from gensim import models

# Word2Vec
word2vec = models.Word2Vec(wv_input, 
                           vector_size=128,   # 词向量维度
                           min_count=1,      # 最小词频, 因为数据量较小, 这里卡1
                           epochs=1000)      # 迭代轮次



In [8]:
word2vec.wv.most_similar("你")

[('我', 0.9512325525283813),
 ('他', 0.8914921879768372),
 ('的', 0.8785059452056885),
 ('自己', 0.8765919208526611),
 ('了', 0.8479840159416199),
 ('都', 0.8261860609054565),
 ('人', 0.8194601535797119),
 ('是', 0.8069769740104675),
 ('也', 0.7970981001853943),
 ('就', 0.7916268110275269)]

In [None]:
word2vec.wv.most_similar("汽车")

[('燃料电池', 0.4654559791088104),
 ('新能源', 0.4610259532928467),
 ('王哲', 0.45301851630210876),
 ('长椅', 0.4285859763622284),
 ('加增', 0.42834678292274475),
 ('怨气', 0.42726951837539673),
 ('挖掘机', 0.4267406463623047),
 ('柬埔寨', 0.4071280360221863),
 ('轮船', 0.4057103395462036),
 ('合婚', 0.40410640835762024)]

In [10]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence,pad_packed_sequence
from torch.utils.data import Dataset, DataLoader

device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [11]:
# 超参数
learning_rate = 5e-6
input_size = 768
num_epoches = 100
batch_size = 128
embed_size = 128
hidden_size = 64
num_layers = 2

In [12]:
# 数据集
class MyDataset(Dataset):
    def __init__(self, df):
        self.data = []
        self.label = df["label"].tolist()
        for s in df["text"].tolist():
            vectors = []
            for w in s.split(" "):
                if w in word2vec.wv.key_to_index:
                    vectors.append(word2vec.wv[w])   # 将每个词替换为对应的词向量
            vectors = torch.Tensor(vectors)
            self.data.append(vectors)
    
    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.label)

def collate_fn(data):
    """
    :param data: 第0维：data，第1维：label
    :return: 序列化的data、记录实际长度的序列、以及label列表
    """
    data.sort(key=lambda x: len(x[0]), reverse=True) # pack_padded_sequence要求要按照序列的长度倒序排列
    data_length = [len(sq[0]) for sq in data]
    x = [i[0] for i in data]
    y = [i[1] for i in data]
    data = pad_sequence(x, batch_first=True, padding_value=0)   # 用RNN处理变长序列的必要操作
    return data, torch.tensor(y, dtype=torch.float32), torch.tensor(data_length)


# 训练集
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

# 测试集
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, collate_fn=collate_fn, shuffle=True)

In [13]:
# 网络结构
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True, bidirectional=True)
        self.fc = nn.Linear(hidden_size * 2, 1)  # 双向, 输出维度要*2
        self.sigmoid = nn.Sigmoid()

    def forward(self, x, lengths):
        h0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)  # 双向, 第一个维度要*2
        c0 = torch.zeros(self.num_layers * 2, x.size(0), self.hidden_size).to(device)
        
        packed_input = torch.nn.utils.rnn.pack_padded_sequence(input=x, lengths=lengths, batch_first=True)
        
        packed_out, (h_n, h_c) = self.lstm(packed_input, (h0, c0))

        lstm_out = torch.cat([h_n[-2], h_n[-1]], 1)  # 双向, 所以要将最后两维拼接, 得到的就是最后一个time step的输出
        out = self.fc(lstm_out)
        out = self.sigmoid(out)
        return out

lstm = LSTM(embed_size, hidden_size, num_layers).to(device)   

In [14]:
from sklearn import metrics

# 在测试集效果检验
def test(epoch,num_epoches):
    y_pred, y_true = [], []

    with torch.no_grad():
        for x, labels, lengths in test_loader:
            x = x.to(device)
            outputs = lstm(x, lengths)         # 前向传播
            outputs = outputs.view(-1)          # 将输出展平
            y_pred.append(outputs)
            y_true.append(labels)

    y_prob = torch.cat(y_pred).cpu()
    y_true = torch.cat(y_true).cpu()
    y_pred = y_prob.clone()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_prob)
    # print(metrics.classification_report(y_true, y_pred))
    # print("测试集准确率:", metrics.accuracy_score(y_true, y_pred))
    # print("测试集AUC:", metrics.roc_auc_score(y_true, y_prob) )
    print('f1',f1)
    print('Epoch {}/{}, P {:.4f}, R {:.4f}, F {:.4f}, AUC {:.4f}'.format(
        epoch, num_epoches, precision.mean(), recall.mean(), f1.mean(), roc_auc.mean()))
    writer.add_scalar('precision', precision.mean(), epoch)
    writer.add_scalar('recall', recall.mean(), epoch)
    writer.add_scalar('f1score', f1.mean(), epoch)
    writer.add_scalar('auc', roc_auc.mean(), epoch)

In [15]:
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)

In [16]:
# 迭代训练
for epoch in range(num_epoches):
    total_loss = 0
    for i, (x, labels, lengths) in enumerate(train_loader):
        x = x.to(device)
        labels = labels.to(device)
        outputs = lstm(x, lengths)          # 前向传播
        logits = outputs.view(-1)           # 将输出展平
        loss = criterion(logits, labels)    # loss计算
        total_loss += loss
        optimizer.zero_grad()               # 梯度清零
        loss.backward(retain_graph=True)    # 反向传播，计算梯度
        optimizer.step()                    # 梯度更新
        if (i+1) % 50 == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0
    writer.add_scalar('train_loss', total_loss, epoch)
    # test
    test(epoch,num_epoches)
    
    # save model
    model_path = "./model/lstm_{}.model".format(epoch+1)
    torch.save(lstm, model_path)
    print("saved model: ", model_path)
writer.close()

epoch:1, step:50, loss:3.4944095611572266
f1 [0.47222222 0.02840909]
Epoch 0/500, P 0.5123, R 0.5008, F 0.2503, AUC 0.5345
saved model:  ./model/lstm_1.model
epoch:2, step:50, loss:3.482086658477783
f1 [0.47113885 0.05571031]
Epoch 1/500, P 0.5125, R 0.5016, F 0.2634, AUC 0.5528
saved model:  ./model/lstm_2.model
epoch:3, step:50, loss:3.475788116455078
f1 [0.46885246 0.16923077]
Epoch 2/500, P 0.5238, R 0.5091, F 0.3190, AUC 0.5708
saved model:  ./model/lstm_3.model
epoch:4, step:50, loss:3.468914747238159
f1 [0.48114901 0.3476298 ]
Epoch 3/500, P 0.5595, R 0.5439, F 0.4144, AUC 0.5852
saved model:  ./model/lstm_4.model
epoch:5, step:50, loss:3.4595234394073486
f1 [0.48275862 0.49704142]
Epoch 4/500, P 0.5649, R 0.5665, F 0.4899, AUC 0.5990
saved model:  ./model/lstm_5.model
epoch:6, step:50, loss:3.4496407508850098
f1 [0.47727273 0.58928571]
Epoch 5/500, P 0.5679, R 0.5778, F 0.5333, AUC 0.6122
saved model:  ./model/lstm_6.model
epoch:7, step:50, loss:3.4424641132354736
f1 [0.4536082