In [1]:
from utils import load_corpus, stopwords, processing, processing_bert
import numpy as np
from torch.utils.tensorboard.writer import SummaryWriter
import os
import datetime
import pandas as pd

### 加载数据集

In [2]:
today = datetime.date.today().strftime('%Y%m%d')
if not os.path.exists('./evaluation-bert'):
    os.makedirs('./evaluation-bert')
writer = SummaryWriter(log_dir=os.path.join('./evaluation-bert', today))

In [3]:
# 分别加载训练集和测试集
df_train=pd.read_csv('../Dataset/weibo_senti_bert_train.csv') 
df_test=pd.read_csv('../Dataset/weibo_senti_bert_test.csv') 

In [4]:
os.path.join('/evaluation-bert', today)

'/evaluation-bert/20220218'

### 加载Bert

In [5]:
import os
from transformers import BertTokenizer, BertModel

os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"    # 在我的电脑上不加这一句, bert模型会报错
MODEL_PATH = "../WeiboSentiment/model/chinese_wwm_pytorch"     # 下载地址见 https://github.com/ymcui/Chinese-BERT-wwm

### 神经网络

In [6]:
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader

device = "cuda:1" if torch.cuda.is_available() else "cpu"

In [7]:
# 加载
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)   # 分词器
bert = BertModel.from_pretrained(MODEL_PATH).to(device)            # 模型

Some weights of the model checkpoint at ../WeiboSentiment/model/chinese_wwm_pytorch were not used when initializing BertModel: ['cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
# 超参数
learning_rate = 1e-3
input_size = 768
num_epoches = 30
batch_size = 100
decay_rate = 0.9

In [9]:
# 数据集
class MyDataset(Dataset):
    def __init__(self, df):
        self.data = df["text"].tolist()
        self.label = df["label"].tolist()

    def __getitem__(self, index):
        data = self.data[index]
        label = self.label[index]
        return data, label

    def __len__(self):
        return len(self.label)

# 训练集
train_data = MyDataset(df_train)
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)

# 测试集
test_data = MyDataset(df_test)
test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=True)

In [10]:
# 网络结构
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc(x)
        out = self.sigmoid(out)
        return out

net = Net(input_size).to(device)

In [11]:
from sklearn import metrics

# 测试集效果检验
def test():
    y_pred, y_true = [], []

    with torch.no_grad():
        for words, labels in test_loader:
            tokens = tokenizer(words, padding=True)
            input_ids = torch.tensor(tokens["input_ids"]).to(device)
            attention_mask = torch.tensor(tokens["attention_mask"]).to(device)
            last_hidden_states = bert(input_ids, attention_mask=attention_mask)
            bert_output = last_hidden_states[0][:, 0]
            outputs = net(bert_output)          # 前向传播
            outputs = outputs.view(-1)          # 将输出展平
            y_pred.append(outputs)
            y_true.append(labels)

    y_prob = torch.cat(y_pred).cpu()
    y_true = torch.cat(y_true).cpu()
    y_pred = y_prob.clone()
    y_pred[y_pred > 0.5] = 1
    y_pred[y_pred <= 0.5] = 0
    
    # print(metrics.classification_report(y_true, y_pred))
    # print("准确率:", metrics.accuracy_score(y_true, y_pred))
    # print("AUC:", metrics.roc_auc_score(y_true, y_prob) )
    precision, recall, f1, _ = metrics.precision_recall_fscore_support(y_true, y_pred)
    roc_auc = metrics.roc_auc_score(y_true, y_prob)
    print('Epoch {}/{}, P {:.4f}, R {:.4f}, F1 {:.4f}, AUC {:.4f}'.format(
        epoch, num_epoches, precision.mean(), recall.mean(), f1.mean(), roc_auc.mean()))

In [12]:
# 定义损失函数和优化器
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=decay_rate)

In [13]:
# 迭代训练
for epoch in range(num_epoches):
    total_loss = 0
    for i, (words, labels) in enumerate(train_loader):
        tokens = tokenizer(words, padding=True)
        input_ids = torch.tensor(tokens["input_ids"]).to(device)
        attention_mask = torch.tensor(tokens["attention_mask"]).to(device)
        labels = labels.float().to(device)
        with torch.no_grad():
            last_hidden_states = bert(input_ids, attention_mask=attention_mask)
            bert_output = last_hidden_states[0][:, 0]
        optimizer.zero_grad()               # 梯度清零
        outputs = net(bert_output)          # 前向传播
        logits = outputs.view(-1)           # 将输出展平
        loss = criterion(logits, labels)    # loss计算
        total_loss += loss
        loss.backward()                     # 反向传播，计算梯度
        optimizer.step()                    # 梯度更新
        if (i+1) % 500 == 0:
            print("epoch:{}, step:{}, loss:{}".format(epoch+1, i+1, total_loss/10))
            total_loss = 0
    
    # learning_rate decay
    scheduler.step()
    
    # test
    test()
    
    # save model
    model_path = "./model/classification/bert_dnn_{}.model".format(epoch+1)
    torch.save(net, model_path)
    print("saved model: ", model_path)

epoch:1, step:500, loss:18.69440269470215
epoch:1, step:1000, loss:15.497702598571777
epoch:1, step:1500, loss:14.89936351776123
Epoch 0/30, P 0.8812, R 0.8812, F1 0.8812, AUC 0.9544
saved model:  ./model/classification/bert_dnn_1.model
epoch:2, step:500, loss:14.426989555358887
epoch:2, step:1000, loss:14.13404369354248
epoch:2, step:1500, loss:14.374293327331543
Epoch 1/30, P 0.8876, R 0.8855, F1 0.8857, AUC 0.9569
saved model:  ./model/classification/bert_dnn_2.model
epoch:3, step:500, loss:14.180218696594238
epoch:3, step:1000, loss:13.816229820251465
epoch:3, step:1500, loss:13.99055004119873
Epoch 2/30, P 0.8877, R 0.8868, F1 0.8869, AUC 0.9582
saved model:  ./model/classification/bert_dnn_3.model
epoch:4, step:500, loss:13.778979301452637
epoch:4, step:1000, loss:13.76666259765625
epoch:4, step:1500, loss:13.744775772094727
Epoch 3/30, P 0.8892, R 0.8845, F1 0.8847, AUC 0.9589
saved model:  ./model/classification/bert_dnn_4.model
epoch:5, step:500, loss:13.810759544372559
epoch:

### 手动输入句子，判断情感倾向（1正/0负）

In [14]:
net = torch.load("./model/classification/bert_dnn_8.model")    # 训练过程中的巅峰时刻

In [15]:
s = ["华丽繁荣的城市、充满回忆的小镇、郁郁葱葱的山谷...", "突然就觉得人间不值得"]
tokens = tokenizer(s, padding=True)
input_ids = torch.tensor(tokens["input_ids"]).to(device)
attention_mask = torch.tensor(tokens["attention_mask"]).to(device)
last_hidden_states = bert(input_ids, attention_mask=attention_mask)
bert_output = last_hidden_states[0][:, 0]
outputs = net(bert_output)
outputs

tensor([[0.6120],
        [0.1450]], device='cuda:1', grad_fn=<SigmoidBackward>)

In [16]:
outputs.view(-1).cpu()[0].item() 

0.6120356917381287

In [17]:
s = ["今天天气真好", "今天天气特别特别棒"]
tokens = tokenizer(s, padding=True)
input_ids = torch.tensor(tokens["input_ids"]).to(device)
attention_mask = torch.tensor(tokens["attention_mask"]).to(device)
last_hidden_states = bert(input_ids, attention_mask=attention_mask)
bert_output = last_hidden_states[0][:, 0]
outputs = net(bert_output)
outputs

tensor([[0.9864],
        [0.9264]], device='cuda:1', grad_fn=<SigmoidBackward>)

## 评估（inference）

In [5]:
import torch
from torch import nn
from torch.nn.utils.rnn import pad_sequence
from utils import processing,processing_bert
import pandas as pd
import argparse
import os
import sys
import time
from gensim import models
from transformers import BertTokenizer, BertModel
device = "cuda:0" if torch.cuda.is_available() else "cpu"

In [3]:
class Net(nn.Module):
    def __init__(self, input_size):
        super(Net, self).__init__()
        self.fc = nn.Linear(input_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc(x)
        out = self.sigmoid(out)
        return out

In [7]:
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"   
MODEL_PATH = '/root/nas/WeiboSentiment/model/chinese_wwm_pytorch'     # 下载地址见 https://github.com/ymcui/Chinese-BERT-wwm
# 加载
tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)   # 分词器
bert = BertModel.from_pretrained(MODEL_PATH).to(device)            # 模型
# 超参数
input_size = 768
net = Net(input_size)
net = torch.load('/root/nas/chinese-sentiment-analysis/model/classification/bert_dnn_20.model')    # 训练过程中的巅峰时刻
net = net.to(device)

def calculate_emo_score_bert(text):
    text = processing_bert(text)
    if len(text)>510:
        text=text[:510]
    with torch.no_grad():
        tokens = tokenizer([text], padding=True)
        input_ids = torch.tensor(tokens["input_ids"]).to(device)
        attention_mask = torch.tensor(tokens["attention_mask"]).to(device)
        last_hidden_states = bert(input_ids, attention_mask=attention_mask)
        bert_output = last_hidden_states[0][:, 0]
        outputs = net(bert_output)
        result_score = outputs.view(-1).cpu()[0].item() 
    return result_score

    # 模型结果
    # 读取爬虫数据
crawl_result = pd.read_csv('/root/nas/Dataset/crawl_result.csv')
crawl_result['emo_score'] = crawl_result['texts'].apply(lambda x: calculate_emo_score_bert(x))
crawl_result.to_csv('crawl_result_emo_bert.csv', index=False, encoding='utf-8-sig')
print('finish label emotion score through bert model!!')

Some weights of the model checkpoint at /root/nas/WeiboSentiment/model/chinese_wwm_pytorch were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


finish label emotion score through bert model!!


In [None]:
 --output_path /root/nas/chinese-sentiment-analysis/result/crawl_result_emo_bert.csv