In [1]:
import transformers
from transformers import BertModel, BertTokenizer, AdamW, get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
%matplotlib inline
%config InlineBackend.figure_format='retina'
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
rcParams['figure.figsize'] = 12, 8
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
MODEL_NAME = 'autodl-tmp/ernie-1.0-base-zh' #模型的位置
PATH = 'autodl-tmp/bilstm_model.pth'  # 定义模型保存路径
TRAIN_DATA_PATH = "autodl-tmp/train_data/all_train.csv"
TEST_DATA_PATH = "autodl-tmp/train_data/all_test.csv"
LABEL_DICT = {'happy':0, 'sad':1, 'neutral':2, 'fear':3, 'angry':4, 'surprise':5}# 标签映射表

In [3]:
# 数据进行token化处理, seq_length表示接受的句子最大长度
def convert_text_to_token(tokenizer, sentence, seq_length):
    tokens = tokenizer.tokenize(sentence) # 句子转换成token
    tokens = ["[CLS]"] + tokens + ["[SEP]"] # token前后分别加上[CLS]和[SEP]
    # 生成 input_id, seg_id, att_mask
    ids1 = tokenizer.convert_tokens_to_ids(tokens)
    types = [0] * len(ids1)
    masks = [1] * len(ids1)
    # 句子长度统一化处理：截断或补全至seq_length
    if len(ids1) < seq_length: #补全
        ids = ids1 + [0] * (seq_length - len(ids1)) #[0]是因为词表中PAD的索引是0
        types = types + [1] * (seq_length - len(ids1))  # [1]表明该部分为PAD
        masks = masks + [0] * (seq_length - len(ids1)) # PAD部分，attention mask置为[0]
    else: # 截断
        ids = ids1[:seq_length]
        types = types[:seq_length]
        masks = masks[:seq_length]
    assert len(ids) == len(types) == len(masks)
    return ids, types, masks

# 构造训练集和测试集的DataLoader
def genDataLoader(is_train):
    if is_train: # 构造训练集
        path = TRAIN_DATA_PATH
    else: # 构造测试集
        path = TEST_DATA_PATH
    ### json
#     with open(path, encoding='utf8') as f:
#         data = json.load(f)
    ### csv
    data = pd.read_csv(path)
    # data.dropna(axis=0,subset = ["cleaned_content_without_seg"])   # 丢弃‘cleaned_content_without_seg’这两列中有缺失值的行  
    
    ids_pool = []
    types_pool = []
    masks_pool = []
    target_pool = []
    count = 0
    # 遍历构造每条数据
    for each1, each2 in zip(data['文本'], data['情绪标签']): # json: for each in data:
        cur_ids, cur_type, cur_mask = convert_text_to_token(TOKENIZER, str(each1), seq_length = SEQ_LENGTH) # json: each['cleaned_content']
        ids_pool.append(cur_ids)
        types_pool.append(cur_type)
        masks_pool.append(cur_mask)
        cur_target = LABEL_DICT[each2] # json: each['label']
        target_pool.append([cur_target])
        count += 1
        if count % 1000 == 0:
            print('已处理{}条'.format(count))
            # break
    # 构造loader
    data_gen = TensorDataset(torch.LongTensor(np.array(ids_pool)),
                  torch.LongTensor(np.array(types_pool)),
                  torch.LongTensor(np.array(masks_pool)),
                  torch.LongTensor(np.array(target_pool)))
    # print('shit')
    sampler = RandomSampler(data_gen)
    loader = DataLoader(data_gen, sampler=sampler, batch_size=BATCH_SIZE)
    return loader

In [4]:
import torch
from transformers import BertTokenizer, BertModel
import torch.nn as nn
import json
import numpy as np
import torch
# from build_data import genDataLoader
from transformers import BertModel
import torch.nn as nn
from tqdm import tqdm # 注意不要直接 import tqdm
import torch.nn.functional as F

# 复用模型结构
class Model(nn.Module):
    def __init__(self, num_classes):
        super(Model, self).__init__()
        self.bert = BertModel.from_pretrained(MODEL_NAME)  # /roberta-wwm-ext pretrain/
        for param in self.bert.parameters():
            param.requires_grad = True  # 所有参数求梯度
        self.fc = nn.Linear(768, num_classes)   # 768 -> 6
    def forward(self, x, token_type_ids, attention_mask):
        context = x  # 输入的句子
        types = token_type_ids
        mask = attention_mask  # 对padding部分进行mask，和句子相同size，padding部分用0表示，如：[1, 1, 1, 1, 0, 0]
        _, pooled = self.bert(context, token_type_ids=types, attention_mask=mask)
        out = self.fc(pooled)   # 得到6分类概率
        return out


class BERT_BiLSTM(nn.Module):
    def __init__(self, hidden_dim, num_classes, n_layers, bidirectional=True, drop_prob=0.5):
        super(BERT_BiLSTM, self).__init__()
        
        self.num_classes = num_classes
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.bidirectional = bidirectional
        
        # Bert ----------------重点，bert模型需要嵌入到自定义模型里面
        self.bert = BertModel.from_pretrained(MODEL_NAME)
        for param in self.bert.parameters():
            param.requires_grad = True  # 所有参数求梯度
            
        # LSTM layers
        self.lstm = nn.LSTM(768, hidden_dim, n_layers, batch_first=True, bidirectional=bidirectional)
        
        # dropout layer
        self.dropout = nn.Dropout(drop_prob)
        
        # linear and sigmoid layers
        if bidirectional:
            self.fc = nn.Linear(hidden_dim * 2, num_classes)
        else:
            self.fc = nn.Linear(hidden_dim, num_classes)
        
    def forward(self, x, token_type_ids, attention_mask):
        batch_size = x.size(0)
        # 生成bert字向量
        x = self.bert(x)[0]  # bert 字向量
#         print("&&&&&&&&&&&&&&&&")
#         print(x.shape)

        # lstm_out
        # x = x.float()
        lstm_out, (hidden_last, cn_last) = self.lstm(x)
        # print(lstm_out.shape)   #[32,100,768]
        # print(hidden_last.shape)   #[4, 32, 384]
        # print(cn_last.shape)    #[4, 32, 384]

        # 修改 双向的需要单独处理
        if self.bidirectional:
            # 正向最后一层，最后一个时刻
            hidden_last_L = hidden_last[-2]
            # print(hidden_last_L.shape)  #[32, 384]
            # 反向最后一层，最后一个时刻
            hidden_last_R = hidden_last[-1]
            # print(hidden_last_R.shape)   #[32, 384]
            # 进行拼接
            hidden_last_out = torch.cat([hidden_last_L, hidden_last_R], dim=-1)
            # print(hidden_last_out.shape,'hidden_last_out')   #[32, 768]
        else:
            hidden_last_out = hidden_last[-1]  # [32, 384]

        # dropout and fully-connected layer
        out = self.dropout(hidden_last_out)
        # print(out.shape)    #[32,768]
        out = self.fc(out)

        return out

In [5]:
def train(model, device, train_loader, test_loader, optimizer):   # 训练模型
    model.train()
    best_acc = 0.0
    for epoch in range(1, NUM_EPOCHS + 1):  # 3个epoch
        batch_idx = 0
        for (x1, x2, x3, y) in tqdm(train_loader):
            x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
            y_pred = model(x1, token_type_ids=x2, attention_mask=x3)  # 得到预测结果
            optimizer.zero_grad()             # 梯度清零
            loss = F.cross_entropy(y_pred, y.squeeze())  # 得到loss
            # accu_loss += loss.item() # 计算累积loss
            loss.backward()
            optimizer.step()
            batch_idx += 1
            if(batch_idx + 1) % 100 == 0:    # 打印loss
                print('Train Epoch: {} [{}/{} ({:.2f}%)]\tLoss: {:.6f}'.format(epoch, (batch_idx+1) * len(x1),
                  len(train_loader.dataset),
                  100. * batch_idx / len(train_loader),
                  # accu_loss / batch_idx))
                  loss.item()))  # 记得为loss.item()
        acc = test(model, device, test_loader) # 每个epoch结束后评估一次测试集精度
        if best_acc < acc:
            best_acc = acc
            torch.save(model.state_dict(), PATH)  # 保存最优模型
def test(model, device, test_loader):    # 测试模型, 得到测试集评估结果
#     model.eval()
    test_loss = 0.0
    acc = 0
    for (x1, x2, x3, y) in tqdm(test_loader):
        x1, x2, x3, y = x1.to(device), x2.to(device), x3.to(device), y.to(device)
        with torch.no_grad():
            y_ = model(x1, token_type_ids=x2, attention_mask=x3)
        test_loss += F.cross_entropy(y_, y.squeeze())
        pred = y_.max(-1, keepdim=True)[1]   # .max(): 2输出，分别为最大值和最大值的index
        acc += pred.eq(y.view_as(pred)).sum().item()    # 记得加item()
    test_loss /= len(test_loader)
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)'.format(
          test_loss, acc, len(test_loader.dataset),
          100. * acc / len(test_loader.dataset)))
    return acc / len(test_loader.dataset)

In [6]:
import json
from transformers import BertTokenizer
from torch.utils.data import BatchSampler, TensorDataset, DataLoader, RandomSampler
import torch
import numpy as np
import pandas as pd
SEQ_LENGTH = 128
BATCH_SIZE = 64
TOKENIZER = BertTokenizer.from_pretrained(MODEL_NAME) #模型所在的目录名称

train_data = genDataLoader(True)
print('训练集处理完毕')
test_data = genDataLoader(False)
print('测试集处理完毕')

# 开始训练
MODEL1 = BERT_BiLSTM(hidden_dim = 384, num_classes = 6, n_layers = 2, bidirectional=True, drop_prob=0.5) # 指定分类类别 hidden_dim, num_classes, n_layers, bidirectional=True, drop_prob=0.5
print('原始模型加载完毕')
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
MODEL = MODEL1.to(DEVICE)
OPTIMIZER = torch.optim.Adam(MODEL.parameters(), lr=2e-5) # 优化器
NUM_EPOCHS = 3 # epoch

train(MODEL, DEVICE, train_data, test_data, OPTIMIZER)

已处理1000条
已处理2000条
已处理3000条
已处理4000条
已处理5000条
已处理6000条
已处理7000条
已处理8000条
已处理9000条
已处理10000条
已处理11000条
已处理12000条
已处理13000条
已处理14000条
已处理15000条
已处理16000条
已处理17000条
已处理18000条
已处理19000条
已处理20000条
已处理21000条
已处理22000条
已处理23000条
已处理24000条
已处理25000条
已处理26000条
已处理27000条
已处理28000条
已处理29000条
已处理30000条
已处理31000条
已处理32000条
已处理33000条
已处理34000条
已处理35000条
已处理36000条
训练集处理完毕
已处理1000条
已处理2000条
已处理3000条
已处理4000条
已处理5000条
测试集处理完毕


Some weights of the model checkpoint at autodl-tmp/ernie-1.0-base-zh were not used when initializing BertModel: ['ernie.encoder.layer.11.attention.output.dense.bias', 'ernie.encoder.layer.0.attention.self.query.bias', 'ernie.encoder.layer.9.attention.output.dense.weight', 'ernie.encoder.layer.8.attention.self.value.weight', 'ernie.encoder.layer.5.attention.self.key.weight', 'ernie.encoder.layer.2.attention.self.value.bias', 'ernie.encoder.layer.8.attention.self.value.bias', 'ernie.encoder.layer.8.attention.self.query.bias', 'ernie.encoder.layer.1.output.dense.weight', 'ernie.encoder.layer.2.output.dense.bias', 'ernie.encoder.layer.3.output.dense.bias', 'ernie.encoder.layer.3.attention.self.key.weight', 'ernie.encoder.layer.5.attention.self.query.weight', 'ernie.encoder.layer.7.attention.self.value.weight', 'ernie.encoder.layer.4.attention.self.key.bias', 'ernie.encoder.layer.6.attention.self.query.bias', 'ernie.encoder.layer.6.output.dense.weight', 'ernie.encoder.layer.4.attention.self

原始模型加载完毕


 18%|█▊        | 100/569 [00:13<01:06,  7.10it/s]



 35%|███▌      | 200/569 [00:27<00:51,  7.19it/s]



 53%|█████▎    | 300/569 [00:41<00:36,  7.30it/s]



 70%|███████   | 400/569 [00:55<00:23,  7.26it/s]



 88%|████████▊ | 500/569 [01:09<00:09,  7.21it/s]



100%|██████████| 569/569 [01:18<00:00,  7.22it/s]
100%|██████████| 79/79 [00:03<00:00, 23.55it/s]



Test set: Average loss: 1.2960, Accuracy: 2592/5000 (51.84%)


 18%|█▊        | 100/569 [00:13<01:04,  7.26it/s]



 35%|███▌      | 200/569 [00:27<00:51,  7.19it/s]



 53%|█████▎    | 300/569 [00:41<00:37,  7.16it/s]



 70%|███████   | 400/569 [00:55<00:23,  7.20it/s]



 88%|████████▊ | 500/569 [01:09<00:09,  7.23it/s]



100%|██████████| 569/569 [01:18<00:00,  7.22it/s]
100%|██████████| 79/79 [00:03<00:00, 23.57it/s]



Test set: Average loss: 1.0910, Accuracy: 2999/5000 (59.98%)


 18%|█▊        | 100/569 [00:13<01:06,  7.02it/s]



 35%|███▌      | 200/569 [00:27<00:50,  7.24it/s]



 53%|█████▎    | 300/569 [00:41<00:37,  7.22it/s]



 70%|███████   | 400/569 [00:55<00:23,  7.17it/s]



 88%|████████▊ | 500/569 [01:09<00:09,  7.32it/s]



100%|██████████| 569/569 [01:18<00:00,  7.21it/s]
100%|██████████| 79/79 [00:03<00:00, 23.57it/s]



Test set: Average loss: 0.9776, Accuracy: 3191/5000 (63.82%)


In [23]:
#开始预测
data_path='autodl-tmp/pred_data/wuhannomeaning.csv'
res_path='autodl-tmp/res_data_315/wuhannomeaning.csv'

# data_path='autodl-tmp/pred_data/shanghai.csv'
# res_path='autodl-tmp/res_data_315/shanghai.csv'

targrt='content'
data = pd.read_csv(data_path)
LABEL_DICT_new = dict(zip(LABEL_DICT.values(), LABEL_DICT.keys()))
print(data.shape)
data.dropna(subset=[targrt], inplace=True)
print(data.shape)

(72543, 14)
(72543, 14)


In [24]:
res = []
correct = 0
wrong = 0
count = 0
ids_pool = []
types_pool = []
masks_pool = []
target_pool = []
count = 0

import torch.nn.functional as F
names=list(LABEL_DICT_new.values())
for id_item,each in data.iterrows():
    cur_sentence = each[targrt]
    ids = []
    types = []
    masks = []
    cur_ids, cur_type, cur_mask = convert_text_to_token(TOKENIZER, each[targrt], seq_length=SEQ_LENGTH)
    # ids.append(cur_ids)
    # types.append(cur_type)
    # masks.append(cur_mask)
    # cur_ids, cur_type, cur_mask = torch.LongTensor(np.array(ids)).to(DEVICE), torch.LongTensor(np.array(types)).to(DEVICE), torch.LongTensor(np.array(masks)).to(DEVICE) # 数据构造成tensor形式
    ids_pool.append(cur_ids)
    types_pool.append(cur_type)
    masks_pool.append(cur_mask)
    count += 1
    if count % 1000 == 0:
        print('已处理{}条'.format(count))
data_gen = TensorDataset(torch.LongTensor(np.array(ids_pool)),
                  torch.LongTensor(np.array(types_pool)),
                  torch.LongTensor(np.array(masks_pool)))
loader = DataLoader(data_gen, batch_size=BATCH_SIZE,shuffle=False)
res_df=pd.DataFrame([])
MODEL.eval()
names=list(LABEL_DICT_new.values())

col=names+['flag']
res_df=pd.DataFrame(columns=col)
for (x1, x2, x3) in tqdm(loader):
    x1, x2, x3 = x1.to(device), x2.to(device), x3.to(device)
    with torch.no_grad():
        y_ = MODEL(x1, token_type_ids=x2, attention_mask=x3)
        probabilities = F.softmax(y_, dim=-1)
        pred = probabilities.max(-1, keepdim=True)[1]  # 取最大值
        probabilities=probabilities.cuda().data.cpu().numpy()
        pred=pred.cuda().data.cpu().numpy()
        labs=[names[x[0]] for x in pred]
        
        # 将列表转换为 Pandas DataFrame
        
        df_A = pd.DataFrame(probabilities)
        df_B = pd.DataFrame(labs)

        # 将 B 作为最后一列合并到 A
        result = pd.concat([df_A, df_B], axis=1)
        result.columns =col
        
        res_df=pd.concat([res_df,result], axis=0, ignore_index=True)
res_data=pd.concat([data,res_df], axis=1)
res_data.to_csv(res_path,encoding='utf_8_sig')

已处理1000条
已处理2000条
已处理3000条
已处理4000条
已处理5000条
已处理6000条
已处理7000条
已处理8000条
已处理9000条
已处理10000条
已处理11000条
已处理12000条
已处理13000条
已处理14000条
已处理15000条
已处理16000条
已处理17000条
已处理18000条
已处理19000条
已处理20000条
已处理21000条
已处理22000条
已处理23000条
已处理24000条
已处理25000条
已处理26000条
已处理27000条
已处理28000条
已处理29000条
已处理30000条
已处理31000条
已处理32000条
已处理33000条
已处理34000条
已处理35000条
已处理36000条
已处理37000条
已处理38000条
已处理39000条
已处理40000条
已处理41000条
已处理42000条
已处理43000条
已处理44000条
已处理45000条
已处理46000条
已处理47000条
已处理48000条
已处理49000条
已处理50000条
已处理51000条
已处理52000条
已处理53000条
已处理54000条
已处理55000条
已处理56000条
已处理57000条
已处理58000条
已处理59000条
已处理60000条
已处理61000条
已处理62000条
已处理63000条
已处理64000条
已处理65000条
已处理66000条
已处理67000条
已处理68000条
已处理69000条
已处理70000条
已处理71000条
已处理72000条


100%|██████████| 1134/1134 [00:47<00:00, 23.93it/s]
