In [13]:
import os
import pandas as pd
import tensorflow as tf
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer
from transformers import BertModel
from pathlib import Path
from sklearn.metrics import roc_auc_score, f1_score, recall_score, precision_score, precision_recall_curve
import matplotlib.pyplot as plt
# Directory where model checkpoints are stored.
model_dir = Path("./bert_test_checkpoints")
# Create the directory (and any missing parents) when it does not exist yet.
# `exist_ok=True` removes the check-then-create race of the previous
# `os.path.exists` test, and the call returns None, so the cell no longer
# displays a stray `''` value.
model_dir.mkdir(parents=True, exist_ok=True)


''

In [14]:
# Load the tokenizer and the pretrained encoder from the local "./ernie" directory.
tokenizer = BertTokenizer.from_pretrained("./ernie")
bert_model = BertModel.from_pretrained("./ernie")

# Load the raw train / test CSV files.
train_data_first = pd.read_csv("./data/train.news.csv")
test_data_first = pd.read_csv("./data/test.feature.csv")

# Keep only the columns used downstream. `.copy()` makes these frames
# independent of the raw ones, so the column assignments below never write
# through a slice (and cannot trigger SettingWithCopyWarning).
train_data_second = train_data_first.loc[:, ['Title', 'label']].copy()
test_data_second = test_data_first.loc[:, ['Title']].copy()
# Test rows get a 1-based id derived from the row index.
test_data_second['id'] = test_data_second.index + 1

# Replace missing titles with the empty string.
train_data_second['Title'] = train_data_second['Title'].fillna('')
test_data_second['Title'] = test_data_second['Title'].fillna('')

# Shuffle and hold out 1/8 of the labelled rows for validation (train:valid = 7:1).
# Sizes noted by the original author: train 9263, valid 1324, test 10141.
# A fixed seed keeps the split identical across kernel restarts; without it
# every run trained and validated on a different partition.
SPLIT_SEED = 42
valid_data = train_data_second.sample(frac=0.125, random_state=SPLIT_SEED)
train_data = train_data_second[~train_data_second.index.isin(valid_data.index)]
test_data = test_data_second


In [15]:
# Dataset wrapper around the train / valid / test DataFrames.
class MyDataset(Dataset):
    """Serve ``(text, label)`` pairs from one of the module-level DataFrames.

    ``mode`` selects the backing frame: ``'train'`` -> ``train_data``,
    ``'valid'`` -> ``valid_data``, ``'test'`` -> ``test_data``.
    In test mode the second element of each pair is the row ``id`` instead
    of a label.

    Raises:
        Exception: if ``mode`` is not one of the three values above.
    """

    # Column holding the input text. The frames prepared earlier in this
    # notebook only keep 'Title' (plus 'label' / 'id'); the previous lookup
    # of a 'content' column raised KeyError for every item.
    TEXT_COLUMN = 'Title'

    def __init__(self, mode='train'):
        # Call the parent initialiser so the base Dataset is set up correctly.
        super(MyDataset, self).__init__()
        self.mode = mode
        # Pick the DataFrame that matches the requested split.
        if mode == 'train':
            self.dataset = train_data
        elif mode == 'valid':
            self.dataset = valid_data
        elif mode == 'test':
            # In test mode items are (text, id) pairs.
            self.dataset = test_data
        else:
            raise Exception("Unknown mode {}".format(mode))

    def __getitem__(self, idx):
        """Return ``(text, target)`` for the row at integer position ``idx``."""
        row = self.dataset.iloc[idx]
        text = row[self.TEXT_COLUMN]
        # Test rows carry no label, so their id is returned as the target.
        target_column = 'id' if self.mode == 'test' else 'label'
        return text, row[target_column]

    def __len__(self):
        """Number of rows in the backing DataFrame."""
        return len(self.dataset)

# Build one dataset per split, in train / valid / test order.
train_dataset, valid_dataset, test_dataset = (
    MyDataset(split) for split in ('train', 'valid', 'test')
)


In [16]:
# Custom collate_fn for the DataLoaders: merges individual samples into one
# batch, doing the tokenisation, padding and tensor assembly in one place.
def collate_fn(batch):
    """Turn a list of ``(text, label)`` samples into a model-ready batch.

    Args:
        batch: sequence of 2-tuples ``(text, label)``.

    Returns:
        A pair ``(src, labels)``: ``src`` is the output of the module-level
        ``tokenizer`` (PyTorch tensors, every sequence padded or truncated
        to 180 tokens) and ``labels`` is a ``torch.LongTensor`` built from
        the labels.
    """
    # Transpose the list of pairs into one list of texts and one of labels.
    texts, labels = (list(column) for column in zip(*batch))

    # padding='max_length' pads short sequences up to max_length;
    # truncation=True cuts sequences that are longer than max_length.
    src = tokenizer(
        texts,
        padding='max_length',
        max_length=180,
        truncation=True,
        return_tensors='pt',
    )
    return src, torch.LongTensor(labels)

# Only the training loader shuffles; validation and test keep dataset order.
train_loader = DataLoader(
    dataset=train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn
)
valid_loader = DataLoader(
    dataset=valid_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)
test_loader = DataLoader(
    dataset=test_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)

# Pull a single training batch to eyeball the tokenised inputs and their labels.
inputs, targets = next(iter(train_loader))
print("inputs:", inputs)
print("targets:", targets)

inputs: {'input_ids': tensor([[    1,  1060,   464,  ...,     0,     0,     0],
        [    1, 17963,   305,  ...,     0,     0,     0],
        [    1,   248,    82,  ...,     0,     0,     0],
        ...,
        [    1,   342,   337,  ...,     0,     0,     0],
        [    1,  9474,   132,  ...,     0,     0,     0],
        [    1,   978,   828,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}
targets: tensor([1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0,
        0, 1, 1, 1, 0, 0, 1, 1])


In [17]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# Prediction model: the pretrained BERT encoder followed by a small
# classification head.
class MyModel(nn.Module):
    """Binary classifier built from ``bert_model`` plus a two-layer head.

    The head is Linear(768, 256) -> ReLU -> Linear(256, 1) -> Sigmoid, so the
    model emits one value in (0, 1) per input sequence.
    """

    def __init__(self):
        super().__init__()
        # Reuse the module-level pretrained encoder.
        self.bert = bert_model
        # Final prediction layers applied to the encoder output.
        head_layers = [
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.predictor = nn.Sequential(*head_layers)

    def forward(self, src):
        """Encode ``src`` and return the head's output for each sequence.

        ``src`` is unpacked as keyword arguments into the encoder; only the
        hidden state at position 0 (the leading [CLS] token) is fed to the
        prediction head.
        """
        encoded = self.bert(**src)
        cls_vector = encoded.last_hidden_state[:, 0, :]
        return self.predictor(cls_vector)
  
# Instantiate the classifier and move it to the selected device in one step.
model = MyModel().to(device)


In [18]:
def to_device(dict_tensors):
    """Return a new dict with every value of ``dict_tensors`` moved to ``device``.

    Keys are preserved; each value must expose a ``.to(device)`` method.
    The input mapping itself is not modified.
    """
    return {name: tensor.to(device) for name, tensor in dict_tensors.items()}


In [19]:
# Optimisation setup: binary cross-entropy loss and Adam over all model parameters.
lr = 3e-5
criteria = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=lr)

# Module-level buffers filled by validate(): ground-truth labels and
# thresholded model predictions for the validation set.
y_true, y_pred = [], []

def validate():
    """Evaluate ``model`` on ``valid_loader``.

    Returns:
        tuple: ``(accuracy, mean_loss, y_true, y_pred)``. ``accuracy`` is the
        fraction of samples whose thresholded (>= 0.5) output equals the
        label, as a Python float. ``mean_loss`` is the per-sample average of
        ``criteria``. ``y_true`` / ``y_pred`` are the module-level lists,
        holding the labels and 0/1 predictions of this validation pass only
        (kept for precision / recall / F1 / AUC computations).
    """
    model.eval()
    # Start from empty buffers: previously every call appended to the same
    # lists, so from the second epoch on they mixed several epochs' results.
    y_true.clear()
    y_pred.clear()

    total_loss = 0.
    total_correct = 0
    total_samples = 0
    # Evaluation needs no gradients; disabling autograd saves memory and time.
    with torch.no_grad():
        for inputs, targets in valid_loader:
            inputs, targets = to_device(inputs), targets.to(device)
            outputs = model(inputs).view(-1)
            batch_size = targets.size(0)
            # BCELoss averages over the batch by default, so weight it by the
            # batch size to get a true per-sample mean at the end (the old
            # code divided a sum of batch means by the number of samples).
            total_loss += float(criteria(outputs, targets.float())) * batch_size

            predictions = (outputs >= 0.5).float()
            total_correct += int((predictions == targets).sum())
            total_samples += batch_size

            y_true.extend(targets.tolist())
            y_pred.extend(predictions.tolist())

    # Guard against an empty validation set instead of dividing by zero.
    if total_samples == 0:
        return 0.0, 0.0, y_true, y_pred
    return total_correct / total_samples, total_loss / total_samples, y_true, y_pred

In [20]:

# Put the model into training mode before the first epoch.
model.train()
# Bookkeeping for logging and checkpointing.
total_loss = 0.       # loss accumulated since the last log line
step = 0              # number of optimisation steps taken so far
log_per_step = 50     # print the accumulated loss every 50 steps
best_accuracy = 0     # best validation accuracy seen so far

epochs = 4
# Training
for epoch in range(epochs):
    # validate() switches the model to eval mode, so re-enable training mode.
    model.train()
    for i, (inputs, targets) in enumerate(train_loader):
        # Move the batch to the training device.
        inputs, targets = to_device(inputs), targets.to(device)
        # Forward pass.
        outputs = model(inputs)
        # Binary cross-entropy between predicted probabilities and labels.
        loss = criteria(outputs.view(-1), targets.float())
        # Backward pass and parameter update; gradients are cleared afterwards
        # so the next step starts from zero.
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        total_loss += float(loss)
        step += 1

        if step % log_per_step == 0:
            print("Epoch {}/{}, Step: {}/{}, total loss:{:.4f}".format(epoch+1, epochs, i, len(train_loader), total_loss))
            total_loss = 0

        # Drop the references to the batch tensors before the next iteration.
        del inputs, targets

    # After each epoch, evaluate on the validation set.
    accuracy, validation_loss, y_trues, y_preds = validate()
    # Work with a plain float regardless of whether validate() returns a tensor.
    accuracy = float(accuracy)
    print("Epoch {}, accuracy: {:.4f}, validation loss: {:.4f}".format(epoch+1, accuracy, validation_loss))

    # Keep only the best model. best_accuracy must be updated here: it used to
    # stay at 0, so every epoch overwrote the checkpoint and the "best" file
    # was simply the last epoch.
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        torch.save(model, model_dir / "model_test_best.pt")
    


'\n# 首先将模型调成训练模式\nmodel.train()\n#设置loss辅助变量\ntotal_loss = 0.\nstep = 0# 记录步数\nlog_per_step = 50#每个50步打印一次loss\nbest_accuracy = 0# 记录在验证集上最好的准确率\n\nepochs=4\n# 开始训练\nfor epoch in range(epochs):\n    model.train()\n    for i, (inputs, targets) in enumerate(train_loader):\n        # 从batch中拿到训练数据\n        inputs, targets = to_device(inputs), targets.to(device)\n        # 传入模型进行前向传播\n        outputs = model(inputs)\n        # 计算损失\n        loss = criteria(outputs.view(-1), targets.float())\n        #反向传播\n        loss.backward()\n        optimizer.step()\n        optimizer.zero_grad()\n\n        total_loss += float(loss)\n        step += 1\n\n        if step % log_per_step == 0:\n            print("Epoch {}/{}, Step: {}/{}, total loss:{:.4f}".format(epoch+1, epochs, i, len(train_loader), total_loss))\n            total_loss = 0\n\n        del inputs, targets\n\n    # 一个epoch后，使用验证集进行验证\n    accuracy, validation_loss, y_trues, y_preds = validate()\n    print("Epoch {}, accuracy: {:.4f}, va

In [22]:
# Reload the best checkpoint directly onto `device`. It was previously mapped
# to the CPU while the inference cell moves its inputs to `device`, which
# fails with a device mismatch whenever CUDA is available.
# NOTE(review): the checkpoint is a fully pickled model, and unpickling can
# execute arbitrary code — only load files produced by this notebook.
# NOTE(review): recent PyTorch releases are reported to default torch.load to
# weights_only=True, which rejects fully pickled models — confirm against the
# installed version.
model = torch.load(model_dir / "model_test_best.pt", map_location=device)
model = model.eval()


In [23]:


# Make sure the model sits on the same device as the inputs sent to it and is
# in eval mode; the checkpoint may have been loaded onto a different device.
model = model.to(device)
model.eval()

results = []  # (id, predicted label) pairs, in test-set order
# Inference only: disable autograd to save memory and time.
with torch.no_grad():
    for inputs, ids in test_loader:
        outputs = model(to_device(inputs))
        # NOTE(review): validation thresholds at 0.5 while the submission uses
        # 0.3 — presumably chosen on purpose; confirm.
        labels = (outputs >= 0.3).int().flatten().tolist()
        # extend() appends in place instead of rebuilding the list every batch.
        results.extend(zip(ids.tolist(), labels))

# test_loader does not shuffle, so results are aligned with test_data rows.
test_label = [label for sample_id, label in results]
test_data['label'] = test_label
test_data[['id', 'label']].to_csv('submit_task2.csv', index=False)

