# 多标签文本分类流程
1. 数据预处理
   a. 中文分词
   b. 去停用词
2. 标签归一化
3. 特征提取
4. 模型训练

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, BertModel, BertConfig, get_linear_schedule_with_warmup
from torch.utils.data import Dataset, DataLoader
import os
import time


# Initialise the BERT tokenizer and model.
# Alternative kept for reference: load by model name instead of local files.
# MODEL_NAME = 'hfl/chinese-bert-wwm-ext'
# tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
# model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(labels))
# Load the tokenizer from the local checkpoint directory.
tokenizer = BertTokenizer.from_pretrained('BERT/chinese_bert_wwm_ext_L-12_H-768_A-12/')
# NOTE(review): both paths below go through a directory that is itself named
# "pytorch_model.bin" - presumably this matches the on-disk layout; confirm.
# NOTE(review): num_labels is not passed, so the size of the classification head
# comes from config.json - confirm it matches the number of classes in the data.
config = BertConfig.from_pretrained('BERT/chinese_bert_wwm_ext_L-12_H-768_A-12/pytorch_model.bin/config.json')
model = BertForSequenceClassification.from_pretrained('BERT/chinese_bert_wwm_ext_L-12_H-768_A-12/pytorch_model.bin/pytorch_model.bin', config=config)

# Dataset wrapper that tokenizes one record per item.
class RecordDataset(Dataset):
    """Map-style dataset pairing raw texts with integer class labels.

    Each item is tokenized on access with ``tokenizer.encode_plus`` and is
    padded *and truncated* to exactly ``max_len`` tokens, so every item has
    the same tensor length and can be batched by the default collate function.

    Args:
        text: Indexable sequence of texts (each item is passed through ``str``).
        label: Indexable sequence of labels, aligned with ``text``.
        tokenizer: Object exposing an ``encode_plus`` method that accepts the
            keyword arguments used in ``__getitem__``.
        max_len: Target token length for padding and truncation.
    """

    def __init__(self, text, label, tokenizer, max_len):
        self.text = text
        self.label = label
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of records."""
        return len(self.text)

    def __getitem__(self, index):
        """Return the tokenized record at ``index``.

        Returns:
            dict with keys ``'text'`` (the raw string), ``'input_ids'`` and
            ``'attention_mask'`` (flattened tensors from the tokenizer) and
            ``'labels'`` (a ``torch.long`` tensor built from the label).
        """
        text = str(self.text[index])
        label = self.label[index]

        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            # Without truncation, texts longer than max_len produce longer
            # tensors: batching fails and the model's position limit is exceeded.
            truncation=True,
            return_token_type_ids=False,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'text': text,
            # return_tensors='pt' yields a leading batch dimension; drop it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }


# Data loader factory.
def create_data_loader(df, tokenizer, max_len, batch_size, shuffle=False, num_workers=4):
    """Build a ``DataLoader`` over the ``text`` and ``label`` columns of ``df``.

    Args:
        df: DataFrame providing a ``text`` column and a ``label`` column.
        tokenizer: Tokenizer handed to ``RecordDataset``.
        max_len: Token length each record is padded/truncated to.
        batch_size: Number of records per batch.
        shuffle: Whether to reshuffle the data every epoch. Defaults to
            ``False``, which keeps the previous behaviour; pass ``True`` for
            training loaders.
        num_workers: Number of loader worker processes. Defaults to ``4`` as
            before; pass ``0`` to load in the main process.

    Returns:
        A ``DataLoader`` yielding the dict batches produced by ``RecordDataset``.
    """
    ds = RecordDataset(
        text=df['text'].to_numpy(),
        label=df['label'].to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )

    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )


# Training function.
def train_epoch(model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
    """Train ``model`` for one pass over ``data_loader``.

    Args:
        model: Model called with ``input_ids`` and ``attention_mask`` keyword
            arguments; its output must expose a ``logits`` attribute.
        data_loader: Iterable of dict batches with ``'input_ids'``,
            ``'attention_mask'`` and ``'labels'`` tensors.
        loss_fn: Callable ``loss_fn(logits, labels)`` returning a scalar loss.
        optimizer: Optimizer stepped once per batch.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler stepped once per batch.
        n_examples: Number of examples used as the accuracy denominator.

    Returns:
        Tuple ``(accuracy, mean_loss)``: ``accuracy`` is a float64 tensor equal
        to ``correct / n_examples``; ``mean_loss`` is the mean of the per-batch
        loss values, or NaN when the loader yields no batches.
    """
    model = model.train()

    losses = []
    # Tensor accumulator so the final ``.double()`` also works when the loader
    # yields no batches (an int ``0`` has no ``.double()``).
    correct_predictions = torch.zeros((), dtype=torch.long, device=device)

    for batch in data_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Clear gradients before the forward/backward pass so nothing left
        # over from earlier work leaks into this update.
        optimizer.zero_grad()

        # Labels are not passed to the model: the loss is computed by
        # ``loss_fn`` below, so a loss computed inside the model would be unused.
        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
        )
        logits = outputs.logits

        _, preds = torch.max(logits, dim=1)
        loss = loss_fn(logits, labels)

        correct_predictions += torch.sum(preds == labels)
        losses.append(loss.item())

        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        scheduler.step()

    mean_loss = np.mean(losses) if losses else float('nan')
    return correct_predictions.double() / n_examples, mean_loss

def show_time(start_time, end_time):
    """Format the time elapsed between two timestamps as ``HH:MM:SS.ss``.

    Args:
        start_time: Start timestamp, in seconds.
        end_time: End timestamp, in seconds.

    Returns:
        A string with zero-padded whole hours and minutes, and seconds
        shown with two decimal places.
    """
    elapsed = end_time - start_time
    whole_hours, leftover = divmod(elapsed, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    return f"{int(whole_hours):0>2}:{int(whole_minutes):0>2}:{secs:05.2f}"

# Hyper-parameters.
EPOCHS = 10
BATCH_SIZE = 16
MAX_LEN = 512
LEARNING_RATE = 2e-5

# Select the device and move the model onto it. train_epoch moves every batch
# to `device`, so a model left on the CPU fails with a device mismatch as soon
# as CUDA is available.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Optimizer.
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE, correct_bias=False, eps=1e-8)

df_train = pd.read_csv(os.path.join('Record Collections', 'medical_record.csv'))
# NOTE(review): only the first 10 rows are used - presumably a smoke-test
# subset; confirm before a real training run.
df_train = df_train[:10]
train_data_loader = create_data_loader(df_train, tokenizer, MAX_LEN, BATCH_SIZE)
loss_fn = nn.CrossEntropyLoss().to(device)  # loss function
# Linear learning-rate decay over all training steps, with no warm-up.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=len(train_data_loader) * EPOCHS
)

# Train the model.
for epoch in range(EPOCHS):
    print(f'Epoch {epoch + 1}/{EPOCHS}')
    print('-' * 10)
    start_time = time.time()
    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    end_time = time.time()
    print(f'{show_time(start_time, end_time)} : Train loss {train_loss} accuracy {train_acc}')

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at BERT/chinese_bert_wwm_ext_L-12_H-768_A-12/pytorch_model.bin/pytorch_model.bin were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification mode

Epoch 1/10
----------
