In [1]:
pip install pandas numpy transformers scikit-learn tqdm

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
[33mDEPRECATION: pytorch-lightning 1.7.7 has a non-standard dependency specifier torch>=1.9.*. pip 24.0 will enforce this behaviour change. A possible replacement is to upgrade to a newer version of pytorch-lightning or contact the author to suggest that they release a version with a conforming dependency specifiers. Discussion can be found at https://github.com/pypa/pip/issues/12063[0m[33m
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
import os

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from tqdm import tqdm

2025-08-05 15:27:45.854957: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-08-05 15:27:45.895039: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
SEED = 42  # single seed value shared by the RNGs seeded below

# Release GPU memory cached by PyTorch from earlier work in this kernel.
if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("已释放 PyTorch CUDA 缓存")

# Seed NumPy and PyTorch so later random operations are repeatable.
np.random.seed(SEED)
# The trailing ';' keeps the returned Generator object from being echoed as cell output.
torch.manual_seed(SEED);
    

已释放 PyTorch CUDA 缓存


<torch._C.Generator at 0x7ff3a97fd410>

In [4]:
# Load the labelled training set; later cells read its '类别' and '文本' columns.
data = pd.read_csv('train_all.csv')

In [5]:
def get_label(data):
    """
    Build integer class ids for the '类别' column.

    The distinct values of ``data['类别']`` are sorted and numbered from 0, so
    the mapping is deterministic for a given set of class names.

    Args:
        data (pd.DataFrame): frame containing a '类别' column.

    Returns:
        tuple: ``(label_to_id, id_to_label, labelled)`` where
            label_to_id (dict): class name -> integer id
            id_to_label (dict): integer id -> class name
            labelled (pd.DataFrame): new frame equal to ``data`` plus a 'label'
                column holding the mapped ids; ``data`` itself is not modified.
    """
    unique_labels = sorted(data['类别'].unique())
    id_to_label = dict(enumerate(unique_labels))
    label_to_id = {label: idx for idx, label in id_to_label.items()}

    # assign() returns a new frame, so the caller's DataFrame is not mutated as a side effect.
    labelled = data.assign(label=data['类别'].map(label_to_id))
    return label_to_id, id_to_label, labelled

# Build the label <-> id mappings and rebind `data` to the frame returned by get_label,
# which carries the integer 'label' column used for the stratified split and training.
label_to_id,id_to_label, data = get_label(data)

In [6]:
class TextClassificationDataset(Dataset):
    """
    Dataset for text classification that tokenizes one text per lookup.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        """
        Args:
            texts (list): texts, addressed by position
            labels (list): integer class ids aligned with ``texts``
            tokenizer (BertTokenizer): tokenizer called on a single text
            max_length (int): length each text is truncated / padded to
        """
        self.texts, self.labels = texts, labels
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples, taken from the length of ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors: encoder inputs plus 'labels'."""
        # str() guards against non-string entries (e.g. numbers) in the text list.
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # flatten() turns each field into a 1-D tensor
        # (presumably dropping a leading batch axis of size 1 — confirm against the tokenizer).
        sample = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask', 'token_type_ids')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.long)
        return sample

In [7]:
class BertForSequenceClassification(nn.Module):
    """
    BERT encoder with an external head (dropout + linear layer) for sequence classification.
    """

    def __init__(self, bert_model_name='bert-base-uncased', num_labels=10, dropout=0.1):
        """
        Args:
            bert_model_name (str): name or path passed to ``BertModel.from_pretrained``
            num_labels (int): number of output classes
            dropout (float): dropout probability applied before the classifier
        """
        super().__init__()
        encoder = BertModel.from_pretrained(bert_model_name)

        self.num_labels = num_labels
        self.bert = encoder
        self.dropout = nn.Dropout(dropout)
        # The head's input width follows the encoder's configured hidden size.
        self.hidden_size = encoder.config.hidden_size
        self.classifier = nn.Linear(self.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """
        Args:
            input_ids (torch.Tensor): shape (batch_size, sequence_length)
            attention_mask (torch.Tensor): shape (batch_size, sequence_length)
            token_type_ids (torch.Tensor): shape (batch_size, sequence_length)

        Returns:
            logits (torch.Tensor): shape (batch_size, num_labels)
        """
        # return_dict=False makes the encoder return a plain tuple.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False,
        )
        # Element 1 of the tuple is used as the pooled sentence representation.
        pooled = self.dropout(encoder_outputs[1])
        return self.classifier(pooled)

In [8]:
def train_model(model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, num_epochs=3, save_path='./model_path/best_model_macro_f1.pth'):
    """
    Train the model and keep the checkpoint with the best validation Macro F1-Score.

    Args:
        model: network called as ``model(input_ids, attention_mask, token_type_ids)`` that returns logits
        train_dataloader: yields dict batches with 'input_ids', 'attention_mask',
            'token_type_ids' and 'labels' tensors
        val_dataloader: validation batches, handed to ``evaluate_model`` after every epoch
        criterion: loss called as ``criterion(logits, labels)``
        optimizer: optimizer stepped once per training batch
        scheduler: learning-rate scheduler stepped once per training batch
        device: device the batch tensors are moved to
        num_epochs (int): number of passes over ``train_dataloader``
        save_path (str): file the best ``state_dict`` is written to; its parent
            directory is created if it does not exist
    """
    # Create the checkpoint directory up front: torch.save does not create it, and
    # discovering that only after a full training epoch wastes the whole epoch.
    save_dir = os.path.dirname(save_path)
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    best_macro_f1 = 0.0  # best validation Macro F1 seen so far
    checkpoint_written = False  # ensures a checkpoint exists even if F1 never exceeds 0.0

    for epoch in range(num_epochs):
        # --- Training phase ---
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()  # the schedule advances per batch, not per epoch

            batch_loss = loss.item()
            total_loss += batch_loss
            progress_bar.set_postfix({'loss': batch_loss})

        # max(..., 1) avoids a ZeroDivisionError when the training loader is empty.
        avg_train_loss = total_loss / max(len(train_dataloader), 1)

        # --- Validation phase ---
        val_loss, val_accuracy, val_macro_f1 = evaluate_model(model, val_dataloader, criterion, device)

        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {avg_train_loss:.4f}")
        print(f"  Val Loss: {val_loss:.4f}")
        print(f"  Val Acc: {val_accuracy:.4f}")
        print(f"  Val Macro F1: {val_macro_f1:.4f}")
        print("-" * 50)

        # --- Save the best model (by Macro F1) ---
        # The first epoch is always saved so that a checkpoint file exists for later loading.
        if not checkpoint_written or val_macro_f1 > best_macro_f1:
            best_macro_f1 = val_macro_f1
            torch.save(model.state_dict(), save_path)
            checkpoint_written = True
            print(f"✅ Best model saved to {save_path} with Val Macro F1: {val_macro_f1:.4f}")

    print(f"✅ Training completed. Best Macro F1: {best_macro_f1:.4f}")

In [9]:
def evaluate_model(model, dataloader, criterion, device):
    """
    Evaluate the model and return loss, accuracy and Macro F1-Score.

    The model is switched to eval mode and run without gradient tracking.

    Returns:
        tuple: (mean of the per-batch losses, fraction of correct predictions, macro-averaged F1)
    """
    model.eval()

    loss_sum = 0
    n_correct = 0
    n_seen = 0
    predicted_ids, true_ids = [], []

    batches = tqdm(dataloader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in batches:
            # Move the four tensors of the batch to the target device.
            input_ids, attention_mask, token_type_ids, labels = (
                batch[key].to(device)
                for key in ('input_ids', 'attention_mask', 'token_type_ids', 'labels')
            )

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            loss_sum += loss.item()

            # The predicted class is the index of the largest logit in each row.
            preds = torch.argmax(logits, dim=1)

            predicted_ids.extend(preds.cpu().numpy())
            true_ids.extend(labels.cpu().numpy())

            n_correct += (preds == labels).sum().item()
            n_seen += labels.size(0)

            batches.set_postfix({'batch_loss': f'{loss.item():.4f}'})

    mean_loss = loss_sum / len(dataloader)
    accuracy = n_correct / n_seen
    macro_f1 = f1_score(true_ids, predicted_ids, average='macro')

    return mean_loss, accuracy, macro_f1

In [13]:
# --- Configuration: model location and training hyperparameters ---
MODEL_NAME = '/mnt/workspace/model/bert-base-chinese'  # passed to from_pretrained for tokenizer and encoder
MAX_LENGTH = 512        # tokenizer max_length (texts are truncated / padded to this)
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.1
NUM_EPOCHS = 3
DROPOUT = 0.1
# Size the classifier head from the label map built by get_label() instead of a
# hard-coded 10, so the model's output width always matches the classes in the data.
NUM_LABELS = len(label_to_id)

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)
model = BertForSequenceClassification(bert_model_name=MODEL_NAME, num_labels=NUM_LABELS, dropout=DROPOUT).to(device)

In [14]:
# Hold out 20% of the rows for validation, stratified on the integer label.
train_data, val_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
    stratify=data['label'],
)

# Plain Python lists, so the dataset can index samples by position.
train_data_text, train_data_label = train_data['文本'].tolist(), train_data['label'].tolist()
val_data_text, val_data_label = val_data['文本'].tolist(), val_data['label'].tolist()

train_dataset = TextClassificationDataset(
    train_data_text, train_data_label, tokenizer, MAX_LENGTH
)
val_dataset = TextClassificationDataset(
    val_data_text, val_data_label, tokenizer, MAX_LENGTH
)

# Only the training batches are shuffled; validation keeps the original order.
train_dataloader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_dataloader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

In [15]:
# Loss, optimizer and learning-rate schedule used for fine-tuning.
criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(
    model.parameters(),
    weight_decay=WEIGHT_DECAY,
    lr=LEARNING_RATE,
)
# Total number of scheduler steps: batches per epoch times epochs.
total_steps = len(train_dataloader) * NUM_EPOCHS
# Linear warmup over the first 10% of the steps.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=int(0.1 * total_steps),
)

In [5]:
# NOTE(review): leftover scratch arithmetic; its execution count (In [5]) is out of order
# and no later cell reads the result — consider deleting this cell.
(1.3506 - 1.1003)

0.25029999999999997

In [16]:
# Run fine-tuning; the best checkpoint goes to train_model's default save_path.
train_model(
    model=model,
    train_dataloader=train_dataloader,
    val_dataloader=val_dataloader,
    criterion=criterion,
    optimizer=optimizer,
    scheduler=scheduler,
    device=device,
    num_epochs=NUM_EPOCHS,
)

Training Epoch 1: 100%|██████████| 628/628 [09:32<00:00,  1.10it/s, loss=0.321] 
                                                                                

Epoch 1/3
  Train Loss: 0.5832
  Val Loss: 0.2436
  Val Acc: 0.9185
  Val Macro F1: 0.5929
--------------------------------------------------
✅ Best model saved to ./model_path/best_model_macro_f1.pth with Val Macro F1: 0.5929


Training Epoch 2: 100%|██████████| 628/628 [09:32<00:00,  1.10it/s, loss=0.347] 
                                                                                

Epoch 2/3
  Train Loss: 0.1916
  Val Loss: 0.2107
  Val Acc: 0.9263
  Val Macro F1: 0.6427
--------------------------------------------------
✅ Best model saved to ./model_path/best_model_macro_f1.pth with Val Macro F1: 0.6427


Training Epoch 3: 100%|██████████| 628/628 [09:32<00:00,  1.10it/s, loss=0.0969] 
                                                                                

Epoch 3/3
  Train Loss: 0.1321
  Val Loss: 0.2094
  Val Acc: 0.9313
  Val Macro F1: 0.6577
--------------------------------------------------
✅ Best model saved to ./model_path/best_model_macro_f1.pth with Val Macro F1: 0.6577
✅ Training completed. Best Macro F1: 0.6577


In [17]:
def predict_single_text(model, text, tokenizer, label_to_id, id_to_label, device, max_length=512):
    """
    Predict the class of a single text.

    Args:
        model: trained model, called as ``model(input_ids, attention_mask, token_type_ids)``
            and expected to return logits with one row per input
        text: text to classify; converted with ``str()`` first, matching what
            ``TextClassificationDataset`` does during training
        tokenizer: BERT tokenizer
        label_to_id (dict): label -> id mapping; not used by this function and kept
            only so existing callers keep working
        id_to_label (dict): id -> label-name mapping
        device: device (cpu/cuda) the inputs are moved to
        max_length (int): maximum sequence length

    Returns:
        predicted_label: the entry of ``id_to_label`` for the highest-scoring class
    """
    model.eval()  # make sure the model is in evaluation mode

    # 1. Tokenize and encode. str() keeps non-string rows (e.g. NaN or numbers read
    #    from a CSV) from crashing the tokenizer.
    encoding = tokenizer(
        str(text),
        truncation=True,
        padding='max_length',
        max_length=max_length,
        return_tensors='pt'  # return PyTorch tensors
    )

    # 2. Move the inputs to the device
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)
    token_type_ids = encoding['token_type_ids'].to(device)

    # 3. Forward pass to obtain the logits
    with torch.no_grad():
        logits = model(input_ids, attention_mask, token_type_ids)

    # 4. Pick the winning class. Softmax is monotonic, so the argmax of the logits
    #    equals the argmax of the probabilities and no softmax is needed.
    predicted_id = torch.argmax(logits, dim=1).item()

    # 5. Convert back to the original label name
    return id_to_label[predicted_id]

In [31]:
# Load the unlabelled test set; the inference cell reads its '文本' column.
data_test = pd.read_csv('test_text.csv')

In [32]:
# Rebuild the architecture on the target device and restore the saved best checkpoint.
loaded_model = BertForSequenceClassification(bert_model_name=MODEL_NAME, num_labels=NUM_LABELS).to(device)
loaded_model.load_state_dict(torch.load('./model_path/best_model_macro_f1.pth', map_location=device))
loaded_model.eval()

# ------------------- Predict the new data one text at a time -------------------
data_test_text = data_test['文本'].to_list()
pred_list = []
for text in tqdm(data_test_text):
    pred_label = predict_single_text(
        model=loaded_model,
        text=text,
        tokenizer=tokenizer,
        label_to_id=label_to_id,
        id_to_label=id_to_label,
        device=device,
        max_length=MAX_LENGTH
    )
    pred_list.append(pred_label)

# Fill the predictions into the submission template.
submit = pd.read_csv('example.csv')
submit['类别'] = pred_list
# index=False keeps pandas from prepending its row index as an extra unnamed column,
# so the output has the same columns as the template.
submit.to_csv('submit.csv', index=False)

100%|██████████| 6276/6276 [01:19<00:00, 78.75it/s]
