In [1]:
pip install pandas numpy transformers==4.43 scikit-learn tqdm

Looking in indexes: https://mirrors.aliyun.com/pypi/simple
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
from transformers import BertTokenizer, BertModel, get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from sklearn.metrics import accuracy_score, classification_report
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
import pandas as pd
from tqdm import tqdm
import random
from datetime import datetime
import os

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def seed_everything(seed):
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    random.seed(seed)
    np.random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

if torch.cuda.is_available():
    torch.cuda.empty_cache()
    print("已释放 PyTorch CUDA 缓存")

seed_everything(seed=2025)

已释放 PyTorch CUDA 缓存


In [5]:
# 设置设备
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.device_count() > 1:
    print(f"使用 {torch.cuda.device_count()} 个GPU进行训练")

Using device: cuda


In [6]:
def get_label(data):
    unique_labels = sorted(data['类别'].unique())
    label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
    id_to_label = {idx: label for label, idx in label_to_id.items()}
    data['label'] = data['类别'].map(label_to_id)

    return label_to_id, id_to_label, data

In [7]:
# def truncate_with_head_tail(text, tokenizer, max_length=512, head_length=128, tail_length=382):
#     """
#     对文本进行头尾截断
#     :param text: 原始文本
#     :param tokenizer: BERT tokenizer
#     :param max_length: 最大长度（通常512）
#     :param head_length: 保留前面多少个token
#     :param tail_length: 保留后面多少个token
#     :return: 截断后的 input_ids, attention_mask, token_type_ids
#     """
#     encoded = tokenizer.encode_plus(
#         text,
#         add_special_tokens=False,
#         truncation=False,
#         padding=False
#     )
#     input_ids = encoded['input_ids']

#     usable_length = max_length - 2

#     if len(input_ids) <= usable_length:
#         final_input_ids = input_ids
#     else:

#         head_ids = input_ids[:head_length]
#         tail_ids = input_ids[-tail_length:] if tail_length > 0 else []
#         final_input_ids = head_ids + tail_ids

#     final_input_ids = [tokenizer.cls_token_id] + final_input_ids + [tokenizer.sep_token_id]

#     attention_mask = [1] * len(final_input_ids)

#     padding_length = max_length - len(final_input_ids)
#     if padding_length > 0:
#         final_input_ids = final_input_ids + [tokenizer.pad_token_id] * padding_length
#         attention_mask = attention_mask + [0] * padding_length

#     token_type_ids = [0] * max_length

#     return {
#         'input_ids': torch.tensor(final_input_ids),
#         'attention_mask': torch.tensor(attention_mask),
#         'token_type_ids': torch.tensor(token_type_ids)
#     }

In [8]:
class TextClassificationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'token_type_ids': encoding['token_type_ids'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)
        }
    
# class TextClassificationDataset(Dataset):
#     def __init__(self, texts, labels, tokenizer, max_length=512, head_len=128, tail_len=382):
#         self.texts = texts
#         self.labels = labels
#         self.tokenizer = tokenizer
#         self.max_length = max_length
#         self.head_len = head_len
#         self.tail_len = tail_len

#     def __len__(self):
#         return len(self.texts)

#     def __getitem__(self, idx):
#         text = str(self.texts[idx])
#         label = self.labels[idx]

#         encoding = truncate_with_head_tail(
#             text, self.tokenizer,
#             max_length=self.max_length,
#             head_length=self.head_len,
#             tail_length=self.tail_len
#         )

#         return {
#             'input_ids': encoding['input_ids'],
#             'attention_mask': encoding['attention_mask'],
#             'token_type_ids': encoding['token_type_ids'],
#             'labels': torch.tensor(label, dtype=torch.long)
#         }    

In [10]:
# class BertForSequenceClassification(nn.Module):
#     def __init__(self, bert_model_name, num_labels=10, dropout=0.1):
#         super(BertForSequenceClassification, self).__init__()
#         self.num_labels = num_labels
#         self.bert = BertModel.from_pretrained(bert_model_name)
#         self.dropout = nn.Dropout(dropout)
#         self.hidden_size = self.bert.config.hidden_size
#         self.classifier = nn.Linear(self.hidden_size, num_labels)

#     def forward(self, input_ids, attention_mask, token_type_ids):
#         outputs = self.bert(
#             input_ids=input_ids,
#             attention_mask=attention_mask,
#             token_type_ids=token_type_ids,
#             return_dict=False
#         )
#         pooled_output = outputs[1]
#         pooled_output = self.dropout(pooled_output)
#         logits = self.classifier(pooled_output)
#         return logits

class BertForSequenceClassification(nn.Module):
    def __init__(self, bert_model_name, num_labels=10, dropout=0.1):
        super(BertForSequenceClassification, self).__init__()
        self.num_labels = num_labels
        self.bert = BertModel.from_pretrained(bert_model_name, output_hidden_states=True)
        self.dropout = nn.Dropout(dropout)
        self.hidden_size = self.bert.config.hidden_size
        self.classifier = nn.Linear(self.hidden_size * 5, num_labels)

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False
        )

        last_hidden_state = outputs[0]
        pooled_output = outputs[1]
        hidden_states = outputs[2]

        cls_list = []
        for i in range(-1, -6, -1):
            cls_output = hidden_states[i][:, 0, :]
            cls_list.append(cls_output)

        last_hidden = torch.cat(cls_list, dim=1)
        last_hidden = self.dropout(last_hidden)
        logits = self.classifier(last_hidden)

        return logits

In [11]:
class FGM():
    def __init__(self, model):
        self.model = model
        self.backup = {}

    def attack(self, epsilon=1., emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                self.backup[name] = param.data.clone()
                norm = torch.norm(param.grad)
                if norm != 0 and not torch.isnan(norm):
                    r_at = epsilon * param.grad / norm
                    param.data.add_(r_at)

    def restore(self, emb_name='word_embeddings'):
        for name, param in self.model.named_parameters():
            if param.requires_grad and emb_name in name:
                assert name in self.backup
                param.data = self.backup[name]
        self.backup = {}

In [12]:
def train_model_with_FGM(MODEL_NAME, model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, num_epochs, save_dir):
    """
    训练模型并保存每个 epoch 的检查点，文件名包含验证集性能
    Args:
        save_dir: 保存模型的目录路径（会自动创建）
    """
    # 确保保存目录存在
    os.makedirs(save_dir, exist_ok=True)
    
    best_macro_f1 = 0.0
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")  # 防止不同训练任务文件名冲突

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        progress_bar = tqdm(train_dataloader, desc=f"Training Epoch {epoch + 1}")

        fgm = FGM(model)
        # 用于计算训练集指标
        all_train_labels = []
        all_train_preds = []

        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            optimizer.zero_grad()
            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            loss.backward(retain_graph=True)

            # 对抗攻击
            fgm.attack()
            logits_adv = model(input_ids, attention_mask, token_type_ids)
            loss_adv = criterion(logits_adv, labels)
            loss_adv.backward(retain_graph=True)
            fgm.restore()

            optimizer.step()
            scheduler.step()

            total_loss += loss.item()
            progress_bar.set_postfix({'loss': loss.item()})

            preds = torch.argmax(logits, dim=-1).detach().cpu().numpy()
            labels_np = labels.detach().cpu().numpy()

            all_train_preds.extend(preds)
            all_train_labels.extend(labels_np)

        # 计算训练集 Macro F1
        train_macro_f1 = f1_score(all_train_labels, all_train_preds, average='macro')
        avg_train_loss = total_loss / len(train_dataloader)

        # 验证模型
        val_loss, val_macro_f1, classification_report = evaluate_model(model, val_dataloader, criterion, device)

        # 输出训练和验证指标
        print(f"Epoch {epoch+1}/{num_epochs}")
        print(f"  Train Loss: {avg_train_loss:.4f} | Train Macro F1: {train_macro_f1:.4f}")
        print(f"  Val Loss: {val_loss:.4f} | Val Macro F1: {val_macro_f1:.4f}")
        print("\nClassification Report:")
        print(classification_report)
        print("-" * 60)
        
        val_f1_str = f"{val_macro_f1:.4f}".replace('.', '_')
        
        # 保存最佳模型（可选：也可单独保存最佳模型）
        if val_macro_f1 > best_macro_f1:
            best_macro_f1 = val_macro_f1
            if best_macro_f1 > 0.68:
                best_model_path = os.path.join(save_dir, f"best_{MODEL_NAME}_FGM_valF1_{val_f1_str}_{timestamp}.pth")
                torch.save(model.state_dict(), best_model_path)
                print(f"Best model saved: {best_model_path}")

    print(f"Training completed. Best Val Macro F1: {best_macro_f1:.4f}")
    return best_model_path

In [13]:
def evaluate_model(model, dataloader, criterion, device):
    """
    评估模型，返回损失、准确率和 Macro F1-Score
    """
    model.eval()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0
    all_preds = []
    all_labels = []

    progress_bar = tqdm(dataloader, desc="Evaluating", leave=False)

    with torch.no_grad():
        for batch in progress_bar:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            token_type_ids = batch['token_type_ids'].to(device)
            labels = batch['labels'].to(device)

            logits = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(logits, labels)
            total_loss += loss.item()

            preds = torch.argmax(logits, dim=1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            correct_predictions += (preds == labels).sum().item()
            total_predictions += labels.size(0)

            progress_bar.set_postfix({'batch_loss': f'{loss.item():.4f}'})

    avg_loss = total_loss / len(dataloader)
    macro_f1 = f1_score(all_labels, all_preds, average='macro')
    classification_reports = classification_report(all_labels, np.array(all_preds))
    
    return avg_loss, macro_f1, classification_reports

In [14]:
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

MODEL_PATH = '/root/lanyun-fs/models/chinese-roberta-wwm-ext'
MODEL_NAME = os.path.basename(MODEL_PATH)
MAX_LENGTH = 512
BATCH_SIZE = 32
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 1e-4
NUM_EPOCHS = 10
DROPOUT = 0.2
NUM_LABELS = 10
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizer = BertTokenizer.from_pretrained(MODEL_PATH)
model = BertForSequenceClassification(bert_model_name=MODEL_PATH,num_labels=NUM_LABELS,dropout=DROPOUT).to(device)

In [15]:
data = pd.read_csv('dataset/train_all.csv')
label_to_id,id_to_label, data = get_label(data)

In [16]:
train_data, val_data = train_test_split(data, test_size=0.2, stratify=data['label'])

train_data_text = train_data['文本'].to_list()
train_data_label = train_data['label'].to_list()

val_data_text = val_data['文本'].to_list()
val_data_label = val_data['label'].to_list()

train_dataset = TextClassificationDataset(train_data_text, train_data_label, tokenizer, MAX_LENGTH)
val_dataset = TextClassificationDataset(val_data_text, val_data_label, tokenizer, MAX_LENGTH)

train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [17]:
def softmax_with_temperature(logits, temperature=1.0):
    return np.exp((logits - np.max(logits)) / temperature) / np.sum(np.exp((logits - np.max(logits)) / temperature))

In [24]:
class_weights = torch.tensor(100*softmax_with_temperature(compute_class_weight('balanced', classes=np.array(list(id_to_label.keys())), y=train_data_label), temperature=25), dtype=torch.float32).to(device)
class_weights

tensor([ 2.7297, 65.6727,  3.0078,  4.0070,  2.8148, 10.8158,  2.6806,  2.8851,
         2.7027,  2.6837], device='cuda:0')

In [16]:
# CrossEntropyLoss
# criterion = nn.CrossEntropyLoss()

# class_weights CrossEntropyLoss
class_weights = torch.tensor(softmax_with_temperature(compute_class_weight('balanced', classes=np.array(list(id_to_label.keys())), y=train_data_label), temperature=1), dtype=torch.float32).to(device)
criterion = nn.CrossEntropyLoss(weight=class_weights)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
total_steps = len(train_dataloader) * NUM_EPOCHS
scheduler = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1 * total_steps), num_training_steps=total_steps)

In [None]:
best_model_path = train_model_with_FGM(MODEL_NAME, model, train_dataloader, val_dataloader, criterion, optimizer, scheduler, device, num_epochs=NUM_EPOCHS, save_dir="./model_path")

Training Epoch 1: 100%|██████████| 5020/5020 [48:18<00:00,  1.73it/s, loss=0.0417]
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 1/10
  Train Loss: 0.8808 | Train Macro F1: 0.5123
  Val Loss: 0.4842 | Val Macro F1: 0.5555

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.95      0.97       621
           1       0.00      0.00      0.00         6
           2       0.29      0.92      0.44       155
           3       0.38      0.48      0.42        48
           4       0.69      0.03      0.05       319
           5       0.00      0.00      0.00        14
           6       0.94      0.94      0.94      1417
           7       0.87      0.77      0.81       229
           8       0.95      0.97      0.96       898
           9       0.95      0.95      0.95      1313

    accuracy                           0.88      5020
   macro avg       0.61      0.60      0.56      5020
weighted avg       0.90      0.88      0.86      5020

------------------------------------------------------------


Training Epoch 2: 100%|██████████| 5020/5020 [48:19<00:00,  1.73it/s, loss=0.0259] 
                                                                                  

Epoch 2/10
  Train Loss: 0.8796 | Train Macro F1: 0.5080
  Val Loss: 0.4658 | Val Macro F1: 0.6474

Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.98       621
           1       0.40      0.33      0.36         6
           2       0.31      0.95      0.47       155
           3       0.47      0.56      0.51        48
           4       0.95      0.06      0.12       319
           5       0.40      0.29      0.33        14
           6       0.95      0.95      0.95      1417
           7       0.92      0.76      0.83       229
           8       0.97      0.95      0.96       898
           9       0.94      0.96      0.95      1313

    accuracy                           0.89      5020
   macro avg       0.73      0.68      0.65      5020
weighted avg       0.93      0.89      0.88      5020

------------------------------------------------------------


Training Epoch 3:  71%|███████   | 3565/5020 [34:21<14:01,  1.73it/s, loss=1.08]   IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

Training Epoch 3: 100%|██████████| 5020/5020 [48:22<00:00,  1.73it/s, loss=1.59] 
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


Epoch 3/10
  Train Loss: 1.2587 | Train Macro F1: 0.3462
  Val Loss: 3.9371 | Val Macro F1: 0.0440

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       621
           1       0.00      0.00      0.00         6
           2       0.00      0.00      0.00       155
           3       0.00      0.00      0.00        48
           4       0.00      0.00      0.00       319
           5       0.00      0.00      0.00        14
           6       0.28      1.00      0.44      1417
           7       0.00      0.00      0.00       229
           8       0.00      0.00      0.00       898
           9       0.00      0.00      0.00      1313

    accuracy                           0.28      5020
   macro avg       0.03      0.10      0.04      5020
weighted avg       0.08      0.28      0.12      5020

------------------------------------------------------------


Training Epoch 4:  41%|████      | 2055/5020 [19:49<28:38,  1.73it/s, loss=1.45] 

In [None]:
def predict_single_text(model, text, tokenizer, label_to_id, id_to_label, device, max_length=512):

    model.eval()

    encoding = truncate_with_head_tail(
        text, tokenizer,
        max_length=512,
        head_length=128,
        tail_length=382
    )
    
    input_ids = encoding['input_ids'].unsqueeze(0).to(device)
    attention_mask = encoding['attention_mask'].unsqueeze(0).to(device)
    token_type_ids = encoding['token_type_ids'].unsqueeze(0).to(device)
    with torch.no_grad():
        logits = model(input_ids, attention_mask, token_type_ids)

    probabilities = F.softmax(logits, dim=1)
    predicted_id = torch.argmax(logits, dim=1).item()
    predicted_label = id_to_label[predicted_id]

    return predicted_id, probabilities, predicted_label

In [None]:
MODEL_PATH = '/root/lanyun-fs/models/chinese-roberta-wwm-ext'
MODEL_NAME = os.path.basename(MODEL_PATH)
MAX_LENGTH = 512
BATCH_SIZE = 16
LEARNING_RATE = 2e-5
WEIGHT_DECAY = 0.01
NUM_EPOCHS = 10
DROPOUT = 0.2

In [None]:
FOCAL_GAMMA = 2.0
class_weights = compute_class_weight(
    'balanced', 
    classes=np.array(list(id_to_label.keys())), 
    y=train_labels
)
criterion = FocalLoss(
    gamma=FOCAL_GAMMA,
    weight=class_weights,
    reduction='mean'
)

In [None]:
loaded_model = BertForSequenceClassification(bert_model_name=MODEL_NAME,num_labels=NUM_LABELS).to(device)
loaded_model.load_state_dict(torch.load(best_model_path,map_location=device))
loaded_model.eval()

In [None]:
val_list = []
val_leibie_list = []
val_prob_list = []
val_data_text = val_data['文本'].to_list()
val_data_label = val_data['label'].to_list()

for text in tqdm(val_data_text):
    pred_id, pred_prob, pred_label = predict_single_text(
        model=loaded_model,
        text=text,
        tokenizer=tokenizer,
        label_to_id=label_to_id,
        id_to_label=id_to_label,
        device=device,
        max_length=512
    )
    val_leibie_list.append(pred_label)
    val_prob_list.append(pred_prob)
    val_list.append(pred_id)

print("\nClassification Report:")
print(classification_report(val_data_label, np.array(val_list)))

In [None]:
data_test = pd.read_csv('dataset/test_text.csv')
data_test_text = data_test['文本'].to_list()

pred_list = []
for text in tqdm(data_test_text):
    pred_id, _, pred_label = predict_single_text(
        model=loaded_model,
        text=text,
        tokenizer=tokenizer,
        label_to_id=label_to_id,
        id_to_label=id_to_label,
        device=device,
        max_length=MAX_LENGTH
    )
    pred_list.append(pred_label)

submit = pd.read_csv('dataset/example.csv')
submit['类别'] = pred_list
submit.to_csv('dataset/submit.csv')