In [1]:
# 导入必要的库
import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel, AdamW, get_linear_schedule_with_warmup
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/dangerous-driving/0-ori-data-dangerous driving.xlsx
/kaggle/input/dangerous-driving/0-ori-data-dangerous driving ver3.0.xlsx


In [2]:
# 设置随机种子保证可复现性
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)

In [3]:
df = pd.read_excel("/kaggle/input/dangerous-driving/0-ori-data-dangerous driving ver3.0.xlsx")

In [4]:
df.info()
print(df["indictment"].sum(), 46179 - df["indictment"].sum())
print(df["indictment"].sum() / (46179 - df["indictment"].sum()))

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 46179 entries, 0 to 46178
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   texts       46179 non-null  object
 1   indictment  46179 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 721.7+ KB
40997 5182
7.91142416055577


In [5]:
from sklearn.utils import resample

def prepare_data_safely(df, test_size=0.2, augment_multiplier=3):
    """
    更安全的数据预处理流程：
    1. 先分割训练集和测试集（保持原始分布）
    2. 对训练集的少数类进行文本增强
    3. 上采样增强后的少数类到平衡数量
    """
    # 第一步：分割原始数据为训练集和测试集（保持原始分布）
    train_df, test_df = train_test_split(
        df, 
        test_size=test_size,
        stratify=df['indictment'],
        random_state=SEED
    )
    
    # 第二步：在训练集内部划分训练和验证（避免测试集参与任何处理）
    train_part, val_df = train_test_split(
        train_df,
        test_size=0.2,
        stratify=train_df['indictment'],
        random_state=SEED
    )
    
    # 对训练集的少数类进行文本增强（仅增强训练部分，不增强验证集）
    minority_train = train_part[train_part['indictment'] == 0]
    majority_train = train_part[train_part['indictment'] == 1]
    
    minority_upsampled = resample(
        minority_train,
        replace=True,  # 允许重复采样
        n_samples=len(majority_train),  # 采样到与多数类相同数量
        random_state=SEED
    )
    
    # 合并并打乱顺序
    balanced_train = pd.concat([majority_train, minority_upsampled]).sample(frac=1, random_state=SEED)
    
    return balanced_train, val_df, test_df  # 返回平衡训练集、验证集、测试集

In [6]:
balanced_train_df, val_df, test_df = prepare_data_safely(df)  # 这里假设df是已经加载的DataFrame

In [7]:
# 为所有数据添加split标识
balanced_train_df['split'] = 'train'
val_df['split'] = 'validation'
test_df['split'] = 'test'

# 合并保存完整划分信息
full_split_df = pd.concat([balanced_train_df, val_df, test_df])
full_split_df.to_excel("/kaggle/working/data_split_details.xlsx", index=False)

In [8]:
# ========================== 数据集类定义 ==========================
class LegalDataset(Dataset):
    """自定义数据集类，处理文本编码"""
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]

        # BERT编码（包含padding和truncation）
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'label': torch.tensor(label, dtype=torch.long)
        }

In [9]:
# ========================== 模型定义 ==========================
class BERTLSTMClassifier(nn.Module):
    """支持多GPU的BERT-LSTM分类器"""
    def __init__(self, bert_model='bert-base-chinese', hidden_size=256, lstm_layers=1):
        super().__init__()
        # 加载BERT模型
        self.bert = BertModel.from_pretrained(bert_model)
        self.bert_hidden_size = self.bert.config.hidden_size
        
        # LSTM层
        self.lstm = nn.LSTM(
            input_size=self.bert_hidden_size,
            hidden_size=hidden_size,
            num_layers=lstm_layers,
            batch_first=True,
            bidirectional=False
        )
        
        # 分类器
        self.classifier = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(hidden_size, 64),
            nn.ReLU(),
            nn.Linear(64, 2)
        )

    def forward(self, input_ids, attention_mask, labels=None):
        # BERT输出
        outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        sequence_output = outputs.last_hidden_state
        
        # LSTM处理
        lstm_out, _ = self.lstm(sequence_output)
        last_lstm = lstm_out[:, -1, :]  # 取最后一个时间步
        
        # 分类
        logits = self.classifier(last_lstm)
        
        # 统一返回格式（适配多GPU）
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)
        
        return {'loss': loss, 'logits': logits}

In [10]:
# ========================== 训练准备 ==========================
# ========================== 设备设置 ==========================
# 检测可用GPU数量
num_gpus = torch.cuda.device_count()
print(f"可用GPU数量: {num_gpus}")

# 初始化组件
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
model = BERTLSTMClassifier()
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# 如果检测到多GPU，启用数据并行
if num_gpus > 1:
    print(f"\n启用数据并行，使用 {num_gpus} 个GPU")
    model = nn.DataParallel(model)

# 将模型移动到设备（单GPU或多GPU）
model = model.to(device)

# 创建DataLoader
def create_dataloader(df, tokenizer, batch_size=16, shuffle=True):
    texts = df['texts'].tolist()
    labels = df['indictment'].tolist()
    dataset = LegalDataset(texts, labels, tokenizer)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle)

train_loader = create_dataloader(balanced_train_df, tokenizer, batch_size=16)
val_loader = create_dataloader(val_df, tokenizer, batch_size=16, shuffle=False)

# 优化器和学习率调度
def get_model_parameters(model):
    """适配DataParallel的参数获取方式"""
    if isinstance(model, nn.DataParallel):
        return [
            {'params': model.module.bert.parameters(), 'lr': 2e-5},
            {'params': model.module.lstm.parameters(), 'lr': 1e-3},
            {'params': model.module.classifier.parameters(), 'lr': 1e-3}
        ]
    else:
        return [
            {'params': model.bert.parameters(), 'lr': 2e-5},
            {'params': model.lstm.parameters(), 'lr': 1e-3},
            {'params': model.classifier.parameters(), 'lr': 1e-3}
        ]

optimizer = AdamW(get_model_parameters(model))

total_steps = len(train_loader) * 5  # 总训练步数（假设训练5个epoch）
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=100,
    num_training_steps=total_steps
)

可用GPU数量: 2


tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/269k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/624 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/412M [00:00<?, ?B/s]


启用数据并行，使用 2 个GPU


In [11]:
# ========================== 训练函数 ==========================
def train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=5, patience=2):
    """支持多GPU的训练函数"""
    best_f1 = 0
    epochs_no_improve = 0
    device = next(model.parameters()).device  # 自动获取设备
    
    for epoch in range(epochs):
        # 阶段控制：前2个epoch冻结BERT
        if epoch < 2:
            print("\n阶段1：冻结BERT，训练分类层")
            # 获取实际BERT模块（适配DataParallel）
            bert_module = model.module.bert if isinstance(model, nn.DataParallel) else model.bert
            for param in bert_module.parameters():
                param.requires_grad = False
        else:
            print("\n阶段2：微调整个模型")
            bert_module = model.module.bert if isinstance(model, nn.DataParallel) else model.bert
            for param in bert_module.parameters():
                param.requires_grad = True

        # 训练循环
        model.train()
        total_loss = 0
        all_preds = []
        all_labels = []
        
        for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            
            # 前向传播
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask,
                labels=labels
            )
            
            # 处理多GPU的loss（可能返回张量列表）
            loss = outputs['loss']
            if loss is not None:
                if loss.dim() > 0:  # 多GPU情况下loss是向量
                    loss = loss.mean()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()
                total_loss += loss.item()
            
            # 获取预测结果
            logits = outputs['logits']
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())

        # 训练集指标
        train_acc = accuracy_score(all_labels, all_preds)
        train_f1 = f1_score(all_labels, all_preds)
        avg_loss = total_loss / len(train_loader) if len(train_loader) > 0 else 0
        
        # 验证集评估
        val_acc, val_f1, val_cm = evaluate(model, val_loader)
        
        print(f"\nEpoch {epoch+1}/{epochs}")
        print(f"Train Loss: {avg_loss:.4f} | Acc: {train_acc:.4f} | F1: {train_f1:.4f}")
        print(f"Val Acc: {val_acc:.4f} | Val F1: {val_f1:.4f}")
        print("混淆矩阵:")
        print(val_cm)
        
        # 早停机制
        if val_f1 > best_f1:
            best_f1 = val_f1
            epochs_no_improve = 0
            # 保存模型（适配DataParallel）
            if isinstance(model, nn.DataParallel):
                torch.save(model.module.state_dict(), 'best_model.bin')
            else:
                torch.save(model.state_dict(), 'best_model.bin')
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print("\n早停：验证集指标未提升，停止训练！")
                break

In [12]:
# ========================== 评估函数 ==========================
def evaluate(model, dataloader):
    """支持多GPU的评估函数"""
    model.eval()
    device = next(model.parameters()).device
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            logits = outputs['logits']
            preds = torch.argmax(logits, dim=1).cpu().numpy()
            
            all_preds.extend(preds)
            all_labels.extend(labels.cpu().numpy())
    
    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    cm = confusion_matrix(all_labels, all_preds)
    return acc, f1, cm

In [13]:
# ========================== 训练并保存模型 ==========================
if __name__ == "__main__":
    # 训练模型（训练完成后自动保存最佳模型到文件）
    train_model(model, train_loader, val_loader, optimizer, scheduler, epochs=5)
    
    # 确保模型保存成功
    assert os.path.exists('/kaggle/working/best_model.bin'), "模型保存失败！"
    print("\n模型已保存到 /kaggle/working/best_model.bin")


阶段1：冻结BERT，训练分类层


Epoch 1: 100%|██████████| 3280/3280 [23:52<00:00,  2.29it/s]



Epoch 1/5
Train Loss: 0.3236 | Acc: 0.8603 | F1: 0.8599
Val Acc: 0.9040 | Val F1: 0.9431
混淆矩阵:
[[ 804   25]
 [ 684 5876]]

阶段1：冻结BERT，训练分类层


Epoch 2: 100%|██████████| 3280/3280 [23:57<00:00,  2.28it/s]



Epoch 2/5
Train Loss: 0.1413 | Acc: 0.9536 | F1: 0.9533
Val Acc: 0.9414 | Val F1: 0.9660
混淆矩阵:
[[ 814   15]
 [ 418 6142]]

阶段2：微调整个模型


Epoch 3: 100%|██████████| 3280/3280 [59:06<00:00,  1.08s/it]



Epoch 3/5
Train Loss: 0.1329 | Acc: 0.9648 | F1: 0.9646
Val Acc: 0.9800 | Val F1: 0.9886
混淆矩阵:
[[ 803   26]
 [ 122 6438]]

阶段2：微调整个模型


Epoch 4: 100%|██████████| 3280/3280 [59:05<00:00,  1.08s/it]



Epoch 4/5
Train Loss: 0.0582 | Acc: 0.9866 | F1: 0.9866
Val Acc: 0.9804 | Val F1: 0.9889
混淆矩阵:
[[ 806   23]
 [ 122 6438]]

阶段2：微调整个模型


Epoch 5: 100%|██████████| 3280/3280 [59:05<00:00,  1.08s/it]



Epoch 5/5
Train Loss: 0.0221 | Acc: 0.9955 | F1: 0.9955
Val Acc: 0.9884 | Val F1: 0.9934
混淆矩阵:
[[ 798   31]
 [  55 6505]]

模型已保存到 /kaggle/working/best_model.bin


In [14]:
# ========================== 测试集评估 ==========================
def test_on_saved_model(test_loader, model_path='/kaggle/working/best_model.bin'):
    # 加载已保存的模型（适配Kaggle环境）
    model = BERTLSTMClassifier()
    model.load_state_dict(torch.load(model_path))
    model = model.to(device)
    model.eval()
    
    # 执行测试
    test_acc, test_f1, test_cm = evaluate(model, test_loader)
    
    print("\n" + "="*50)
    print(f"测试集最终结果（样本总数：{len(test_loader.dataset)}）")
    print(f"Accuracy: {test_acc:.4f} | F1 Score: {test_f1:.4f}")
    print("混淆矩阵（真实类别 vs 预测类别）：")
    print(test_cm)
    print("="*50)

# 执行测试（确保测试集DataLoader已创建）
test_loader = create_dataloader(test_df, tokenizer, batch_size=16, shuffle=False)
test_on_saved_model(test_loader)


测试集最终结果（样本总数：9236）
Accuracy: 0.9880 | F1 Score: 0.9932
混淆矩阵（真实类别 vs 预测类别）：
[[ 995   41]
 [  70 8130]]


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def load_model(model_path):
    # 初始化模型结构（必须与训练时一致）
    model = BERTLSTMClassifier() 
    # 加载权重（自动适配CPU/GPU）
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()
    return model

In [16]:
import pandas as pd
import torch
from tqdm import tqdm

def export_misclassified_samples(test_loader, model, test_df, save_path="misclassified_samples.xlsx"):
    """二分类模型专用错误样本导出"""
    model.eval()
    
    error_samples = []
    
    with torch.no_grad():
        for batch_idx, batch in tqdm(enumerate(test_loader)):
            # 设备转移
            inputs = {
                "input_ids": batch["input_ids"].to(device),
                "attention_mask": batch["attention_mask"].to(device)
            }
            labels = batch["label"].to(device)
            
            # 模型推理（适配返回字典的特性）
            outputs = model(**inputs)
            logits = outputs['logits']  # 直接从字典获取logits
            
            # 获取预测结果和置信度
            probs = F.softmax(logits, dim=1)
            confidences, preds = torch.max(probs, dim=1)
            
            # 记录错误样本（带全局索引）
            batch_size = labels.size(0)
            global_indices = [batch_idx * test_loader.batch_size + i for i in range(batch_size)]
            
            for i in range(batch_size):
                if preds[i] != labels[i]:
                    error_samples.append({
                        "global_idx": global_indices[i],
                        "text": test_df.iloc[global_indices[i]]["texts"],  # 替换为实际文本列名
                        "true_label": labels[i].item(),
                        "pred_label": preds[i].item(),
                        "confidence": confidences[i].item(),
                        "prob_0": probs[i][0].item(),  # 类别0的概率
                        "prob_1": probs[i][1].item()   # 类别1的概率
                    })
    
    # 生成报告
    if error_samples:
        error_df = pd.DataFrame(error_samples)
        
        # 添加错误类型分析
        error_df["error_type"] = error_df.apply(
            lambda x: f"True_{x['true_label']}->Pred_{x['pred_label']}", axis=1)
        
        # 按置信度排序
        error_df = error_df.sort_values("confidence", ascending=False)
        
        # 保存到Excel（包含详细概率）
        error_df.to_excel(save_path, 
                        columns=["global_idx", "text", "true_label", 
                                "pred_label", "confidence", "prob_0", 
                                "prob_1", "error_type"],
                        index=False)
        print(f"发现 {len(error_df)} 个错误样本，已保存至 {save_path}")
    else:
        print("本次测试全部正确！")
    
    return error_df if error_samples else pd.DataFrame()





trained_model = load_model("/kaggle/working/best_model.bin")  # 替换你的实际路径

test_loader = create_dataloader(test_df, tokenizer, batch_size=16, shuffle=False)

error_report = export_misclassified_samples(
    test_loader=test_loader,
    model=trained_model,
    test_df=test_df.rename(columns={"indictment": "label"}),  # 假设原始标签列名为indictment
    save_path="binary_errors.xlsx"
)

578it [05:37,  1.71it/s]


发现 111 个错误样本，已保存至 binary_errors.xlsx
