# 虚假新闻检测 - 模型训练与评估

本notebook实现模型的训练、评估和分析流程，包括：

1. 数据准备
   - 加载预处理后的特征
   - 数据集划分
   - 特征标准化

2. 模型训练
   - 多层感知机（MLP）
   - 传统机器学习模型（SVM、随机森林）
   - BERT模型、BiLSTM+Attention模型（计划中，本notebook暂未实现）

3. 模型评估
   - 准确率、精确率、召回率、F1分数
   - 混淆矩阵分析
   - ROC和PR曲线

4. 错误分析
   - 错误预测案例分析
   - 模型解释性分析
   - 改进建议

In [None]:
# 导入必要的库
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, random_split
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.metrics import confusion_matrix, roc_curve, precision_recall_curve, auc
from transformers import BertTokenizer
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from tqdm import tqdm
import logging

# Seed NumPy and PyTorch (CPU and, when present, every CUDA device) so that
# data splits, weight initialisation and shuffling are repeatable.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Apply the seaborn style FIRST: set_style() rewrites a group of rcParams that
# includes 'font.sans-serif', so calling it after the font assignment below
# would silently discard the CJK-capable font and Chinese labels would render
# as empty boxes.
sns.set_style("whitegrid")

# Fonts able to render the Chinese plot labels. matplotlib uses the first
# family in the list that is installed, so the original macOS font stays the
# first choice and the others are fallbacks for Windows / Linux.
plt.rcParams['font.sans-serif'] = [
    'Arial Unicode MS',      # macOS
    'PingFang SC',           # macOS
    'SimHei',                # Windows
    'Microsoft YaHei',       # Windows
    'Noto Sans CJK SC',      # Linux
    'WenQuanYi Micro Hei',   # Linux
    'DejaVu Sans',           # last resort (no CJK glyphs)
]
# Render the minus sign with an ASCII hyphen; many CJK fonts lack U+2212.
plt.rcParams['axes.unicode_minus'] = False

# Select the compute device.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")

# Configure logging.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

## 1. 数据准备

首先加载并准备训练数据：
1. 加载特征工程生成的特征
2. 加载对应的标签
3. 划分训练集、验证集和测试集
4. 数据标准化和格式转换

In [None]:
# Load the feature matrix and label vector produced by the feature-engineering
# step (paths are relative to the notebook's working directory).
features_path = Path('../results/combined_features.npy')
labels_path = Path('../results/labels.npy')

X = np.load(features_path)
y = np.load(labels_path)

print(f"特征形状: {X.shape}")
print(f"标签形状: {y.shape}")
print(f"\n标签分布:\n{pd.Series(y).value_counts(normalize=True)}")

# First split: hold out 20% of all samples as the test set, stratified on the
# label so the class ratio is preserved.
X_train_val, X_test, y_train_val, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Second split: 20% of the remaining data becomes the validation set, i.e.
# roughly 64% train / 16% validation / 20% test overall.
X_train, X_val, y_train, y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=SEED, stratify=y_train_val
)

print("\n数据集划分:")
print(f"训练集: {X_train.shape[0]} 样本")
print(f"验证集: {X_val.shape[0]} 样本")
print(f"测试集: {X_test.shape[0]} 样本")

# Standardise the features. The scaler is fitted on the training split only
# and then applied to validation/test, so their statistics do not leak into
# the transform.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Dataset wrapper around a (features, labels) pair.
class RumorDataset(Dataset):
    """In-memory dataset that serves one feature vector and its label per item.

    Args:
        features: array-like of per-sample feature rows; stored as a
            ``torch.float32`` tensor.
        labels: array-like of integer class labels, one per feature row;
            stored as a ``torch.int64`` tensor.

    Raises:
        ValueError: if ``features`` and ``labels`` do not contain the same
            number of samples.
    """

    def __init__(self, features, labels):
        # torch.tensor() always copies, so later in-place edits of the source
        # arrays cannot change the dataset.
        self.features = torch.tensor(np.asarray(features), dtype=torch.float32)
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.long)

        # Fail early: a mismatch would otherwise only surface as an
        # IndexError (or silently unused rows) in the middle of training.
        if self.features.shape[0] != self.labels.shape[0]:
            raise ValueError(
                f"features has {self.features.shape[0]} samples but "
                f"labels has {self.labels.shape[0]}"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``{'features': tensor, 'label': tensor}`` for sample ``idx``."""
        return {
            'features': self.features[idx],
            'label': self.labels[idx]
        }

# Wrap each standardised split in a RumorDataset.
train_dataset = RumorDataset(X_train_scaled, y_train)
val_dataset = RumorDataset(X_val_scaled, y_val)
test_dataset = RumorDataset(X_test_scaled, y_test)

BATCH_SIZE = 32

# Only the training loader shuffles; validation and test keep the dataset
# order, so predictions collected from them line up with y_val / y_test.
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE)

# Visualise the class balance of every split as one pie chart per split.
#
# value_counts() orders its result by frequency, not by label value, so the
# counts are re-sorted by label before plotting. Without this the fixed wedge
# names would be swapped whenever class 1 is the majority class.
# NOTE(review): assumes labels are encoded 0 = non-rumor, 1 = rumor — confirm.
class_names = ['非谣言', '谣言']

plt.figure(figsize=(10, 5))
for position, (split_labels, split_title) in enumerate(
    [
        (y_train, '训练集标签分布'),
        (y_val, '验证集标签分布'),
        (y_test, '测试集标签分布'),
    ],
    start=1,
):
    plt.subplot(1, 3, position)
    class_ratios = (
        pd.Series(split_labels).value_counts(normalize=True).sort_index()
    )
    plt.pie(class_ratios, labels=class_names, autopct='%1.1f%%')
    plt.title(split_title)

plt.tight_layout()
plt.show()

## 2. 模型定义和训练

我们将实现并训练以下模型：
1. 基于神经网络的模型：
   - 多层感知机 (MLP)

2. 传统机器学习模型：
   - SVM
   - 随机森林

注：BERT分类器和BiLSTM+Attention在本notebook中尚未实现，留作后续工作。

In [None]:
# Evaluation helper
def evaluate_model(model, data_loader, device):
    """Score ``model`` on every batch of ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    The predicted class of each sample is the index of its largest output.

    Args:
        model: module called as ``model(features)``; its output is reduced
            over ``dim=1`` to obtain the predicted class.
        data_loader: iterable of batches, each a dict with a ``'features'``
            tensor and a ``'label'`` tensor.
        device: device the feature tensors are moved to before the forward
            pass.

    Returns:
        dict with keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1'`` (the last three computed with ``average='binary'``).
    """
    model.eval()
    predicted, expected = [], []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch['features'].to(device))
            winners = torch.max(logits, dim=1).indices
            predicted.extend(winners.cpu().numpy())
            # Labels never leave the CPU, so they convert to NumPy directly.
            expected.extend(batch['label'].numpy())

    precision, recall, f1, _support = precision_recall_fscore_support(
        expected, predicted, average='binary'
    )
    return {
        'accuracy': accuracy_score(expected, predicted),
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }

# Training loop
def train_model(model, train_loader, val_loader, criterion, optimizer,
                scheduler, device, epochs=10,
                save_path='../results/models/best_model.pt'):
    """Train ``model`` and checkpoint the weights with the best validation F1.

    Each epoch runs one pass over ``train_loader``, evaluates on
    ``val_loader`` with ``evaluate_model``, steps the scheduler once, and
    writes ``model.state_dict()`` to ``save_path`` whenever the validation F1
    improves.

    Args:
        model: network to optimise in place.
        train_loader: iterable of dict batches with ``'features'`` and
            ``'label'`` tensors.
        val_loader: validation batches in the same format.
        criterion: loss called as ``criterion(outputs, labels)``.
        optimizer: optimiser over ``model``'s parameters.
        scheduler: learning-rate scheduler; ``step()`` is called once per
            epoch.
        device: device the batches are moved to.
        epochs: number of training epochs.
        save_path: file the best state dict is written to. Missing parent
            directories are created.

    Returns:
        list with one dict per epoch containing ``'epoch'`` (1-based),
        ``'train_loss'`` (mean batch loss) and the validation metrics
        returned by ``evaluate_model``.

    Note:
        On return ``model`` holds the weights of the LAST epoch, not the best
        one; load ``save_path`` to restore the best checkpoint.
    """
    save_path = Path(save_path)
    # torch.save() does not create directories; without this the first
    # improvement would abort training with a missing-directory error.
    save_path.parent.mkdir(parents=True, exist_ok=True)

    history = []
    # Start below any reachable F1 so the first epoch always writes a
    # checkpoint, even when its F1 is exactly 0.
    best_val_f1 = float('-inf')

    for epoch in range(epochs):
        # ---- training phase ----
        model.train()
        train_loss = 0

        for batch in tqdm(train_loader, desc=f'Epoch {epoch + 1}/{epochs}'):
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            outputs = model(features)
            loss = criterion(outputs, labels)

            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Mean loss per batch.
        train_loss = train_loss / len(train_loader)

        # ---- validation phase ----
        model.eval()
        val_metrics = evaluate_model(model, val_loader, device)

        # Advance the learning-rate schedule once per epoch.
        scheduler.step()

        # Keep the best-scoring weights on disk.
        if val_metrics['f1'] > best_val_f1:
            best_val_f1 = val_metrics['f1']
            torch.save(model.state_dict(), save_path)

        history.append({
            'epoch': epoch + 1,
            'train_loss': train_loss,
            **val_metrics
        })

        print(f'Epoch {epoch + 1}/{epochs}:')
        print(f'训练损失: {train_loss:.4f}')
        print(f'验证集指标: {val_metrics}')
        print('-' * 50)

    return history

# MLP model definition
class MLPClassifier(nn.Module):
    """Feed-forward classifier: input_dim -> 512 -> 256 -> 2.

    Both hidden layers are followed by ReLU and dropout with p=0.3. The final
    linear layer emits two raw scores per sample (no softmax).
    """

    def __init__(self, input_dim):
        super().__init__()
        stack = []
        in_features = input_dim
        for width in (512, 256):
            stack += [nn.Linear(in_features, width), nn.ReLU(), nn.Dropout(0.3)]
            in_features = width
        stack.append(nn.Linear(in_features, 2))
        # Attribute name is part of the state_dict keys ("layers.N.*").
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Return the two class scores for each row of ``x``."""
        return self.layers(x)

# Train the MLP on the standardised feature vectors.
print("训练MLP模型...")
# Input width is the number of feature columns.
mlp_model = MLPClassifier(X_train.shape[1]).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mlp_model.parameters(), lr=0.001)
# StepLR multiplies the learning rate by 0.1 after every 3 scheduler steps;
# train_model steps the scheduler once per epoch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.1)

# `epochs` is not passed, so train_model's default of 10 applies.
mlp_history = train_model(
    mlp_model, train_loader, val_loader,
    criterion, optimizer, scheduler, device
)

# Plot the MLP training history: loss on the left, validation scores on the
# right.
plt.figure(figsize=(12, 4))

# Left panel: mean training loss per epoch.
plt.subplot(1, 2, 1)
train_losses = [record['train_loss'] for record in mlp_history]
plt.plot(train_losses, label='训练损失')
plt.title('训练损失曲线')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()

# Right panel: validation F1 and accuracy per epoch.
plt.subplot(1, 2, 2)
for metric_key, legend_text in (('f1', 'F1分数'), ('accuracy', '准确率')):
    plt.plot([record[metric_key] for record in mlp_history], label=legend_text)
plt.title('验证集评估指标')
plt.xlabel('Epoch')
plt.ylabel('Score')
plt.legend()

plt.tight_layout()
plt.show()

# Evaluate the MLP on the held-out test split.
# NOTE(review): mlp_model holds the weights from the last training epoch at
# this point; train_model writes the best-F1 checkpoint to disk but nothing
# here loads it back — confirm which set of weights should be reported.
mlp_test_metrics = evaluate_model(mlp_model, test_loader, device)
print("\nMLP模型测试集表现:")
print(mlp_test_metrics)

# Classical machine-learning baselines
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

# SVM with an RBF kernel; probability=True enables predict_proba(), which the
# later ROC / PR analysis relies on.
print("\n训练SVM模型...")
svm = SVC(kernel='rbf', probability=True)
svm.fit(X_train_scaled, y_train)

svm_pred = svm.predict(X_test_scaled)

# Unpack explicitly instead of zipping the 4-tuple into the dict: with
# average='binary' the fourth element (support) is None, and storing it under
# a '_' key made this dict differ from evaluate_model's output and added a
# junk column to the model comparison table.
svm_precision, svm_recall, svm_f1, _ = precision_recall_fscore_support(
    y_test, svm_pred, average='binary'
)
svm_metrics = {
    'accuracy': accuracy_score(y_test, svm_pred),
    'precision': svm_precision,
    'recall': svm_recall,
    'f1': svm_f1,
}

print("SVM模型测试集表现:")
print(svm_metrics)

# Random forest baseline (100 trees, seeded for repeatability).
print("\n训练随机森林模型...")
rf = RandomForestClassifier(n_estimators=100, random_state=SEED)
rf.fit(X_train_scaled, y_train)

rf_pred = rf.predict(X_test_scaled)

# Unpack explicitly instead of zipping the 4-tuple into the dict: with
# average='binary' the fourth element (support) is None, and storing it under
# a '_' key made this dict differ from evaluate_model's output and added a
# junk column to the model comparison table.
rf_precision, rf_recall, rf_f1, _ = precision_recall_fscore_support(
    y_test, rf_pred, average='binary'
)
rf_metrics = {
    'accuracy': accuracy_score(y_test, rf_pred),
    'precision': rf_precision,
    'recall': rf_recall,
    'f1': rf_f1,
}

print("随机森林模型测试集表现:")
print(rf_metrics)

# Compare all models: one row per model, one column per metric.
models_comparison = pd.DataFrame({
    'MLP': mlp_test_metrics,
    'SVM': svm_metrics,
    'Random Forest': rf_metrics
}).T

# DataFrame.plot() opens its own figure, so a preceding plt.figure() call only
# leaves an empty extra figure behind and its figsize is never used. Pass the
# size to plot() instead; the plt.* calls below then act on that figure.
models_comparison[['accuracy', 'precision', 'recall', 'f1']].plot(
    kind='bar', figsize=(10, 6)
)
plt.title('模型性能比较')
plt.xlabel('模型')
plt.ylabel('分数')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 3. 详细模型评估

对模型进行深入评估：
1. 混淆矩阵分析
2. ROC曲线和AUC
3. 精确率-召回率曲线
4. 不同阈值下的性能分析

In [None]:
# Collect positive-class probabilities from a PyTorch model
def get_predictions_proba(model, data_loader, device):
    """Return the class-1 probability and the label of every sample.

    The model runs in eval mode without gradient tracking. Its outputs are
    passed through a softmax over ``dim=1`` and column 1 is kept.

    Args:
        model: module called as ``model(features)``.
        data_loader: iterable of dict batches with ``'features'`` and
            ``'label'`` tensors.
        device: device the feature tensors are moved to.

    Returns:
        tuple ``(probs, labels)`` of NumPy arrays in loader order.
    """
    model.eval()
    positive_scores = []
    targets = []

    with torch.no_grad():
        for batch in data_loader:
            logits = model(batch['features'].to(device))
            class_probs = torch.softmax(logits, dim=1).cpu().numpy()

            # Column 1 is the probability assigned to class index 1.
            positive_scores.extend(class_probs[:, 1])
            targets.extend(batch['label'].numpy())

    return np.array(positive_scores), np.array(targets)

# Positive-class (column 1) probabilities of every model on the test split.
# The labels returned for the MLP are discarded; y_test is used instead, which
# matches because test_loader is built without shuffling.
mlp_probs, _ = get_predictions_proba(mlp_model, test_loader, device)
svm_probs = svm.predict_proba(X_test_scaled)[:, 1]
rf_probs = rf.predict_proba(X_test_scaled)[:, 1]

# 1. Confusion-matrix analysis
def plot_confusion_matrix(y_true, y_pred, title):
    """Show the confusion matrix of ``y_pred`` against ``y_true`` as a heatmap.

    Rows are true labels and columns are predicted labels; each cell is
    annotated with its integer count.
    """
    matrix = confusion_matrix(y_true, y_pred)
    heatmap_options = dict(annot=True, fmt='d', cmap='Blues')

    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, **heatmap_options)
    plt.xlabel('预测标签')
    plt.ylabel('真实标签')
    plt.title(title)
    plt.show()

# Draw one confusion matrix per model.
# The SVM matrix uses svm.predict() rather than thresholding svm_probs: SVC's
# predict_proba() comes from a separately fitted probability calibration and
# can disagree with predict(), which is what the reported SVM test metrics
# were computed from.
plot_confusion_matrix(y_test, (mlp_probs > 0.5).astype(int), 'MLP模型混淆矩阵')
plot_confusion_matrix(y_test, svm.predict(X_test_scaled), 'SVM模型混淆矩阵')
plot_confusion_matrix(y_test, (rf_probs > 0.5).astype(int), '随机森林模型混淆矩阵')

# 2. ROC analysis: one curve per model plus the chance diagonal.
roc_inputs = (
    (mlp_probs, 'MLP'),
    (svm_probs, 'SVM'),
    (rf_probs, 'Random Forest'),
)

plt.figure(figsize=(10, 6))

for probs, name in roc_inputs:
    fpr, tpr, _ = roc_curve(y_test, probs)
    roc_auc = auc(fpr, tpr)
    curve_label = f'{name} (AUC = {roc_auc:.3f})'
    plt.plot(fpr, tpr, label=curve_label)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference.
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC曲线比较')
plt.xlabel('假阳性率')
plt.ylabel('真阳性率')
plt.legend()
plt.show()

# 3. Precision-recall curves
# Imported here because the notebook's top-level import cell does not bring in
# average_precision_score.
from sklearn.metrics import average_precision_score

plt.figure(figsize=(10, 6))

for probs, name in [(mlp_probs, 'MLP'), (svm_probs, 'SVM'), (rf_probs, 'Random Forest')]:
    precision, recall, _ = precision_recall_curve(y_test, probs)
    # The legend reports "AP", so compute average precision proper.
    # auc(recall, precision) is the trapezoidal area under the curve, which
    # linearly interpolates between operating points and is a different
    # (typically more optimistic) number.
    avg_precision = average_precision_score(y_test, probs)
    plt.plot(recall, precision, label=f'{name} (AP = {avg_precision:.3f})')

plt.xlabel('召回率')
plt.ylabel('精确率')
plt.title('精确率-召回率曲线比较')
plt.legend()
plt.show()

# 4. Performance as a function of the decision threshold
def threshold_performance(y_true, y_prob, thresholds):
    """Compute binary precision, recall and F1 at each decision threshold.

    A sample is predicted positive when ``y_prob >= threshold``.

    Args:
        y_true: array-like of true binary labels.
        y_prob: array-like of positive-class probabilities, aligned with
            ``y_true``.
        thresholds: iterable of thresholds to evaluate.

    Returns:
        DataFrame with columns ``threshold``, ``precision``, ``recall`` and
        ``f1``, one row per threshold in input order (no rows, but the same
        columns, when ``thresholds`` is empty).
    """
    # Accept plain lists as well as arrays; the comparison below needs an
    # ndarray to broadcast.
    y_true = np.asarray(y_true)
    y_prob = np.asarray(y_prob)

    metrics = []
    for threshold in thresholds:
        y_pred = (y_prob >= threshold).astype(int)
        # zero_division=0 keeps the default numeric result (0.0) when a
        # threshold yields no positive predictions, without emitting an
        # UndefinedMetricWarning for every such threshold.
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='binary', zero_division=0
        )
        metrics.append({
            'threshold': threshold,
            'precision': precision,
            'recall': recall,
            'f1': f1
        })
    return pd.DataFrame(metrics, columns=['threshold', 'precision', 'recall', 'f1'])

# Candidate decision thresholds 0.1, 0.2, ..., 0.9.
thresholds = np.arange(0.1, 1.0, 0.1)

# Tune the threshold on the VALIDATION split. Picking it on the test split
# would use test labels for model selection, making the test scores reported
# at that threshold optimistically biased.
mlp_val_probs, mlp_val_labels = get_predictions_proba(mlp_model, val_loader, device)
mlp_threshold_metrics = threshold_performance(mlp_val_labels, mlp_val_probs, thresholds)

plt.figure(figsize=(12, 6))
plt.plot(mlp_threshold_metrics['threshold'], mlp_threshold_metrics['precision'], label='精确率')
plt.plot(mlp_threshold_metrics['threshold'], mlp_threshold_metrics['recall'], label='召回率')
plt.plot(mlp_threshold_metrics['threshold'], mlp_threshold_metrics['f1'], label='F1分数')
plt.xlabel('阈值')
plt.ylabel('分数')
plt.title('MLP模型在不同阈值下的性能（验证集）')
plt.legend()
plt.grid(True)
plt.show()

# Threshold with the highest validation F1.
best_threshold_idx = mlp_threshold_metrics['f1'].idxmax()
selected_threshold = mlp_threshold_metrics.loc[best_threshold_idx, 'threshold']

# Score that single threshold on the untouched test split. best_threshold
# keeps its original shape (a row with threshold / precision / recall / f1)
# so later cells can still read best_threshold['threshold'].
best_threshold = threshold_performance(y_test, mlp_probs, [selected_threshold]).iloc[0]
print(f"MLP模型的最佳阈值: {best_threshold['threshold']:.2f}")
print("在最佳阈值下的测试集性能:")
print(f"精确率: {best_threshold['precision']:.4f}")
print(f"召回率: {best_threshold['recall']:.4f}")
print(f"F1分数: {best_threshold['f1']:.4f}")

## 4. 错误分析与模型解释

分析模型的预测错误：
1. 错误预测案例分析
2. 特征重要性分析
3. 模型预测置信度分析
4. 模型集成分析（多数投票）

改进建议见第5节。

In [None]:
# 1. Misclassified-sample analysis
def analyze_errors(y_true, y_pred, probs, features, n_samples=5):
    """Print error statistics and details for a handful of misclassified samples.

    Args:
        y_true: array of true labels.
        y_pred: array of predicted labels, aligned with ``y_true``.
        probs: per-sample probabilities, printed with four decimals.
        features: per-sample feature rows; the first ten values of each shown
            sample are printed.
        n_samples: maximum number of misclassified samples to show. When there
            are more errors than this, a subset is drawn without replacement
            using NumPy's global random state.
    """
    wrong_positions = np.where(y_true != y_pred)[0]
    n_wrong = len(wrong_positions)

    print(f"错误预测总数: {n_wrong}")
    print(f"错误率: {n_wrong / len(y_true):.2%}")

    # Show everything when the errors fit within the budget, otherwise sample.
    if n_wrong <= n_samples:
        shown = wrong_positions
    else:
        shown = np.random.choice(wrong_positions, n_samples, replace=False)

    print("\n错误预测案例分析:")
    separator = "-" * 50
    for pos in shown:
        print(separator)
        print(f"案例 {pos}:")
        print(f"真实标签: {y_true[pos]}")
        print(f"预测标签: {y_pred[pos]}")
        print(f"预测概率: {probs[pos]:.4f}")
        print(f"特征值前10个: {features[pos][:10]}")

# Error analysis for the MLP at the selected decision threshold.
# Use ">=" to match threshold_performance, which predicts positive when
# prob >= threshold; with ">" a sample sitting exactly on the threshold would
# be classified differently here than in the threshold sweep.
mlp_pred = (mlp_probs >= best_threshold['threshold']).astype(int)
analyze_errors(y_test, mlp_pred, mlp_probs, X_test_scaled)

# 2. Feature importance, taken from the fitted random forest.
feature_importance = pd.DataFrame({
    'feature': range(X_train.shape[1]),
    'importance': rf.feature_importances_
})
feature_importance = feature_importance.sort_values('importance', ascending=False)

# Plot at most 20 features; with fewer columns the fixed range(20) would not
# match the number of bar heights and plt.bar would raise.
top_n = min(20, len(feature_importance))
top_features = feature_importance.head(top_n)

plt.figure(figsize=(12, 6))
plt.bar(range(top_n), top_features['importance'].to_numpy())
# The x-axis is labelled as the feature index, so show each bar's actual
# column index instead of its rank position.
plt.xticks(range(top_n), top_features['feature'].to_numpy(), rotation=90)
plt.title(f'Top {top_n} 最重要特征')
plt.xlabel('特征索引')
plt.ylabel('重要性')
plt.tight_layout()
plt.show()

# 3. Prediction-confidence analysis
def analyze_confidence(probs, y_true):
    """Print sample count and accuracy per confidence bin.

    The predicted class is 1 when ``probs > 0.5`` and 0 otherwise. Confidence
    is the probability of the PREDICTED class, i.e. ``probs`` for predicted
    positives and ``1 - probs`` for predicted negatives, so it always lies in
    [0.5, 1.0]. Binning on the raw positive-class probability would leave
    every predicted negative out of the analysis.

    Bins are [0.5, 0.6), [0.6, 0.7), [0.7, 0.8), [0.8, 0.9) and [0.9, 1.0];
    the last bin includes 1.0 so fully confident predictions are counted.
    Empty bins are not printed.

    Args:
        probs: array-like of positive-class probabilities.
        y_true: array-like of true binary labels (0/1), aligned with
            ``probs``.
    """
    probs = np.asarray(probs, dtype=float)
    y_true = np.asarray(y_true)

    predicted = (probs > 0.5).astype(int)
    correct = predicted == y_true
    confidence = np.where(predicted == 1, probs, 1.0 - probs)

    confidence_bins = np.array([0.5, 0.6, 0.7, 0.8, 0.9, 1.0])
    last_bin = len(confidence_bins) - 2

    for i, (low, high) in enumerate(zip(confidence_bins[:-1], confidence_bins[1:])):
        if i == last_bin:
            # Closed on the right so confidence == 1.0 is not dropped.
            mask = (confidence >= low) & (confidence <= high)
        else:
            mask = (confidence >= low) & (confidence < high)
        if mask.any():
            n_samples = mask.sum()
            accuracy = correct[mask].mean()
            print(f"置信度 {low:.1f}-{high:.1f}: "
                  f"样本数={n_samples}, 准确率={accuracy:.2%}")

# Print the confidence breakdown of each model on the test split.
for model_label, model_probs in (
    ("MLP", mlp_probs),
    ("SVM", svm_probs),
    ("随机森林", rf_probs),
):
    print(f"\n{model_label}模型预测置信度分析:")
    analyze_confidence(model_probs, y_test)

# 4. Ensemble analysis
# Hard majority vote: a sample is predicted positive when at least two of the
# three models put its positive-class probability above 0.5.
# NOTE(review): the SVM vote thresholds svm_probs, which may differ from
# svm.predict() for some samples — confirm which one the vote should use.
votes = ((mlp_probs > 0.5).astype(int) +
         (svm_probs > 0.5).astype(int) +
         (rf_probs > 0.5).astype(int))
# Cast to int so the predictions have the same 0/1 integer form as the
# thresholded per-model predictions rather than a boolean array.
ensemble_pred = (votes >= 2).astype(int)

# Unpack explicitly: with average='binary' the fourth element (support) is
# None and must not end up in the metrics dict.
ens_precision, ens_recall, ens_f1, _ = precision_recall_fscore_support(
    y_test, ensemble_pred, average='binary'
)
ensemble_metrics = {
    'accuracy': accuracy_score(y_test, ensemble_pred),
    'precision': ens_precision,
    'recall': ens_recall,
    'f1': ens_f1,
}

print("\n集成模型性能：")
print(ensemble_metrics)

# Add the ensemble as a new row. Assigning a Series aligns the values with the
# table's columns by name; any column without a matching key becomes NaN.
models_comparison.loc['Ensemble'] = pd.Series(ensemble_metrics)

# Redraw the comparison chart including the ensemble row.
# DataFrame.plot() opens its own figure, so a preceding plt.figure() call only
# leaves an empty extra figure behind and its figsize is never used. Pass the
# size to plot() instead; the plt.* calls below then act on that figure.
models_comparison[['accuracy', 'precision', 'recall', 'f1']].plot(
    kind='bar', figsize=(12, 6)
)
plt.title('所有模型性能比较（包含集成模型）')
plt.xlabel('模型')
plt.ylabel('分数')
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()

## 5. 总结与改进建议

基于上述分析，我们得出以下结论：

1. **模型性能比较**
   - 各个模型的优缺点
   - 最佳模型的选择
   - 集成方法的效果

2. **关键发现**
   - 最重要的特征
   - 常见的错误类型
   - 模型的置信度特征

3. **改进建议**
   - 特征工程优化
   - 模型架构改进
   - 集成策略优化
   - 数据增强方案

4. **部署建议**
   - 模型选择
   - 阈值设置
   - 监控指标