# In [None]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import KFold, StratifiedKFold
from torch.utils.data import DataLoader, TensorDataset
from xgboost import XGBClassifier

warnings.filterwarnings("ignore")

# === Load data ===
dataTrain = pd.read_csv("allAtt_onehot_large_train_new8.csv")
dataTest = pd.read_csv("allAtt_onehot_large_test_new8.csv")

# Columns 4..37 are used as model inputs; every column from 38 onward is
# treated as a label column.
feature_cols = slice(4, 38)
label_cols = slice(38, None)

x_train = dataTrain.iloc[:, feature_cols].values
y_train = dataTrain.iloc[:, label_cols].values
x_test = dataTest.iloc[:, feature_cols].values
y_test = dataTest.iloc[:, label_cols].values

# Collapse the label columns to one integer class id per row (index of the
# largest label value).
y_train_int = y_train.argmax(axis=1)
y_test_int = y_test.argmax(axis=1)

print(f"训练集形状: {x_train.shape}, 测试集形状: {x_test.shape}")
print(f"类别分布 - 训练集: {np.bincount(y_train_int)}, 测试集: {np.bincount(y_test_int)}")

# === Build stacking features with cross-validation ===
# Out-of-fold (OOF) predictions: every training row is predicted by base
# models that were fitted without it, so the meta-learner is not trained on
# leaked base-learner outputs.
#
# Folds are stratified on y_train_int so each fold keeps the class ratio of
# the full training set; label-blind folds can leave a fold with a skewed
# (or missing) class.
n_folds = 5
kf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

n_train = x_train.shape[0]
n_test = x_test.shape[0]
n_classes = 2

# OOF class probabilities for the training set, one array per base learner.
xgb_train_preds = np.zeros((n_train, n_classes))
lgb_train_preds = np.zeros((n_train, n_classes))
rf_train_preds = np.zeros((n_train, n_classes))

# Test-set class probabilities, accumulated as the mean over the fold models.
xgb_test_preds = np.zeros((n_test, n_classes))
lgb_test_preds = np.zeros((n_test, n_classes))
rf_test_preds = np.zeros((n_test, n_classes))

print("开始进行交叉验证生成堆叠特征...")

for fold, (train_idx, val_idx) in enumerate(kf.split(x_train, y_train_int)):
    print(f"处理第 {fold+1}/{n_folds} 折...")

    # Split the training data for this fold.
    X_fold_train, X_fold_val = x_train[train_idx], x_train[val_idx]
    y_fold_train = y_train_int[train_idx]

    # Fresh, identically configured base learners for every fold.
    fold_xgb = XGBClassifier(objective="multi:softprob", num_class=2, eval_metric="mlogloss",
                             use_label_encoder=False, random_state=42)
    fold_lgb = LGBMClassifier(objective='multiclass', num_class=2, random_state=42)
    fold_rf = RandomForestClassifier(n_estimators=100, random_state=42)

    # (model, OOF array to fill, test-prediction array to accumulate into)
    fold_models = (
        (fold_xgb, xgb_train_preds, xgb_test_preds),
        (fold_lgb, lgb_train_preds, lgb_test_preds),
        (fold_rf, rf_train_preds, rf_test_preds),
    )
    for fold_model, oof_preds, test_preds in fold_models:
        fold_model.fit(X_fold_train, y_fold_train)

        val_proba = fold_model.predict_proba(X_fold_val)
        # A (k, 1) result would be silently broadcast into the (k, n_classes)
        # slot below, so fail loudly on any shape mismatch instead.
        if val_proba.shape != (len(val_idx), n_classes):
            raise ValueError(
                f"{type(fold_model).__name__} returned probabilities of shape "
                f"{val_proba.shape} in fold {fold + 1}; expected "
                f"{(len(val_idx), n_classes)}"
            )
        oof_preds[val_idx] = val_proba

        # In-place add: each fold contributes 1/n_folds of the test average.
        test_preds += fold_model.predict_proba(x_test) / n_folds

# === Prepare stacked features for the LSTM ===
# Number of base models and number of classes each of them outputs.
n_models = 3  # XGBoost, LightGBM, RandomForest
n_classes = 2  # binary classification

# Arrange the predicted probabilities as
# [samples, time steps (one per base model), features (one per class)].
train_probs_reshaped = np.zeros((n_train, n_models, n_classes))
for model_pos, oof_probs in enumerate((xgb_train_preds, lgb_train_preds, rf_train_preds)):
    train_probs_reshaped[:, model_pos, :] = oof_probs

test_probs_reshaped = np.zeros((n_test, n_models, n_classes))
for model_pos, avg_probs in enumerate((xgb_test_preds, lgb_test_preds, rf_test_preds)):
    test_probs_reshaped[:, model_pos, :] = avg_probs

print(f"重塑后的堆叠特征形状 - 训练集: {train_probs_reshaped.shape}, 测试集: {test_probs_reshaped.shape}")

# === Base-learner performance ===
base_models = ['XGBoost', 'LightGBM', 'RandomForest']

# Hard class predictions taken from the out-of-fold probabilities, in the
# same order as base_models.
base_preds = [
    oof_probs.argmax(axis=1)
    for oof_probs in (xgb_train_preds, lgb_train_preds, rf_train_preds)
]

print("\n基础学习器在交叉验证上的性能:")
for name, preds in zip(base_models, base_preds):
    acc = accuracy_score(y_train_int, preds)
    print(f"{name} CV 准确率: {acc:.4f}")

# === 构建 LSTM 元模型 ===
class LSTMStack(nn.Module):
    """LSTM meta-model with attention pooling over the sequence axis.

    The input sequence is encoded by an (optionally bidirectional) LSTM, the
    per-step LSTM outputs are combined with learned attention weights, and the
    pooled vector is passed through a small feed-forward head.

    Args:
        input_size: Number of features per sequence step.
        hidden_size: LSTM hidden size; also the width of the attention and
            output-head hidden layers.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout rate for the output head, and for the LSTM when
            ``num_layers > 1``.
        bidirectional: Whether the LSTM runs in both directions.
        output_dim: Number of output classes.
    """

    def __init__(self, input_size, hidden_size, num_layers=2, dropout=0.3, bidirectional=True, output_dim=2):
        super().__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bidirectional = bidirectional
        if bidirectional:
            self.directions = 2
        else:
            self.directions = 1

        # Width of each per-step LSTM output vector.
        encoded_dim = hidden_size * self.directions

        # LSTM dropout is only requested when there is more than one layer.
        if num_layers > 1:
            lstm_dropout = dropout
        else:
            lstm_dropout = 0

        # Sequence encoder.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=lstm_dropout,
            bidirectional=bidirectional,
        )

        # Attention scorer: one scalar score per sequence step.
        self.attention = nn.Sequential(
            nn.Linear(encoded_dim, hidden_size),
            nn.Tanh(),
            nn.Linear(hidden_size, 1),
        )

        # Classification head.
        self.fc = nn.Sequential(
            nn.Linear(encoded_dim, hidden_size),
            nn.BatchNorm1d(hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_size, output_dim),
        )

        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Return class probabilities for a batch of sequences.

        Args:
            x: Tensor of shape ``[batch_size, seq_len, input_size]``.

        Returns:
            Tensor of shape ``[batch_size, output_dim]``; softmax has already
            been applied over dim 1, so these are probabilities, not logits.
        """
        # [batch_size, seq_len, hidden_size * directions]
        encoded, _state = self.lstm(x)

        # Scores are [batch_size, seq_len, 1]; normalise across the steps.
        step_weights = self.attention(encoded).softmax(dim=1)

        # Attention-weighted sum over steps -> [batch_size, hidden_size * directions]
        pooled = (step_weights * encoded).sum(dim=1)

        return self.softmax(self.fc(pooled))

# === Tensors, hold-out split and data loader for the LSTM meta-learner ===
train_probs_tensor = torch.FloatTensor(train_probs_reshaped)
test_probs_tensor = torch.FloatTensor(test_probs_reshaped)
y_train_tensor = torch.LongTensor(y_train_int)
y_test_tensor = torch.LongTensor(y_test_int)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hold out a fixed random 20% of the stacked training rows for validation.
# These rows are excluded from train_loader, so early stopping, checkpoint
# selection and LR scheduling are driven by samples the meta-model has not
# been fitted on.
n_meta_samples = len(train_probs_tensor)
val_size = max(1, int(0.2 * n_meta_samples))
meta_perm = torch.randperm(n_meta_samples, generator=torch.Generator().manual_seed(42))
meta_val_idx, meta_fit_idx = meta_perm[:val_size], meta_perm[val_size:]
if len(meta_fit_idx) < 2:
    raise ValueError(
        "Too few stacked training samples to split into training and validation sets"
    )

val_inputs = train_probs_tensor[meta_val_idx].to(device)
val_labels = y_train_tensor[meta_val_idx].to(device)

batch_size = 64
train_dataset = TensorDataset(train_probs_tensor[meta_fit_idx], y_train_tensor[meta_fit_idx])
# BatchNorm1d in the model's output head cannot normalise a single-sample
# batch in training mode, so drop a trailing batch of size 1 if one occurs.
drop_last = len(train_dataset) > batch_size and len(train_dataset) % batch_size == 1
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=drop_last)

# Initialise the LSTM model
input_size = n_classes  # features per time step = classes output by each base model
hidden_size = 64
num_layers = 2
model = LSTMStack(
    input_size=input_size,
    hidden_size=hidden_size,
    num_layers=num_layers,
    dropout=0.3,
    bidirectional=True,
    output_dim=2
)

# Loss and optimiser.
# LSTMStack.forward already returns softmax probabilities. CrossEntropyLoss
# expects raw logits and applies log-softmax itself, so feeding it
# probabilities would apply softmax twice. NLLLoss on log-probabilities is
# the matching loss for a probability-output model.
criterion = nn.NLLLoss()
log_prob_eps = 1e-12  # keeps log() finite if a probability underflows to 0
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)
# `verbose` is deprecated in recent PyTorch releases; LR reductions are
# reported manually in the training loop instead.
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3)

# === Train the LSTM meta-model ===
epochs = 50
model.to(device)

# Early-stopping state
best_val_loss = float('inf')
patience = 10
counter = 0
best_model_path = 'best_lstm_stack_model.pt'

# Training history
train_losses = []
val_accuracies = []

print("\n开始训练LSTM元学习器...")
for epoch in range(epochs):
    model.train()
    total_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)

        # Forward pass
        outputs = model(inputs)
        loss = criterion(torch.log(outputs.clamp_min(log_prob_eps)), labels)

        # Backward pass and optimisation step
        optimizer.zero_grad()
        loss.backward()

        # Clip gradients to guard against exploding gradients
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    # Validation on the held-out rows
    model.eval()
    with torch.no_grad():
        val_outputs = model(val_inputs)
        # .item() stores a plain float instead of a tensor in best_val_loss
        val_loss = criterion(torch.log(val_outputs.clamp_min(log_prob_eps)), val_labels).item()
        _, val_preds = torch.max(val_outputs, 1)

    val_accuracy = accuracy_score(val_labels.cpu().numpy(), val_preds.cpu().numpy())
    val_accuracies.append(val_accuracy)

    print(f'Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')

    # Early stopping: checkpoint on improvement, otherwise count towards patience
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        counter = 0
        torch.save(model.state_dict(), best_model_path)
    else:
        counter += 1
        if counter >= patience:
            print(f'Early stopping at epoch {epoch+1}')
            break

    # Learning-rate schedule, with an explicit message when the LR is reduced
    lr_before = optimizer.param_groups[0]['lr']
    scheduler.step(val_loss)
    lr_after = optimizer.param_groups[0]['lr']
    if lr_after < lr_before:
        print(f'Epoch {epoch+1}: reducing learning rate to {lr_after:.4e}')

# Restore the best checkpoint; map_location keeps this working when the
# checkpoint was written on a different device than the current one.
model.load_state_dict(torch.load(best_model_path, map_location=device))

# === Plot training history ===
plt.figure(figsize=(12, 4))

# One panel per tracked series: (subplot position, values, title, y-axis label).
history_panels = (
    (1, train_losses, '训练损失', 'Loss'),
    (2, val_accuracies, '验证准确率', 'Accuracy'),
)
for panel_pos, series, panel_title, y_label in history_panels:
    plt.subplot(1, 2, panel_pos)
    plt.plot(series)
    plt.title(panel_title)
    plt.xlabel('Epoch')
    plt.ylabel(y_label)

plt.tight_layout()
plt.savefig('lstm_training_history.png')
plt.close()

# === Evaluate the LSTM meta-model ===
model.eval()
test_probs_tensor = test_probs_tensor.to(device)
with torch.no_grad():
    outputs = model(test_probs_tensor)

# Predicted class = index of the largest output per row.
predicted = torch.max(outputs, 1).indices.cpu().numpy()

# Performance metrics on the test set
lstm_cm = confusion_matrix(y_test_int, predicted)
lstm_report = classification_report(y_test_int, predicted)
lstm_acc = accuracy_score(y_test_int, predicted)

print("\n=== LSTM元模型性能 ===")
print(f"✅ LSTM堆叠集成准确率: {lstm_acc:.4f}")
print("📊 分类报告:")
print(lstm_report)

# Confusion-matrix heatmap, saved to disk
class_ticks = ['0', '1']
plt.figure(figsize=(8, 6))
sns.heatmap(
    lstm_cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_ticks,
    yticklabels=class_ticks,
)
plt.xlabel('预测标签')
plt.ylabel('真实标签')
plt.title('LSTM元模型混淆矩阵')
plt.savefig('lstm_confusion_matrix.png')
plt.close()

# === Comparison with a majority-vote ensemble ===
# Hard test-set predictions of each base learner, in base_models order.
test_base_preds = [
    avg_probs.argmax(axis=1)
    for avg_probs in (xgb_test_preds, lgb_test_preds, rf_test_preds)
]

# One row per test sample, one column per base learner; the most frequent
# label in each row wins the vote.
test_votes = np.array(test_base_preds).T
vote_result = np.array([np.bincount(sample_votes).argmax() for sample_votes in test_votes])

vote_acc = accuracy_score(y_test_int, vote_result)
vote_report = classification_report(y_test_int, vote_result)

print("\n=== 多数投票系统性能 ===")
print(f"✅ 投票系统准确率: {vote_acc:.4f}")
print("📊 分类报告:")
print(vote_report)

# Per-model baseline reports on the test set
print("\n=== 基线模型分类报告 ===")
for name, preds in zip(base_models, test_base_preds):
    baseline_acc = accuracy_score(y_test_int, preds)
    baseline_report = classification_report(y_test_int, preds)
    print(f"\n---- {name} ----")
    print(f"准确率: {baseline_acc:.4f}")
    print("分类报告:")
    print(baseline_report)

# === Compare the accuracy of all models ===
base_accs = [accuracy_score(y_test_int, preds) for preds in test_base_preds]

model_names = [*base_models, '投票系统', 'LSTM元模型']
model_accs = [*base_accs, vote_acc, lstm_acc]

plt.figure(figsize=(10, 6))
bar_colors = ['blue', 'green', 'red', 'purple', 'orange']
bars = plt.bar(model_names, model_accs, color=bar_colors)

# Write each accuracy value just above its bar.
for bar, acc_val in zip(bars, model_accs):
    label_x = bar.get_x() + bar.get_width()/2
    label_y = bar.get_height() + 0.01
    plt.text(label_x, label_y, f'{acc_val:.4f}', ha='center', va='bottom')

# Leave headroom above the tallest bar for the value labels.
plt.ylim(0, max(model_accs) + 0.1)
plt.xlabel('模型')
plt.ylabel('准确率')
plt.title('各个模型在测试集上的性能比较')
plt.savefig('lstm_model_comparison.png')
plt.close()

print("\n=== 各个模型在测试集上的性能比较 ===")
for name, acc_val in zip(model_names, model_accs):
    print(f"{name}: {acc_val:.4f}")

# === Attention-weight analysis ===
model.eval()
with torch.no_grad():
    # Use the first 20 test samples.
    sample_inputs = test_probs_tensor[:20].to(device)

    # Re-run the encoder and attention scorer to obtain the per-step weights.
    lstm_out, _ = model.lstm(sample_inputs)
    attention_scores = model.attention(lstm_out)
    attention_weights = torch.softmax(attention_scores, dim=1).cpu().numpy()

# Mean weight per sequence step (one step per base model) across the samples.
avg_attention = attention_weights.mean(axis=0).flatten()

# Bar chart of the averaged weights, saved to disk
plt.figure(figsize=(10, 6))
plt.bar(base_models, avg_attention, color='skyblue')
plt.xlabel('基础模型')
plt.ylabel('平均注意力权重')
plt.title('LSTM元模型对各基础模型的注意力分配')
plt.savefig('attention_weights.png')
plt.close()

print("\n=== 注意力权重分析 ===")
for name, weight in zip(base_models, avg_attention):
    print(f"{name}: {weight:.4f}")

print("\nLSTM堆叠集成训练完成！")