In [1]:
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# 读取预处理后的数据
df = pd.read_csv(r'C:\\Users\\戴尔\\Desktop\\sc\\sc\\Datasets/processed_total.csv')

# 筛选目标类别（与原始代码一致）
rare_classes = ["Infiltration", "Web Attack � XSS", "Web Attack � Sql Injection", "Heartbleed"]
benign_df = df[df["Label"] == "BENIGN"].sample(n=100000, random_state=42)
target_df = df[df["Label"].isin(rare_classes)]
small_df = pd.concat([benign_df, target_df], ignore_index=True)

# 拆分特征和标签
X = small_df.drop(columns=["Label"])
y = small_df["Label"]

# 将标签编码为数值
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, stratify=y_encoded, test_size=0.2, random_state=42)

# 转换为PyTorch Tensor
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# 创建DataLoader
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset = TabularDataset(X_train_tensor, y_train_tensor)
test_dataset = TabularDataset(X_test_tensor, y_test_tensor)

batch_size = 256
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [2]:
import torch.nn as nn
from torch.nn import TransformerEncoder, TransformerEncoderLayer

class FTTransformer(nn.Module):
    def __init__(self, num_features, num_classes, d_model=128, nhead=8, num_layers=3):
        super().__init__()
        self.feature_embedding = nn.Linear(num_features, d_model)
        encoder_layer = TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=4*d_model, dropout=0.1
        )
        self.transformer_encoder = TransformerEncoder(encoder_layer, num_layers=num_layers)
        # 新增：添加CLS Token用于分类
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))  # 新增CLS Token
        self.classifier = nn.Linear(d_model, num_classes)

    def forward(self, x):
        batch_size = x.shape[0]
        # 特征嵌入
        x = self.feature_embedding(x)  # (batch_size, num_features) → (batch_size, d_model)
        # 添加CLS Token到序列中
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)  # (batch_size, 1, d_model)
        x = torch.cat((cls_tokens, x.unsqueeze(1)), dim=1)  # (batch_size, seq_len=2, d_model)
        # Transformer处理
        x = x.permute(1, 0, 2)  # (seq_len, batch_size, d_model)
        x = self.transformer_encoder(x)
        # 提取CLS Token作为分类特征
        cls_output = x[0]  # (batch_size, d_model)
        # 分类
        x = self.classifier(cls_output)
        return x

# 初始化模型
num_features = X_train.shape[1]
num_classes = len(label_encoder.classes_)
model = FTTransformer(num_features, num_classes)



In [3]:
from sklearn.metrics import classification_report, confusion_matrix

In [None]:
import numpy as np
device = torch.device("cpu")
model = model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def train_model(model, train_loader, epochs=20):
    print("开始训练.....")
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        for batch_x, batch_y in train_loader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            print(f"batch_y 中的最大值: {batch_y.max().item()}")
            print(f"num_classes: {num_classes}")
            assert batch_y.max().item() < num_classes, "错误: batch_y 中包含超出 num_classes 范围的值."

            optimizer.zero_grad()
            outputs = model(batch_x)
            print(np.shape(batch_x))
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

def evaluate_model(model, test_loader):
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(batch_y.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())
    print("📊 分类报告:")
    print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))
    print("📉 混淆矩阵:")
    print(confusion_matrix(y_true, y_pred))

In [5]:
# 训练模型
train_model(model, train_loader, epochs=20)

开始训练.....
Epoch 1/20, Loss: 0.0403
Epoch 2/20, Loss: 0.0232
Epoch 3/20, Loss: 0.0225
Epoch 4/20, Loss: 0.0268
Epoch 5/20, Loss: 0.0237
Epoch 6/20, Loss: 0.0212
Epoch 7/20, Loss: 0.0238
Epoch 8/20, Loss: 0.0194
Epoch 9/20, Loss: 0.0220
Epoch 10/20, Loss: 0.0207
Epoch 11/20, Loss: 0.0200
Epoch 12/20, Loss: 0.0213
Epoch 13/20, Loss: 0.0198
Epoch 14/20, Loss: 0.0194
Epoch 15/20, Loss: 0.0193
Epoch 16/20, Loss: 0.0227
Epoch 17/20, Loss: 0.0196
Epoch 18/20, Loss: 0.0215
Epoch 19/20, Loss: 0.0205
Epoch 20/20, Loss: 0.0232


In [6]:
# 评估模型
evaluate_model(model, test_loader)

📊 分类报告:
                            precision    recall  f1-score   support

                    BENIGN       0.99      1.00      1.00     20000
                Heartbleed       0.00      0.00      0.00         2
              Infiltration       0.00      0.00      0.00         7
Web Attack � Sql Injection       0.00      0.00      0.00         4
          Web Attack � XSS       0.00      0.00      0.00       131

                  accuracy                           0.99     20144
                 macro avg       0.20      0.20      0.20     20144
              weighted avg       0.99      0.99      0.99     20144

📉 混淆矩阵:
[[20000     0     0     0     0]
 [    2     0     0     0     0]
 [    7     0     0     0     0]
 [    4     0     0     0     0]
 [  131     0     0     0     0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [7]:
import pandas as pd
from ctgan import CTGAN

# ✅ 设定稀有类标签列表
rare_classes = [
    "Web Attack � XSS",
    "Infiltration",
    "Web Attack � Sql Injection",
    "Heartbleed"
]

# ✅ 设置增强参数
min_real_threshold = 10      # 最少多少条样本才允许训练 GAN
max_real_sample = 100        # 每类最多使用多少条真实样本训练 GAN
default_generate_n = 500     # 默认生成数量
scaling_ratio = 5            # 每个真实样本扩增多少倍

# ✅ 初始化生成结果容器
synthetic_samples_list = []

for category in rare_classes:
    # 提取该类别所有真实样本
    category_df = df[df["Label"] == category].copy()
    available_n = len(category_df)

    print(f"🧪 当前类别: {category}，真实样本数量: {available_n}")

    if available_n < min_real_threshold:
        print(f"⚠️ 样本数过少 (<{min_real_threshold})，跳过该类增强。\n")
        continue

    # 限制最大训练数量
    train_n = min(max_real_sample, available_n)
    real_samples = category_df.sample(n=train_n, random_state=42).reset_index(drop=True)

    # 获取特征列
    features = real_samples.drop(columns=["Label"])
    discrete_columns = features.select_dtypes(include=["object"]).columns.tolist()

    # 初始化并训练 CTGAN
    print(f"🚀 训练 CTGAN (使用 {train_n} 条真实样本)...")
    ctgan = CTGAN(epochs=300, verbose=True)
    ctgan.fit(features, discrete_columns=discrete_columns)

    # 🧠 设定生成样本数量：Heartbleed 特别处理
    if category == "Heartbleed":
        generate_n = 50
    else:
        generate_n = min(default_generate_n, train_n * scaling_ratio)

    print(f"🎯 将生成 {generate_n} 条增强样本。")

    # 生成样本
    synthetic = ctgan.sample(generate_n)
    synthetic["Label"] = category
    synthetic.columns = features.columns.tolist() + ["Label"]

    # 添加进总列表
    synthetic_samples_list.append(synthetic)
    print(f"✅ 增强完成: {category} → {generate_n} 条样本\n")

# 合并所有增强样本
final_synthetic_data = pd.concat(synthetic_samples_list, ignore_index=True)
print(f"🎉 所有类别增强完毕，总共生成样本数: {len(final_synthetic_data)}")


🧪 当前类别: Web Attack � XSS，真实样本数量: 652
🚀 训练 CTGAN (使用 100 条真实样本)...


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Gen. (0.65) | Discrim. (-0.54): 100%|██████████| 300/300 [00:51<00:00,  5.85it/s] 


🎯 将生成 500 条增强样本。
✅ 增强完成: Web Attack � XSS → 500 条样本

🧪 当前类别: Infiltration，真实样本数量: 36
🚀 训练 CTGAN (使用 36 条真实样本)...


Gen. (-2.27) | Discrim. (-0.62): 100%|██████████| 300/300 [00:51<00:00,  5.86it/s]


🎯 将生成 180 条增强样本。
✅ 增强完成: Infiltration → 180 条样本

🧪 当前类别: Web Attack � Sql Injection，真实样本数量: 21
🚀 训练 CTGAN (使用 21 条真实样本)...


Gen. (5.36) | Discrim. (0.34): 100%|██████████| 300/300 [00:50<00:00,  5.92it/s] 


🎯 将生成 105 条增强样本。
✅ 增强完成: Web Attack � Sql Injection → 105 条样本

🧪 当前类别: Heartbleed，真实样本数量: 11
🚀 训练 CTGAN (使用 11 条真实样本)...


Gen. (2.95) | Discrim. (0.02): 100%|██████████| 300/300 [00:50<00:00,  5.95it/s]  


🎯 将生成 50 条增强样本。
✅ 增强完成: Heartbleed → 50 条样本

🎉 所有类别增强完毕，总共生成样本数: 835


In [11]:
print(final_synthetic_data["Label"].value_counts())


Label
Web Attack � XSS              500
Infiltration                  180
Web Attack � Sql Injection    105
Heartbleed                     50
Name: count, dtype: int64


In [30]:
# 加入真实样本（每类尽量保留全部）和 BENIGN 样本
benign_df = df[df["Label"] == "BENIGN"].sample(n=10000, random_state=42)
real_rare_df = df[df["Label"] == "Heartbleed"] # 所有真实稀有类样本

# 合并训练集
train_df = pd.concat([benign_df, real_rare_df, final_synthetic_data], ignore_index=True)

# 检查类别分布
print(train_df["Label"].value_counts())

# 特征和标签拆分
X = small_df.drop(columns=["Label"])
y = small_df["Label"]

label_encoder = LabelEncoder()
label_encoder.fit(df["Label"])  # 使用完整数据集适配编码器


# 数据划分
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
# 对训练集和测试集标签进行编码

y_train_encoded = label_encoder.transform(y_train1)
y_test_encoded = label_encoder.transform(y_test1)
# 训练 baseline 模型（FT transformer）
X_train_tensor1 = torch.tensor(X_train1.values, dtype=torch.float32)
y_train_tensor1 = torch.tensor(y_train_encoded, dtype=torch.long)
X_test_tensor1 = torch.tensor(X_test1.values, dtype=torch.float32)
y_test_tensor1 = torch.tensor(y_test_encoded, dtype=torch.long)

# 创建DataLoader
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

train_dataset1 = TabularDataset(X_train_tensor1, y_train_tensor1)
test_dataset1 = TabularDataset(X_test_tensor1, y_test_tensor1)

batch_size = 256
train_loader1 = DataLoader(train_dataset1, batch_size=batch_size, shuffle=True)
test_loader1 = DataLoader(test_dataset1, batch_size=batch_size, shuffle=False)
num_features = X_train1.shape[1]
num_classes = len(label_encoder.classes_)
model = FTTransformer(num_features, num_classes)
device = torch.device("cpu")
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
# 检查模型参数所在的设'
print(f"num_classes: {num_classes}")
train_model(model, train_loader1, epochs=20)



Label
BENIGN                        10000
Web Attack � XSS                500
Infiltration                    180
Web Attack � Sql Injection      105
Heartbleed                       61
Name: count, dtype: int64




num_classes: 15
开始训练.....
Epoch 1/20, Loss: 0.0458
Epoch 2/20, Loss: 0.0234
Epoch 3/20, Loss: 0.0220
Epoch 4/20, Loss: 0.0229
Epoch 5/20, Loss: 0.0235
Epoch 6/20, Loss: 0.0203
Epoch 7/20, Loss: 0.0193
Epoch 8/20, Loss: 0.0212
Epoch 9/20, Loss: 0.0232
Epoch 10/20, Loss: 0.0252
Epoch 11/20, Loss: 0.0212
Epoch 12/20, Loss: 0.0206
Epoch 13/20, Loss: 0.0192
Epoch 14/20, Loss: 0.0215
Epoch 15/20, Loss: 0.0220
Epoch 16/20, Loss: 0.0184
Epoch 17/20, Loss: 0.0204
Epoch 18/20, Loss: 0.0228
Epoch 19/20, Loss: 0.0229
Epoch 20/20, Loss: 0.0251


In [None]:
import torch
import numpy as np
from sklearn.preprocessing import LabelEncoder

# ---------- 标签编码 ----------
# 合并所有标签（包括合成数据）
all_labels = pd.concat([train_df["Label"], final_synthetic_data["Label"]], ignore_index=True)
label_encoder = LabelEncoder()
label_encoder.fit(all_labels)  # 适配所有可能的标签

# 对训练集标签编码
y_encoded = label_encoder.transform(train_df["Label"])

# 验证标签范围
unique_labels = np.unique(y_encoded)
unique_labels=[0, 8 ,9, 13 ,14]
num_classes = len(label_encoder.classes_)
print("唯一标签值:", unique_labels)
print("模型输出类别数:", num_classes)
assert unique_labels.max() < num_classes, "❌ 标签值超出模型输出范围"

# ---------- 模型初始化 ----------
model = FTTransformer(num_features, num_classes)
model = model.to(device)  # 确保模型在GPU（如果可用）

# ---------- 数据加载 ----------
class TabularDataset(Dataset):
    def __init__(self, features, labels):
        self.features = features
        self.labels = labels

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        return (
            self.features[idx].to(device),  # 自动移至设备
            self.labels[idx].to(device)
        )

# 创建 DataLoader
train_loader = DataLoader(train_dataset, batch_size=256, shuffle=True)



SyntaxError: invalid syntax. Perhaps you forgot a comma? (606979458.py, line 16)

In [39]:
def evaluate_model(model, test_loader, label_encoder):
    model.eval()
    y_true, y_pred = [], []
    
    with torch.no_grad():
        for batch_x, batch_y in test_loader:
            batch_x = batch_x.to(device)
            batch_y = batch_y.to(device)
            outputs = model(batch_x)
            _, predicted = torch.max(outputs, 1)
            y_true.extend(batch_y.cpu().numpy())
            y_pred.extend(predicted.cpu().numpy())

    # 🚨 检查类别范围
    print("✅ LabelEncoder 识别的类别:", label_encoder.classes_)
    print("✅ LabelEncoder 识别的类别数:", len(label_encoder.classes_))
    print("⚠️ y_true 真实标签类别:", set(y_true))
    print("⚠️ y_pred 预测的唯一类别:", set(y_pred))

    # 🚨 强制检查 y_pred 是否超出范围
    valid_labels = list(range(len(label_encoder.classes_)))  # [0, 1, 2, 3, 4]
    invalid_preds = [label for label in y_pred if label not in valid_labels]

    if invalid_preds:
        print(f"❌ 发现 {len(invalid_preds)} 个无效预测类别: {set(invalid_preds)}")
        raise ValueError(f"模型预测出 {set(invalid_preds)}，但应在 {valid_labels} 范围内！")

    print("📊 分类报告:")
    print(classification_report(y_true, y_pred, target_names=label_encoder.classes_))
    print("📉 混淆矩阵:")
    print(confusion_matrix(y_true, y_pred))

evaluate_model(model, test_loader1, label_encoder)



✅ LabelEncoder 识别的类别: ['BENIGN' 'Heartbleed' 'Infiltration' 'Web Attack � Sql Injection'
 'Web Attack � XSS']
✅ LabelEncoder 识别的类别数: 5
⚠️ y_true 真实标签类别: {np.int64(0), np.int64(8), np.int64(9), np.int64(13), np.int64(14)}
⚠️ y_pred 预测的唯一类别: {np.int64(2), np.int64(4)}
📊 分类报告:


ValueError: Number of classes, 7, does not match size of target_names, 5. Try specifying the labels parameter

In [None]:
import shap
import numpy as np


# 创建 SHAP 解释器
explainer = shap.TreeExplainer(model)

# 计算 SHAP 值
shap_values = explainer(X_train1,check_additivity=False)

# 可视化 SHAP 值（全局）
shap.summary_plot(shap_values, X_train1) 
# 可视化 SHAP 值（局部）
shap.force_plot(explainer.expected_value, shap_values[0], X_train1[0])