## 1. Initial Dataset

In [1]:
!pip install pandas scikit-learn scikit-image numpy sdv



In [2]:
import pandas as pd
from tqdm import tqdm
import os
import numpy as np
from sklearn.preprocessing import MinMaxScaler

# ---------- 归一化处理函数 ----------
def normalize_dataframe(df, label_column="Label"):
    """
    对 DataFrame 进行数据清洗和归一化：
    - 非数值强制转换
    - 删除 NaN 和 inf 行
    - Min-Max 归一化
    - 返回 X_scaled, y
    """
    print(f"🧹 清洗数据（NaN / Infinity）...")

    df = df.copy()
    if label_column not in df.columns:
        raise ValueError(f"❌ Label 列 '{label_column}' 不存在")

    labels = df[label_column].reset_index(drop=True)
    features = df.drop(columns=[label_column])

    # 类型转换
    features = features.apply(pd.to_numeric, errors='coerce')
    features = features.replace([np.inf, -np.inf], np.nan)

    # 清除有问题的样本
    cleaned = pd.concat([features, labels], axis=1).dropna()
    print(f"✅ 清洗后剩余样本数: {cleaned.shape[0]}")

    # 归一化
    features_cleaned = cleaned.drop(columns=[label_column])
    labels_cleaned = cleaned[label_column].reset_index(drop=True)

    print(f"📐 执行 Min-Max 归一化...")
    scaler = MinMaxScaler()
    features_scaled = pd.DataFrame(scaler.fit_transform(features_cleaned), columns=features_cleaned.columns)

    print("🎉 归一化完成！")
    return features_scaled, labels_cleaned

# ---------- 合并原始数据 ----------
def writeData(csv_path):
    df = pd.read_csv(csv_path, header=None, encoding='utf-8', low_memory=False)
    return df

def mergeData():
    dataset_files = [
        "/home/maia-user/myl/sc/Datasets/Monday-WorkingHours.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Friday-WorkingHours-Morning.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Thursday-WorkingHours-Afternoon-Infilteration.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Thursday-WorkingHours-Morning-WebAttacks.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Tuesday-WorkingHours.pcap_ISCX.csv",
        "/home/maia-user/myl/sc/Datasets/Wednesday-workingHours.pcap_ISCX.csv"
    ]

    frames = []
    for file in tqdm(dataset_files, desc="📥 加载原始CSV文件", unit="file"):
        if os.path.exists(file):
            df = writeData(file)
            df = df.drop([0])  # 删除第一行标题行
            frames.append(df)
        else:
            print(f"⚠️ 文件 {file} 不存在，跳过")

    print("🔄 合并数据中...")
    result = pd.concat(frames, ignore_index=True)
    return result

# ---------- 执行整个流程 ----------
print("🚀 启动数据预处理...")
raw_data = mergeData()

# 加载列名
with open("/home/maia-user/myl/sc/column_names.txt") as f:
    col_names = [line.strip() for line in f.readlines()]

if len(col_names) == raw_data.shape[1]:
    raw_data.columns = col_names
    print("✅ 成功赋值列名")
else:
    raise ValueError(f"❌ 列名数不匹配：列名 {len(col_names)} 个，数据列数 {raw_data.shape[1]}")

# ---------- 调用归一化 ----------
X_scaled, y = normalize_dataframe(raw_data, label_column="Label")

# ---------- 保存最终处理结果 ----------
processed_df = pd.concat([X_scaled, y], axis=1)
output_file = "/home/maia-user/myl/sc/Datasets/processed_total.csv"
processed_df.to_csv(output_file, index=False)
print(f"✅ 所有预处理完成，数据保存至: {output_file}")


🚀 启动数据预处理...


📥 加载原始CSV文件: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:51<00:00,  6.40s/file]


🔄 合并数据中...
✅ 成功赋值列名
🧹 清洗数据（NaN / Infinity）...
✅ 清洗后剩余样本数: 2827876
📐 执行 Min-Max 归一化...
🎉 归一化完成！
✅ 所有预处理完成，数据保存至: /home/maia-user/myl/sc/Datasets/processed_total.csv


In [3]:
# 从文件重新读取列名（确保是正确的、干净的）
df = processed_df.copy()

with open("/home/maia-user/myl/sc/column_names.txt") as f:
    col_names = [line.strip() for line in f.readlines()]

# 如果最后一列是 'Label'，不用再手动加了（你已经包含它了）
if len(col_names) != df.shape[1]:
    print(f"❌ 列数不匹配：列名 {len(col_names)} 个，数据实际列数 {df.shape[1]}")
else:
    df.columns = col_names
    print("✅ 列名成功赋值")


print(df["Label"].value_counts())


✅ 列名成功赋值
Label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


## 2. Sample Datasets

In [4]:
import pandas as pd
from tqdm import tqdm
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

df = pd.read_csv('/home/maia-user/myl/sc/Datasets/processed_total.csv')

with open("/home/maia-user/myl/sc/column_names.txt") as f:
    col_names = [line.strip() for line in f.readlines()]

# 如果最后一列是 'Label'，不用再手动加了（你已经包含它了）
if len(col_names) != df.shape[1]:
    print(f"❌ 列数不匹配：列名 {len(col_names)} 个，数据实际列数 {df.shape[1]}")
else:
    df.columns = col_names
    print("✅ 列名成功赋值")
    
print(df["Label"].value_counts())

✅ 列名成功赋值
Label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [5]:
# 假设你已经读取好 df（含 Label 列）
# 下面是你需要合并的原始类标签
rare_classes = [
    "Infiltration",
    "Web Attack � XSS",
    "Web Attack � Sql Injection",
    "Heartbleed"
]

# 替换为 RareAttack
# df["Label"] = df["Label"].replace(rare_classes, "RareAttack")

# 查看合并后的标签分布
print("🎯 合并后的标签分布：")
print(df["Label"].value_counts())

🎯 合并后的标签分布：
Label
BENIGN                        2271320
DoS Hulk                       230124
PortScan                       158804
DDoS                           128025
DoS GoldenEye                   10293
FTP-Patator                      7935
SSH-Patator                      5897
DoS slowloris                    5796
DoS Slowhttptest                 5499
Bot                              1956
Web Attack � Brute Force         1507
Web Attack � XSS                  652
Infiltration                       36
Web Attack � Sql Injection         21
Heartbleed                         11
Name: count, dtype: int64


In [6]:
# 筛选 BENIGN 和目标攻击类别
benign_df = df[df["Label"] == "BENIGN"].sample(n=100000, random_state=42)
target_df = df[df["Label"].isin(rare_classes)]

# 合并样本构建小型数据集
small_df = pd.concat([benign_df, target_df], ignore_index=True)
print("✅ 筛选后的数据：")
print(small_df["Label"].value_counts())


# 特征和标签拆分
X = small_df.drop(columns=["Label"])
y = small_df["Label"]

print(f"🧪 使用特征总数: {X.shape[1]}")  # 预期应该是 78
print("🧠 特征列（前5个）:", X.columns.tolist()[:5])

# 数据划分
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 训练 baseline 模型（随机森林）
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# 评估模型
y_pred = clf.predict(X_test)
print("📊 分类报告:")
print(classification_report(y_test, y_pred))
print("📉 混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

✅ 筛选后的数据：
Label
BENIGN                        100000
Web Attack � XSS                 652
Infiltration                      36
Web Attack � Sql Injection        21
Heartbleed                        11
Name: count, dtype: int64
🧪 使用特征总数: 78
🧠 特征列（前5个）: ['Destination Port', 'Flow Duration', 'Total Fwd Packets', 'Total Backward Packets', 'Total Length of Fwd Packets']
📊 分类报告:
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00     20000
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       1.00      0.71      0.83         7
Web Attack � Sql Injection       0.00      0.00      0.00         4
          Web Attack � XSS       1.00      0.94      0.97       131

                  accuracy                           1.00     20144
                 macro avg       0.80      0.73      0.76     20144
              weighted avg       1.00      1.00      1.00     20144

📉 混淆矩阵:
[

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### 3. The use of GAN

In [7]:
# from ctgan import CTGAN
# import pandas as pd

# # ✅ 确保 `rare_small_df` 的索引正确
# rare_small_df = target_df.reset_index(drop=True)

# # ✅ 确保列名不会成为数据
# rare_features = rare_small_df.drop(columns=["Label"]).copy()

# # 🔍 识别类别特征（非数值列）
# discrete_columns = rare_features.select_dtypes(include=['object']).columns.tolist()

# print("🔹 识别到的类别列:", discrete_columns)

# # ✅ 初始化 CTGAN（可调整 epochs）
# ctgan = CTGAN(epochs=500, verbose=True)

# # ✅ 训练模型（正确指定离散列）
# ctgan.fit(rare_features, discrete_columns=discrete_columns)

# # ✅ 生成指定数量的样本（如 620）
# synthetic_samples = ctgan.sample(400)

# # ✅ 给生成数据添加标签列
# synthetic_samples["Label"] = "RareAttack"

# # ✅ 确保列名正确
# synthetic_samples.columns = rare_features.columns.tolist() + ["Label"]

# # 查看生成结果
# print(synthetic_samples.head())

In [8]:
# train_df_aug = pd.concat([benign_df, target_df, synthetic_samples], ignore_index=True)
# print("✅ 增强训练集类别分布:")
# print(train_df_aug["Label"].value_counts())


In [9]:
# # 特征和标签
# X_aug = train_df_aug.drop(columns=["Label"])
# y_aug = train_df_aug["Label"]

# # 划分训练测试集
# X_train_aug, X_test_aug, y_train_aug, y_test_aug = train_test_split(X_aug, y_aug, stratify=y_aug, test_size=0.2, random_state=42)

# # 训练增强版模型
# clf_aug = RandomForestClassifier(n_estimators=100, random_state=42)
# clf_aug.fit(X_train_aug, y_train_aug)

# # 预测评估
# y_pred_aug = clf_aug.predict(X_test_aug)

# from sklearn.metrics import classification_report, confusion_matrix

# print("📊 增强版分类报告:")
# print(classification_report(y_test_aug, y_pred_aug))
# print("📉 混淆矩阵:")
# print(confusion_matrix(y_test_aug, y_pred_aug))


In [10]:
# # 评估模型
# y_pred = clf_aug.predict(X_test)
# print("📊 分类报告:")
# print(classification_report(y_test, y_pred))
# print("📉 混淆矩阵:")
# print(confusion_matrix(y_test, y_pred))

4. The use of CTGAN

In [11]:
import pandas as pd
from ctgan import CTGAN

# ✅ 设定稀有类标签列表
rare_classes = [
    "Web Attack � XSS",
    "Infiltration",
    "Web Attack � Sql Injection",
    "Heartbleed"
]

# ✅ 设置增强参数
min_real_threshold = 20      # 最少多少条样本才允许训练 GAN
max_real_sample = 100        # 每类最多使用多少条真实样本训练 GAN
default_generate_n = 500     # 默认生成数量
scaling_ratio = 5            # 每个真实样本扩增多少倍

# ✅ 初始化生成结果容器
synthetic_samples_list = []

for category in rare_classes:
    # 提取该类别所有真实样本
    category_df = df[df["Label"] == category].copy()
    available_n = len(category_df)

    print(f"🧪 当前类别: {category}，真实样本数量: {available_n}")

    if available_n < min_real_threshold:
        print(f"⚠️ 样本数过少 (<{min_real_threshold})，跳过该类增强。\n")
        continue

    # 限制最大训练数量
    train_n = min(max_real_sample, available_n)
    real_samples = category_df.sample(n=train_n, random_state=42).reset_index(drop=True)

    # 获取特征列
    features = real_samples.drop(columns=["Label"])
    discrete_columns = features.select_dtypes(include=["object"]).columns.tolist()

    # 初始化并训练 CTGAN
    print(f"🚀 训练 CTGAN (使用 {train_n} 条真实样本)...")
    ctgan = CTGAN(epochs=300, verbose=True)
    ctgan.fit(features, discrete_columns=discrete_columns)

    # 🧠 设定生成样本数量：Heartbleed 特别处理
    if category == "Heartbleed":
        generate_n = 50
    else:
        generate_n = min(default_generate_n, train_n * scaling_ratio)

    print(f"🎯 将生成 {generate_n} 条增强样本。")

    # 生成样本
    synthetic = ctgan.sample(generate_n)
    synthetic["Label"] = category
    synthetic.columns = features.columns.tolist() + ["Label"]

    # 添加进总列表
    synthetic_samples_list.append(synthetic)
    print(f"✅ 增强完成: {category} → {generate_n} 条样本\n")

# 合并所有增强样本
final_synthetic_data = pd.concat(synthetic_samples_list, ignore_index=True)
print(f"🎉 所有类别增强完毕，总共生成样本数: {len(final_synthetic_data)}")


🧪 当前类别: Web Attack � XSS，真实样本数量: 652
🚀 训练 CTGAN (使用 100 条真实样本)...


  return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Gen. (2.11) | Discrim. (-0.16): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:55<00:00,  5.44it/s]


🎯 将生成 500 条增强样本。
✅ 增强完成: Web Attack � XSS → 500 条样本

🧪 当前类别: Infiltration，真实样本数量: 36
🚀 训练 CTGAN (使用 36 条真实样本)...


Gen. (-5.02) | Discrim. (0.17): 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:51<00:00,  5.86it/s]


🎯 将生成 180 条增强样本。
✅ 增强完成: Infiltration → 180 条样本

🧪 当前类别: Web Attack � Sql Injection，真实样本数量: 21
🚀 训练 CTGAN (使用 21 条真实样本)...


Gen. (2.23) | Discrim. (0.77): 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:56<00:00,  5.27it/s]


🎯 将生成 105 条增强样本。
✅ 增强完成: Web Attack � Sql Injection → 105 条样本

🧪 当前类别: Heartbleed，真实样本数量: 11
⚠️ 样本数过少 (<20)，跳过该类增强。

🎉 所有类别增强完毕，总共生成样本数: 785


In [12]:
print(final_synthetic_data["Label"].value_counts())

Label
Web Attack � XSS              500
Infiltration                  180
Web Attack � Sql Injection    105
Name: count, dtype: int64


In [13]:
# 加入真实样本（每类尽量保留全部）和 BENIGN 样本
benign_df = df[df["Label"] == "BENIGN"].sample(n=100000, random_state=42)
real_rare_df = df[df["Label"] == "Heartbleed"] # 所有真实稀有类样本

# 合并训练集
train_df = pd.concat([benign_df, real_rare_df, final_synthetic_data], ignore_index=True)

# 检查类别分布
print(train_df["Label"].value_counts())



# 特征和标签拆分
X = small_df.drop(columns=["Label"])
y = small_df["Label"]

# 数据划分
X_train1, X_test1, y_train1, y_test1 = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# 训练 baseline 模型（随机森林）
clf1 = RandomForestClassifier(n_estimators=100, random_state=42)
clf1.fit(X_train1, y_train1)

# 评估模型
y_pred1 = clf1.predict(X_test1)
print("📊 分类报告:")
print(classification_report(y_test1, y_pred1))
print("📉 混淆矩阵:")
print(confusion_matrix(y_test1, y_pred1))

Label
BENIGN                        100000
Web Attack � XSS                 500
Infiltration                     180
Web Attack � Sql Injection       105
Heartbleed                        11
Name: count, dtype: int64
📊 分类报告:
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00     20000
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       1.00      0.71      0.83         7
Web Attack � Sql Injection       0.00      0.00      0.00         4
          Web Attack � XSS       1.00      0.94      0.97       131

                  accuracy                           1.00     20144
                 macro avg       0.80      0.73      0.76     20144
              weighted avg       1.00      1.00      1.00     20144

📉 混淆矩阵:
[[20000     0     0     0     0]
 [    0     2     0     0     0]
 [    2     0     5     0     0]
 [    4     0     0     0     0]
 [    8     0     0 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# 评估模型
y_pred = clf1.predict(X_test)
print("📊 分类报告:")
print(classification_report(y_test, y_pred))
print("📉 混淆矩阵:")
print(confusion_matrix(y_test, y_pred))

📊 分类报告:
                            precision    recall  f1-score   support

                    BENIGN       1.00      1.00      1.00     20000
                Heartbleed       1.00      1.00      1.00         2
              Infiltration       1.00      0.71      0.83         7
Web Attack � Sql Injection       0.00      0.00      0.00         4
          Web Attack � XSS       1.00      0.94      0.97       131

                  accuracy                           1.00     20144
                 macro avg       0.80      0.73      0.76     20144
              weighted avg       1.00      1.00      1.00     20144

📉 混淆矩阵:
[[20000     0     0     0     0]
 [    0     2     0     0     0]
 [    2     0     5     0     0]
 [    4     0     0     0     0]
 [    8     0     0     0   123]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
