# 增强版DFNet+XGBoost恶意流量分类
## 包含子类别聚类处理

In [1]:
# 1. 数据加载和预处理
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
import tensorflow as tf
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt
import seaborn as sns

# 加载数据
df = pd.read_csv('data/train_data.csv')

# 分离特征和标签
X = df.drop(['id', 'attack_cat'], axis=1)
y = df['attack_cat']

# 编码分类特征
cat_cols = ['proto', 'service', 'state']
for col in cat_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))

# 编码标签
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 标准化特征
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 归一化到0-1范围用于图像化
minmax = MinMaxScaler()
X_normalized = minmax.fit_transform(X_scaled)

2025-05-03 23:35:39.779863: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-03 23:35:39.787927: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 23:35:39.848289: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-05-03 23:35:39.900810: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1746286539.946039   45884 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1746286539.95

In [7]:
# 图像尺寸计算
num_features = X_normalized.shape[1]
img_size = int(np.ceil(np.sqrt(num_features)))
padding = img_size**2 - num_features

# 0填充并 reshape 成灰度图
X_padded = np.pad(X_normalized, ((0,0), (0,padding)), mode='constant')
X_images = X_padded.reshape(-1, img_size, img_size)

# 用 flatten 后的图像数据进行聚类
X_flat_images = X_images.reshape(X_images.shape[0], -1)


In [8]:
def find_optimal_clusters(data, max_k=5):
    silhouette_scores = []
    for k in range(2, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42)
        labels = kmeans.fit_predict(data)
        silhouette_scores.append(silhouette_score(data, labels))
    return np.argmax(silhouette_scores) + 2

cluster_labels = np.zeros(len(X_flat_images), dtype=int)
for label in np.unique(y_encoded):
    mask = y_encoded == label
    label_data = X_flat_images[mask]

    if len(label_data) < 10:
        continue

    optimal_k = find_optimal_clusters(label_data)
    kmeans = KMeans(n_clusters=optimal_k, random_state=42)
    sub_labels = kmeans.fit_predict(label_data)

    cluster_labels[mask] = sub_labels + np.max(cluster_labels) + 1


KeyboardInterrupt: 

In [None]:
# 可视化图像 + 子类别
plt.figure(figsize=(10,10))
for i in range(49):
    plt.subplot(7, 7, i+1)
    plt.imshow(X_images[i], cmap='gray')
    plt.title(f'{y[i]} | C{cluster_labels[i]}')
    plt.axis('off')
plt.tight_layout()
plt.show()


In [4]:
# 4. 构建DFNet模型
def build_enhanced_dfnet(input_shape, classes):
    model = tf.keras.Sequential([
        # 输入层
        tf.keras.layers.InputLayer(input_shape=input_shape),
        
        # 改进的卷积块1 - 增加深度
        tf.keras.layers.Conv2D(32, (3,3), padding='same', activation='elu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Conv2D(32, (3,3), padding='same', activation='elu'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.MaxPooling2D((2,2)),
        tf.keras.layers.Dropout(0.2),
        
        # 卷积块2 - 增加残差连接
        tf.keras.layers.Conv2D(64, (3,3), padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.Conv2D(64, (3,3), padding='same'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Add(),
        tf.keras.layers.Activation('relu'),
        tf.keras.layers.MaxPooling2D((2,2)),
        tf.keras.layers.Dropout(0.3),
        
        # 全连接层
        tf.keras.layers.Flatten(),
        tf.keras.layers.Dense(512, activation='relu', kernel_initializer='he_normal'),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Dropout(0.5),
        
        # 输出层
        tf.keras.layers.Dense(classes, activation='softmax')
    ])
    
    return model

# 构建模型
num_classes = len(le.classes_)
model = build_enhanced_dfnet(input_shape=(img_size, img_size, 1), classes=num_classes)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
model.summary()

E0000 00:00:1746286827.546498   45884 cuda_executor.cc:1228] INTERNAL: CUDA Runtime error: Failed call to cudaGetRuntimeVersion: Error loading CUDA libraries. GPU will not be used.: Error loading CUDA libraries. GPU will not be used.
W0000 00:00:1746286827.547433   45884 gpu_device.cc:2341] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


ValueError: A merge layer should be called on a list of inputs. Received: input_shape=(None, 3, 3, 64) (not a list of shapes)

In [None]:
# 5. 训练和评估
# 分割数据
X_train, X_test, y_train, y_test = train_test_split(
    X_images, y_encoded, test_size=0.2, random_state=42)

# 训练模型
history = model.fit(X_train, y_train, 
                    epochs=30, 
                    batch_size=64, 
                    validation_data=(X_test, y_test),
                    callbacks=[
                        tf.keras.callbacks.EarlyStopping(patience=5, restore_best_weights=True)
                    ])

# 评估模型
y_pred = model.predict(X_test).argmax(axis=1)
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=le.classes_))
print("Macro F1 Score:", f1_score(y_test, y_pred, average='macro'))

# 绘制混淆矩阵
plt.figure(figsize=(12,10))
sns.heatmap(confusion_matrix(y_test, y_pred), 
            annot=True, fmt='d', 
            xticklabels=le.classes_, 
            yticklabels=le.classes_)
plt.title('Confusion Matrix')
plt.show()

In [None]:
# 6. 特征提取+XGBoost
# 创建特征提取器 (移除最后一层)
feature_extractor = tf.keras.Sequential(model.layers[:-1])

# 提取特征
train_features = feature_extractor.predict(X_train)
test_features = feature_extractor.predict(X_test)

# XGBoost分类
xgb = XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=7, subsample=0.8)
xgb.fit(train_features, y_train)

# 评估
y_pred_xgb = xgb.predict(test_features)
print("\nXGBoost Classification Report:")
print(classification_report(y_test, y_pred_xgb, target_names=le.classes_))
print("XGBoost Macro F1 Score:", f1_score(y_test, y_pred_xgb, average='macro'))

# 保存模型
model.save('dfnet_enhanced.h5')
import joblib
joblib.dump(xgb, 'xgb_enhanced.pkl')