In [1]:
!pip install --upgrade pip
!pip install -r requirements.txt

Looking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
    torch (>=1.9.*)
           ~~~~~~^[0m[33m
[0mLooking in indexes: https://mirrors.cloud.aliyuncs.com/pypi/simple
    PyYAML (>=5.1.*)
            ~~~~~~^[0m[33m
    torch (>=1.9.*)
           ~~~~~~^[0m[33m
[0m

In [2]:
# 导入第三方依赖库
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, RocCurveDisplay
from sklearn.metrics import roc_curve, auc
from keras.models import Sequential
from keras.layers import SimpleRNN, Dense
from keras.utils import to_categorical
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
import joblib
import numpy as np
from collections import Counter

2024-11-06 16:29:04.545409: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-06 16:29:04.593875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


读取数据集，数据集位于 ./dataSet/simargl-2021-combined.parquet 

In [3]:
df_full = pd.read_parquet('./dataSet/simargl-2021-combined.parquet') # 读取数据集
df = df_full.sample(frac=0.01) # 因GPU配置差，缩小原始数据集大小 （原数据集大小为1.34GB ， 包括40263811行数据）

探索性数据分析

In [4]:
# 探索性数据分析
print('------------------------------- 探索性数据分析 ------------------------------------------')
# 保证输出数据完整
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

# 1. 数据概览
print("数据集概览：")
print("数据集形状 (行, 列):", df.shape)
print("数据集前五行：")
print(df.head())
print("\n数据信息：")
print(df.info())

# 2. 检查缺失值
print("\n缺失值概览：")
print(df.isnull().sum())

# 3. 统计描述
print("\n数值特征的统计信息：")
print(df.describe())

# 4. 类别特征分布
print("\n标签分布：")
print(df['LABEL'].value_counts())
# 可视化标签和协议分布
plt.figure(figsize=(12, 5))
# 标签分布
plt.subplot(1, 2, 1)
sns.countplot(x='LABEL', data=df)
plt.title("标签分布")
plt.savefig('./result/label_distribution.png')
plt.show()

# 5. 特征分布
# 数值特征直方图展示
numerical_features = [
    'DST_TO_SRC_SECOND_BYTES', 'FLOW_ACTIVE_TIMEOUT', 'FLOW_DURATION_MICROSECONDS',
    'FLOW_DURATION_MILLISECONDS', 'FRAME_LENGTH', 'IN_BYTES', 'IN_PKTS',
    'OUT_BYTES', 'OUT_PKTS', 'TCP_WIN_MAX_IN', 'TCP_WIN_MAX_OUT',
    'TCP_WIN_MIN_IN', 'TCP_WIN_MIN_OUT'
]
df[numerical_features].hist(bins=15, figsize=(15, 10), layout=(3, 4))
plt.suptitle("数值特征分布", fontsize=16)
plt.savefig('./result/numerical_features_histogram.png')
plt.show()

# 6. 协议和端口分布
# 源端口和目标端口的分布
plt.figure(figsize=(12, 5))
# 源端口分布
plt.subplot(1, 2, 1)
sns.histplot(df['L4_SRC_PORT'], bins=30, kde=True)
plt.title("源端口分布")
# 目标端口分布
plt.subplot(1, 2, 2)
sns.histplot(df['L4_DST_PORT'], bins=30, kde=True)
plt.title("目标端口分布")
plt.savefig('./result/port_distributions.png')
plt.show()

# 7. 相关性分析
plt.figure(figsize=(12, 8))
correlation_matrix = df[numerical_features].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.title("数值特征之间的相关性")
plt.savefig('./result/correlation_matrix.png')
plt.show()

# 8. 异常值检测
# 使用箱线图检测数值特征中的异常值
plt.figure(figsize=(15, 10))
for i, col in enumerate(numerical_features, 1):
    plt.subplot(3, 4, i)
    sns.boxplot(y=df[col])
    plt.title(f"{col} 的箱线图")
plt.tight_layout()
plt.savefig('./result/box_plots.png')
plt.show()

print('------------------------------- 探索性数据分析结束 ------------------------------------------')

ValueError: could not convert string to float: '52,,,,,,,,,,52,,,,,,,,,,52,,,,,,,,,52,,,,,,,,,,52,,,,,,,,,,,52,,,,,,,,,,52,,,,,,,,,52,,,,,,,,,,,52,,,,,,,,,52,,,,,,,,,,,52,,,,,,,,,,52'

<Figure size 1200x800 with 0 Axes>

数据预处理，方法为 删除缺失值所在的行 （一行数据中，只要有一个特征有缺失，就删除该行）

In [None]:
df.dropna(inplace=True) # 删除缺失值所在的行

标签编码，将文本标签编码为数值，从而便于后续模型处理

In [None]:
# 将文本标签编码为数值
label_encoder = LabelEncoder()
df['LABEL'] = label_encoder.fit_transform(df['LABEL'])  # 将类别转换为数值

基于特征重要性图表，进行特征选择

In [None]:
# 特征选择：基于特征重要性图表
features = [
    'TCP_WIN_MSS_IN', 'IN_BYTES', 'OUT_BYTES', 
    'FLOW_DURATION_MILLISECONDS', 'L4_DST_PORT', 
    'TCP_WIN_MIN_IN', 'TCP_FLAGS', 'TCP_WIN_MAX_IN', 
    'LAST_SWITCHED'
]
X = df[features]
y = df['LABEL']

# 再次检查 X 和 y 的形状
print("X shape:", X.shape)
print("y shape:", y.shape)

X shape: (4026, 9)
y shape: (4026,)


将数值特征转化为浮点数

In [None]:
# 将数值特征转换为浮点数
for col in features:
    # 转换数值特征
    X[col] = pd.to_numeric(X[col].astype(str).str.replace(',', ''), errors='coerce')


# 再次检查 X 和 y 的形状
print("X shape:", X.shape)
print("y shape:", y.shape)

# 确保 y 和 X 的长度相同
assert len(X) == len(y), "X and y lengths do not match after dropping NaN values"

X shape: (4026, 9)
y shape: (4026,)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.to_numeric(X[col].astype(str).str.replace(',', ''), errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.to_numeric(X[col].astype(str).str.replace(',', ''), errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[col] = pd.to_numeric(X[col].astype(str).s

将数值特征归一化处理，使用MinMax归一化

In [None]:
# 归一化选定的特征
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

类别不平衡问题处理

In [None]:
# 打印原始类别分布
print("Original dataset shape:", Counter(y))

# 处理类别不平衡问题
smote = SMOTE(k_neighbors=1)
X_resampled, y_resampled = smote.fit_resample(X, y)

# 打印重采样后的类别分布
print("Resampled dataset shape:", Counter(y_resampled))

划分训练集和测试集

In [None]:
# 按 6:4 比例划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

定义模型评估函数

In [None]:
# 参数注释 ——————
# y_true: 真实标签，通常是一个一维数组或列表，表示实际的类别标签。
# y_pred: 预测标签，通常是一个一维数组或列表，表示模型预测的类别标签。
# model_name: 模型的名称，默认值为 "Model"，用于在输出中标识模型。
# y_pred_proba: 预测概率，通常是一个二维数组，表示每个样本属于每个类别的概率。如果提供，可以用于计算 ROC AUC 和绘制 ROC 曲线。
def evaluate_model(y_true, y_pred, model_name="Model", y_pred_proba=None):
    print('-------------------------------------------------------------------------')
    print(f"Evaluation results for {model_name}:")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    # 宏平均（Macro Average）：计算每个类别的精确率，然后取平均值
    print("Precision:", precision_score(y_true, y_pred, average='macro', zero_division=0)) # 使用宏平均（macro average）进行汇总。在某个类别没有预测样本时，将该类别的精确度设为 0。
    print("Recall:", recall_score(y_true, y_pred, average='macro', zero_division=0))
    print("F1 Score:", f1_score(y_true, y_pred, average='macro', zero_division=0))
    print('-------------------------------------------------------------------------')

    if y_pred_proba is not None:
        print("ROC AUC:", roc_auc_score(y_true, y_pred_proba, multi_class='ovr'))

        # 绘制多类的 ROC 曲线
        # 多分类问题中的 One-vs-Rest (OvR) 方法
        # 使用 roc_auc_score 函数，并设置参数 multi_class='ovr'。这表示每个类别都分别被视为正类，其他所有类别被视为负类，然后计算每个类别的二分类 AUC 值。
        # 最终的 AUC 值是这些二分类 AUC 值的平均值。
        fpr = {}
        tpr = {}
        roc_auc = {}

        for i in range(len(label_encoder.classes_)):
            fpr[i], tpr[i], _ = roc_curve(y_true, y_pred_proba[:, i], pos_label=i)
            roc_auc[i] = auc(fpr[i], tpr[i])
            plt.plot(fpr[i], tpr[i], label=f'ROC curve of class {label_encoder.classes_[i]} (area = {roc_auc[i]:.2f})')

        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'Receiver Operating Characteristic - {model_name}')
        plt.legend(loc="lower right")
        # 保存 AUC 图
        auc_image_path = f'./result/{model_name}_roc_auc.png'
        plt.savefig(auc_image_path)
        plt.show()


逻辑回归

In [None]:
# 训练并评估模型，确保正确的训练时间报告
results = {}

# 逻辑回归
lr_model = LogisticRegression()
start_time_lr = time.time()
lr_model.fit(X_train, y_train)
end_time_lr = time.time()
training_time_lr = end_time_lr - start_time_lr
y_pred_lr = lr_model.predict(X_test)
y_pred_lr_proba = lr_model.predict_proba(X_test)  # 获取预测概率
evaluate_model(y_test, y_pred_lr, "Logistic Regression", y_pred_proba=y_pred_lr_proba)

results["Logistic Regression"] = {
    "Accuracy": accuracy_score(y_test, y_pred_lr),
    "Precision": precision_score(y_test, y_pred_lr, average='weighted'),
    "Recall": recall_score(y_test, y_pred_lr, average='weighted'),
    "F1 Score": f1_score(y_test, y_pred_lr, average='weighted'),
    "AUC": roc_auc_score(y_test, y_pred_lr_proba, multi_class='ovr'),
    "Training time": training_time_lr
}

--------------------------------------------------------------
Evaluation results for Logistic Regression:
Accuracy: 0.9577814569536424
Precision: 0.4995669151700134
Recall: 0.4906968031968032
F1 Score: 0.48851831628612313
--------------------------------------------------------------


ValueError: Number of classes in y_true not equal to the number of columns in 'y_score'

朴素贝叶斯

In [None]:
# 朴素贝叶斯
nb_model = GaussianNB()
start_time_nb = time.time()
nb_model.fit(X_train, y_train)
end_time_nb = time.time()
training_time_nb = end_time_nb - start_time_nb
y_pred_nb = nb_model.predict(X_test)
y_pred_nb_proba = nb_model.predict_proba(X_test)  # 获取预测概率
evaluate_model(y_test, y_pred_nb, "Naive Bayes", y_pred_proba=y_pred_nb_proba)

results["Naive Bayes"] = {
    "Accuracy": accuracy_score(y_test, y_pred_nb),
    "Precision": precision_score(y_test, y_pred_nb, average='weighted'),
    "Recall": recall_score(y_test, y_pred_nb, average='weighted'),
    "F1 Score": f1_score(y_test, y_pred_nb, average='weighted'),
    "AUC": roc_auc_score(y_test, y_pred_nb_proba, multi_class='ovr'),
    "Training time": training_time_nb
}

随机森林

In [None]:
# 随机森林
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
start_time_rf = time.time()
rf_model.fit(X_train, y_train)
end_time_rf = time.time()
training_time_rf = end_time_rf - start_time_rf
y_pred_rf = rf_model.predict(X_test)
y_pred_rf_proba = rf_model.predict_proba(X_test)  # 获取预测概率
evaluate_model(y_test, y_pred_rf, "Random Forest", y_pred_proba=y_pred_rf_proba)

results["Random Forest"] = {
    "Accuracy": accuracy_score(y_test, y_pred_rf),
    "Precision": precision_score(y_test, y_pred_rf, average='weighted'),
    "Recall": recall_score(y_test, y_pred_rf, average='weighted'),
    "F1 Score": f1_score(y_test, y_pred_rf, average='weighted'),
    "AUC": roc_auc_score(y_test, y_pred_rf_proba, multi_class='ovr'),
    "Training time": training_time_rf
}

RNN模型

In [None]:
# RNN
y_train_categorical = to_categorical(y_train)
y_test_categorical = to_categorical(y_test)
X_train_rnn = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test_rnn = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
rnn_model = Sequential([
    SimpleRNN(50, input_shape=(X_train.shape[1], 1), activation='relu'),
    Dense(len(label_encoder.classes_), activation='softmax')
])
start_time_rnn = time.time()
rnn_model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
history = rnn_model.fit(X_train_rnn, y_train_categorical, epochs=5, batch_size=64, validation_data=(X_test_rnn, y_test_categorical))
end_time_rnn = time.time()
training_time_rnn = end_time_rnn - start_time_rnn
y_pred_rnn_proba = rnn_model.predict(X_test_rnn)
y_pred_rnn = np.argmax(y_pred_rnn_proba, axis=1)
evaluate_model(y_test, y_pred_rnn, "RNN", y_pred_proba=y_pred_rnn_proba)

results["RNN"] = {
    "Accuracy": accuracy_score(y_test, y_pred_rnn),
    "Precision": precision_score(y_test, y_pred_rnn, average='weighted'),
    "Recall": recall_score(y_test, y_pred_rnn, average='weighted'),
    "F1 Score": f1_score(y_test, y_pred_rnn, average='weighted'),
    "AUC": roc_auc_score(y_test, y_pred_rnn_proba, multi_class='ovr'),
    "Training time": training_time_rnn
}

模型性能对比

In [None]:
# 将结果转换为 DataFrame 进行显示
result_df = pd.DataFrame(results).T
print(result_df)
# 保存结果
result_df.to_csv('./result/model_results.csv', index=True)

# 生成结果对比图
# 设置图形大小
plt.figure(figsize=(10, 6))

# 绘制柱状图
result_df.plot(kind='bar', stacked=False, color=['blue', 'orange', 'green', 'red'])

# 添加标题和轴标签
plt.title('Model Comparison on Evaluation Metrics')
plt.xlabel('Models')
plt.ylabel('Score')

# 调整布局以适应所有的元素
plt.tight_layout()

# 保存图像
plt.savefig('./result/model_comparison.png')

# 显示图像
plt.show()