In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, f1_score,accuracy_score
import warnings
import time
warnings.filterwarnings('ignore')

In [12]:


# ==================== 1. Dataset locations ====================
# NOTE(review): absolute, machine-specific paths; adjust before running elsewhere.
train_path, test_path = (
    r'D:\thing\AI\Dataset\newdata\KDDTrain+.csv',
    r'D:\thing\AI\Dataset\newdata\KDDTest+.csv',
)

# ==================== 2. Load data ====================
print("正在加载数据...")
df_train, df_test = (pd.read_csv(csv_path) for csv_path in (train_path, test_path))

# Report the (rows, columns) shape of each split, one line per split.
print(f"训练集形状: {df_train.shape}", f"测试集形状: {df_test.shape}", sep="\n")

# ==================== 3. Validate the label column ====================
# Name of the label column. Later steps compare its values with the string
# 'normal'. Change it here if the CSV header uses another name (e.g. 'label').
target_col = 'class'

# Fail fast: later steps index both frames by target_col, so carrying on
# without it would only surface as a less informative KeyError further down.
for split_name, frame in (("train", df_train), ("test", df_test)):
    if target_col not in frame.columns:
        print("警告: 未找到 'class' 列，可能是 'label' 或其他名称")
        print("可用列名:", frame.columns.tolist())
        raise KeyError(
            f"Label column {target_col!r} not found in the {split_name} set; "
            "set target_col to the actual label column name."
        )

# ==================== 4. Separate normal and anomalous records ====================
# Training side: keep only the rows labelled 'normal'; the model is fitted on these.
train_is_normal = df_train[target_col].eq('normal')
df_train_normal = df_train.loc[train_is_normal].copy()
print(f"\n训练集中正常样本数量: {len(df_train_normal)}")

# Test side: keep every row. Labels are binarised as 0 = normal, 1 = anomaly.
y_test_true = df_test[target_col].ne('normal').astype(int)
X_test = df_test.drop(columns=[target_col]).copy()
print(f"测试集中总样本数: {len(X_test)}")
print(f"测试集中异常样本数: {y_test_true.sum()}")

# Categorical features that are one-hot encoded in the next step.
categorical_columns = ['protocol_type', 'service', 'flag']

# Warn about, and drop, any expected categorical column the data does not have.
# The filtered list is built separately because removing items from a list
# while iterating over it skips the element that follows each removal.
absent_columns = [col for col in categorical_columns if col not in X_test.columns]
for col in absent_columns:
    print(f"警告: 列 '{col}' 不存在于数据集中")
categorical_columns = [col for col in categorical_columns if col not in absent_columns]

# ==================== 5. Feature engineering: one-hot encoding ====================
# Encode the normal-only training rows and the full test set independently.
X_train_clean = df_train_normal.drop(target_col, axis=1).copy()
X_train_encoded = pd.get_dummies(X_train_clean, columns=categorical_columns)
X_test_encoded = pd.get_dummies(X_test, columns=categorical_columns)

# Record how the two encodings differ before they are aligned.
missing_cols = set(X_train_encoded.columns) - set(X_test_encoded.columns)  # in train only
extra_cols = set(X_test_encoded.columns) - set(X_train_encoded.columns)    # in test only

# Align both frames on the sorted training columns in a single step:
# columns absent from the test encoding are created and filled with 0,
# columns that exist only in the test encoding are dropped, and both frames
# end up with the same column order.
feature_columns = sorted(X_train_encoded.columns)
X_train_encoded = X_train_encoded.reindex(columns=feature_columns)
X_test_encoded = X_test_encoded.reindex(columns=feature_columns, fill_value=0)

print(f"编码后特征数量: {X_train_encoded.shape[1]}")

# ==================== 6. (Optional) standardisation ====================
# To scale the features, enable these three lines instead of the pass-through below:
# scaler = StandardScaler()
# X_train_final = scaler.fit_transform(X_train_encoded)
# X_test_final = scaler.transform(X_test_encoded)

# Pass-through: the encoded frames are used unscaled, on the original author's
# premise that Isolation Forest is insensitive to feature scale.
X_train_final, X_test_final = X_train_encoded, X_test_encoded



正在加载数据...
训练集形状: (125973, 42)
测试集形状: (22544, 42)

训练集中正常样本数量: 67343
测试集中总样本数: 22544
测试集中异常样本数: 12833
编码后特征数量: 77


In [13]:
# ==================== 7. Fit the Isolation Forest ====================
print("\n正在训练孤立森林...")
iso_forest_settings = {
    'contamination': 0.1,    # tune from domain knowledge, or set to 'auto'
    'random_state': 42,
    'n_estimators': 100,
    'max_samples': 'auto',
    'n_jobs': -1,
}
iso_forest = IsolationForest(**iso_forest_settings)

# Left as the cell's last expression so the fitted estimator is displayed.
iso_forest.fit(X_train_final)




正在训练孤立森林...


0,1,2
,n_estimators,100
,max_samples,'auto'
,contamination,0.1
,max_features,1.0
,bootstrap,False
,n_jobs,-1
,random_state,42
,verbose,0
,warm_start,False


In [14]:
# ==================== 8. Predict and score ====================
print("正在预测...")
# predict() returns 1 for normal rows and -1 for anomalies; remap to 0 = normal, 1 = anomaly.
y_pred_test = iso_forest.predict(X_test_final)
y_pred_test_binary = np.equal(y_pred_test, -1).astype(int)

# Continuous scores, used for the ROC AUC below.
anomaly_scores = iso_forest.decision_function(X_test_final)

# Metrics are computed up front; the report section only prints them.
# decision_function is lower for more anomalous rows, so it is negated to
# make larger values correspond to the positive (anomaly) class.
roc_auc = roc_auc_score(y_test_true, -anomaly_scores)
f1_score_value = f1_score(y_test_true, y_pred_test_binary)
accuracy_score_value = accuracy_score(y_test_true, y_pred_test_binary)

# ==================== 9. Report ====================
report_rule = "=" * 50
print("\n" + report_rule)
print("孤立森林异常检测结果")
print(report_rule)
print(f"测试集真实异常比例: {y_test_true.mean():.3f}")
print(f"模型预测异常比例: {y_pred_test_binary.mean():.3f}")

print("\n混淆矩阵:")
print(confusion_matrix(y_test_true, y_pred_test_binary))

print("\n分类报告:")
class_names = ['正常 (Normal)', '异常 (Anomaly)']
print(classification_report(y_test_true, y_pred_test_binary, target_names=class_names))

print(f"\nROC AUC Score: {roc_auc:.4f}")
print(f"\nF1 Score: {f1_score_value:.4f}")
print(f"\nAccuracy Score: {accuracy_score_value:.4f}")
# ==================== 10. Save model and results (optional) ====================




正在预测...

孤立森林异常检测结果
测试集真实异常比例: 0.569
模型预测异常比例: 0.432

混淆矩阵:
[[9367  344]
 [3432 9401]]

分类报告:
              precision    recall  f1-score   support

 正常 (Normal)       0.73      0.96      0.83      9711
异常 (Anomaly)       0.96      0.73      0.83     12833

    accuracy                           0.83     22544
   macro avg       0.85      0.85      0.83     22544
weighted avg       0.86      0.83      0.83     22544


ROC AUC Score: 0.9419

F1 Score: 0.8328

Accuracy Score: 0.8325


In [15]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
def f1_scorer(estimator, X, y=None):
    """Score an outlier detector by its F1 on the anomaly class.

    Parameters
    ----------
    estimator : fitted detector whose ``predict`` returns -1 for anomalies.
    X : samples to score.
    y : true labels aligned row-by-row with ``X`` (0 = normal, 1 = anomaly).

    Returns
    -------
    float
        F1 score of the binarised predictions against ``y``.

    Raises
    ------
    ValueError
        If ``y`` is not given. Labels must belong to the rows in ``X``;
        slicing unrelated test labels to ``len(X)`` would produce a
        meaningless score, so no labels means no score.
    """
    if y is None:
        raise ValueError(
            "f1_scorer needs true labels aligned with X; "
            "pass y explicitly (0 = normal, 1 = anomaly)."
        )
    pred_binary = (estimator.predict(X) == -1).astype(int)
    return f1_score(y, pred_binary)

In [16]:
# Replaces the pass-through assignment made at the end of preprocessing:
# both feature sets become plain float64 NumPy arrays, with the columns of
# each frame taken in sorted-name order.
train_column_order = sorted(X_train_encoded.columns)
test_column_order = sorted(X_test_encoded.columns)
X_train_final = X_train_encoded.loc[:, train_column_order].to_numpy(dtype=np.float64)
X_test_final = X_test_encoded.loc[:, test_column_order].to_numpy(dtype=np.float64)
print(f"X_train_final type: {type(X_train_final)}, dtype: {X_train_final.dtype}, shape: {X_train_final.shape}")

X_train_final type: <class 'numpy.ndarray'>, dtype: float64, shape: (67343, 77)


In [17]:
# ==================== 3. Grid Search ====================
start_time = time.time()  # wall-clock start for the timing report
banner_rule = "=" * 50
print("\n" + banner_rule)
print("开始网格搜索 (Grid Search)")
print(banner_rule)

# --- Scorer handed to GridSearchCV ---
# X_train_final contains normal rows only, so a true F1 cannot be computed
# during cross-validation. This scorer is a label-free stand-in that lets
# the search run; the CV scores it produces are not a reliable quality
# measure. A labelled validation set would be the sounder choice.
def f1_scorer(estimator, X):
    """Label-free proxy score; despite the name, this is not an F1 score.

    Returns the negated mean of ``estimator.decision_function(X)``. Because
    decision_function is lower for more anomalous rows, a larger return
    value means the rows in ``X`` look more anomalous on average.
    """
    return -estimator.decision_function(X).mean()

# Candidate values per hyperparameter (3 x 3 x 3 x 3 = 81 combinations).
# NOTE(review): per the scikit-learn docs, decision_function is shifted by an
# offset derived from `contamination`, so the proxy scorer likely favours the
# largest contamination value regardless of model quality - confirm before
# relying on the selected parameters.
param_grid = dict(
    n_estimators=[50, 100, 150],
    max_samples=[50, 100, 200],
    contamination=[0.1, 0.15, 0.2],
    max_features=[0.5, 0.75, 1.0],
)

# Exhaustive search with 3-fold CV, scored by the proxy scorer defined above.
grid_search = GridSearchCV(
    IsolationForest(random_state=42),
    param_grid,
    scoring=f1_scorer,
    cv=3,
    n_jobs=-1,
    verbose=2,
)

# Fit on the training matrix (normal rows only); no labels are passed.
grid_search.fit(X_train_final)

print(f"最佳参数 (网格搜索): {grid_search.best_params_}")
print(f"最佳交叉验证分数: {grid_search.best_score_:.4f}")

# Stop the clock and report elapsed time in seconds and minutes.
end_time = time.time()
training_duration = end_time - start_time
print(f"✅ 训练耗时: {training_duration:.2f} 秒 ({training_duration / 60:.2f} 分钟)")

# ==================== Test-set evaluation (uses the true labels) ====================
# NOTE(review): the decision threshold below is picked with the test labels
# themselves, so the reported F1 / accuracy are optimistic.
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve

best_model_grid = grid_search.best_estimator_

# decision_function is larger for more normal rows; negate it so that
# larger values mean "more anomalous".
anomaly_scores = -best_model_grid.decision_function(X_test_final)

# AUC is computed on the continuous scores, so it needs no threshold.
auc_grid = roc_auc_score(y_test_true, anomaly_scores)

# Scan every candidate threshold and keep the one with the highest F1.
precision, recall, thresholds = precision_recall_curve(y_test_true, anomaly_scores)
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8   # epsilon avoids division by zero
f1_scores = f1_numerator / f1_denominator
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Rows scoring at or above the threshold are flagged as anomalies (1).
y_pred_grid_binary = (anomaly_scores >= best_threshold).astype(int)

# Final metrics against the true labels.
f1_grid = f1_score(y_test_true, y_pred_grid_binary)
accuracy_grid = accuracy_score(y_test_true, y_pred_grid_binary)

print(f"网格搜索 - 测试集 AUC: {auc_grid:.4f}")
print(f"网格搜索 - 测试集 F1-Score: {f1_grid:.4f}")
print(f"网格搜索 - 测试集 Accuracy: {accuracy_grid:.4f}")
print(f"网格搜索 - 使用的最佳阈值: {best_threshold:.4f}")


开始网格搜索 (Grid Search)
Fitting 3 folds for each of 81 candidates, totalling 243 fits
最佳参数 (网格搜索): {'contamination': 0.2, 'max_features': 1.0, 'max_samples': 200, 'n_estimators': 150}
最佳交叉验证分数: -0.0424
✅ 训练耗时: 221.98 秒 (3.70 分钟)
网格搜索 - 测试集 AUC: 0.9416
网格搜索 - 测试集 F1-Score: 0.8830
网格搜索 - 测试集 Accuracy: 0.8504
网格搜索 - 使用的最佳阈值: -0.0639


In [18]:
start_time = time.time()  # wall-clock start for the timing report
banner_rule = "=" * 50
print("\n" + banner_rule)
print("开始随机搜索 (Random Search)")
print(banner_rule)

from scipy.stats import randint, uniform

# --- Custom scorer for the randomized search ---
def neg_mean_outlier_score(estimator, X):
    """Label-free proxy score: negated mean of ``decision_function(X)``.

    The search maximises this value. Since decision_function is lower for
    more anomalous rows, a larger return value means the rows in ``X`` look
    more anomalous on average (not more normal).
    """
    return -estimator.decision_function(X).mean()

# Sampling distributions for each hyperparameter.
param_distributions = dict(
    n_estimators=randint(50, 200),          # integers 50 .. 199
    max_samples=[50, 100, 200],             # fixed choices
    contamination=uniform(0.05, 0.25),      # uniform(loc, scale): 0.05 .. 0.30
    max_features=uniform(0.3, 0.7),         # uniform(loc, scale): 0.3 .. 1.0
)

# 30 sampled candidates, 3-fold CV, scored by the proxy scorer defined above.
random_search = RandomizedSearchCV(
    IsolationForest(random_state=42),
    param_distributions,
    n_iter=30,
    scoring=neg_mean_outlier_score,
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# Fit on the training matrix (normal rows only); no labels are passed.
random_search.fit(X_train_final)

print(f"最佳参数 (随机搜索): {random_search.best_params_}")
print(f"最佳交叉验证分数: {random_search.best_score_:.4f}")

# Stop the clock and report elapsed time in seconds and minutes.
end_time = time.time()
training_duration = end_time - start_time
print(f"✅ 训练耗时: {training_duration:.2f} 秒 ({training_duration / 60:.2f} 分钟)")

# ==================== Test-set evaluation ====================
# The best model is evaluated with continuous anomaly scores plus the
# threshold that maximises F1.
# NOTE(review): that threshold is picked with the test labels themselves,
# so the reported F1 / accuracy are optimistic.
from sklearn.metrics import precision_recall_curve

best_model_random = random_search.best_estimator_

# decision_function is larger for more normal rows; negate it so that
# larger values mean "more anomalous".
anomaly_scores = -best_model_random.decision_function(X_test_final)

# AUC is computed on the continuous scores, so it needs no threshold.
auc_random = roc_auc_score(y_test_true, anomaly_scores)

# Scan every candidate threshold and keep the one with the highest F1.
precision, recall, thresholds = precision_recall_curve(y_test_true, anomaly_scores)
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8   # epsilon avoids division by zero
f1_scores = f1_numerator / f1_denominator
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Rows scoring at or above the threshold are flagged as anomalies (1).
y_pred_random_binary = (anomaly_scores >= best_threshold).astype(int)

# Final metrics against the true labels.
f1_random = f1_score(y_test_true, y_pred_random_binary)
accuracy_random = accuracy_score(y_test_true, y_pred_random_binary)

print(f"随机搜索 - 测试集 AUC: {auc_random:.4f}")
print(f"随机搜索 - 测试集 F1-Score: {f1_random:.4f}")
print(f"随机搜索 - 测试集 Accuracy: {accuracy_random:.4f}")
print(f"随机搜索 - 使用的最佳阈值: {best_threshold:.4f}")
# ==================== End of evaluation section ====================


开始随机搜索 (Random Search)
Fitting 3 folds for each of 30 candidates, totalling 90 fits
最佳参数 (随机搜索): {'contamination': np.float64(0.2804685587557792), 'max_features': np.float64(0.36194475143634364), 'max_samples': 200, 'n_estimators': 89}
最佳交叉验证分数: -0.0190
✅ 训练耗时: 155.19 秒 (2.59 分钟)
随机搜索 - 测试集 AUC: 0.9476
随机搜索 - 测试集 F1-Score: 0.8870
随机搜索 - 测试集 Accuracy: 0.8656
随机搜索 - 使用的最佳阈值: -0.0059


In [None]:
# ==================== 5. Genetic Algorithm Optimization ====================
start_time = time.time()
print("\n" + "="*50)
print("开始遗传算法优化 (使用 DEAP)")
print("="*50)

import random
import numpy as np
from deap import base, creator, tools, algorithms

# Seed the standard-library RNG used by create_individual below so that the
# search can be repeated. DEAP's operators presumably draw from the same
# `random` module - TODO confirm.
GA_RANDOM_SEED = 42
random.seed(GA_RANDOM_SEED)

# --- Search space ---
N_ESTIMATORS_RANGE = (50, 200)                # integer, both ends inclusive
MAX_SAMPLES_OPTIONS = ['auto', 50, 100, 200]  # discrete choices, addressed by index
CONTAMINATION_RANGE = (0.05, 0.3)             # float
MAX_FEATURES_RANGE = (0.3, 1.0)               # float

# --- Fitness and individual classes ---
# Single objective with weight -1.0, i.e. the fitness value is minimised.
# Guarded so that re-running the cell does not re-create existing classes.
if not hasattr(creator, "FitnessMin"):
    creator.create("FitnessMin", base.Fitness, weights=(-1.0,))
if not hasattr(creator, "Individual"):
    creator.create("Individual", list, fitness=creator.FitnessMin)

# --- Toolbox ---
toolbox = base.Toolbox()

# Gene layout:
# [n_estimators(int), max_samples_idx(int), contamination(float), max_features(float)]
def create_individual():
    """Draw one random gene list from the search space.

    Returns
    -------
    list
        ``[n_estimators, max_samples_idx, contamination, max_features]``,
        each value sampled uniformly within its range.
    """
    n_est = random.randint(*N_ESTIMATORS_RANGE)
    max_samp_idx = random.randint(0, len(MAX_SAMPLES_OPTIONS) - 1)
    contam = random.uniform(*CONTAMINATION_RANGE)
    max_feat = random.uniform(*MAX_FEATURES_RANGE)
    return [n_est, max_samp_idx, contam, max_feat]

toolbox.register("individual", tools.initIterate, creator.Individual, create_individual)
toolbox.register("population", tools.initRepeat, list, toolbox.individual)

# --- Gene bounds, in gene order ---
GENE_LOWER_BOUNDS = [N_ESTIMATORS_RANGE[0], 0, CONTAMINATION_RANGE[0], MAX_FEATURES_RANGE[0]]
GENE_UPPER_BOUNDS = [N_ESTIMATORS_RANGE[1], len(MAX_SAMPLES_OPTIONS) - 1,
                     CONTAMINATION_RANGE[1], MAX_FEATURES_RANGE[1]]


def _decode_individual(ind):
    """Turn a gene list into IsolationForest keyword arguments.

    Every gene is clamped to its search range first, n_estimators included,
    so a gene pushed out of range by crossover can never yield an invalid
    (for example zero or negative) hyperparameter.
    """
    max_samples_idx = int(np.clip(ind[1], 0, len(MAX_SAMPLES_OPTIONS) - 1))
    return {
        'n_estimators': int(np.clip(ind[0], *N_ESTIMATORS_RANGE)),
        'max_samples': MAX_SAMPLES_OPTIONS[max_samples_idx],
        'contamination': float(np.clip(ind[2], *CONTAMINATION_RANGE)),
        'max_features': float(np.clip(ind[3], *MAX_FEATURES_RANGE)),
    }


# --- Fitness function ---
def evaluate_individual(ind):
    """Fitness of one individual: negated mean decision_function on the training data.

    The model is fitted and scored on the same X_train_final. Returns a
    one-element tuple, as DEAP requires; the value is minimised.
    """
    model = IsolationForest(
        **_decode_individual(ind),
        random_state=42,
        n_jobs=-1  # parallelise each individual fit
    )
    scores = model.fit(X_train_final).decision_function(X_train_final)
    return (-scores.mean(),)

toolbox.register("evaluate", evaluate_individual)


def _keep_within_bounds(lower, upper):
    """Decorator factory: clamp each gene of every returned individual to its bounds."""
    def decorator(operator):
        def bounded_operator(*args, **kwargs):
            offspring = operator(*args, **kwargs)
            for child in offspring:
                for i, (low_i, up_i) in enumerate(zip(lower, upper)):
                    child[i] = min(max(child[i], low_i), up_i)
            return offspring
        return bounded_operator
    return decorator


# --- Genetic operators ---
toolbox.register("mate", tools.cxBlend, alpha=0.5)      # blend crossover on real-valued genes
toolbox.register("mutate", tools.mutPolynomialBounded,
                 low=GENE_LOWER_BOUNDS, up=GENE_UPPER_BOUNDS,
                 eta=0.1, indpb=0.2)
toolbox.register("select", tools.selTournament, tournsize=3)
# Blend crossover can produce genes outside the parents' range, and the
# bounded mutation expects its input to lie inside [low, up]. Offspring are
# therefore clamped straight after mating.
toolbox.decorate("mate", _keep_within_bounds(GENE_LOWER_BOUNDS, GENE_UPPER_BOUNDS))

# --- Run the genetic algorithm ---
POP_SIZE = 30      # population size
N_GEN = 20         # generations (original note: total evaluations ~ POP_SIZE x N_GEN = 600)

print(f"正在执行遗传算法优化... (种群={POP_SIZE}, 代数={N_GEN})")
pop = toolbox.population(n=POP_SIZE)
hof = tools.HallOfFame(1)  # keeps the best individual seen so far

stats = tools.Statistics(lambda ind: ind.fitness.values)
stats.register("avg", np.mean)
stats.register("min", np.min)
stats.register("max", np.max)

# Evolve the population.
algorithms.eaSimple(pop, toolbox,
                    cxpb=0.5,     # crossover probability
                    mutpb=0.2,    # mutation probability
                    ngen=N_GEN,
                    stats=stats,
                    halloffame=hof,
                    verbose=True)

# --- Extract the best parameters (same decoding as the fitness function) ---
best_ind = hof[0]
best_params_ga = _decode_individual(best_ind)

best_score = best_ind.fitness.values[0]
print(f"\n最佳参数 (遗传算法): {best_params_ga}")
print(f"最佳适应度值: {best_score:.4f}")

end_time = time.time()
training_duration = end_time - start_time
print(f"✅ 训练耗时: {training_duration:.2f} 秒 ({training_duration / 60:.2f} 分钟)")

# ==================== Test-set evaluation (best-F1 threshold) ====================
# NOTE(review): the decision threshold below is picked with the test labels
# themselves, so the reported F1 / accuracy are optimistic.
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve

# Refit a model with the best GA parameters on the training matrix.
best_model_ga = IsolationForest(random_state=42, **best_params_ga)
best_model_ga.fit(X_train_final)

# Negated decision_function: larger values mean "more anomalous".
anomaly_scores = -best_model_ga.decision_function(X_test_final)

# AUC is computed on the continuous scores, so it needs no threshold.
auc_ga = roc_auc_score(y_test_true, anomaly_scores)

# Scan every candidate threshold and keep the one with the highest F1.
precision, recall, thresholds = precision_recall_curve(y_test_true, anomaly_scores)
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8   # epsilon avoids division by zero
f1_scores = f1_numerator / f1_denominator
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Rows scoring at or above the threshold are flagged as anomalies (1).
y_pred_ga_binary = (anomaly_scores >= best_threshold).astype(int)
f1_ga = f1_score(y_test_true, y_pred_ga_binary)
accuracy_ga = accuracy_score(y_test_true, y_pred_ga_binary)

print(f"遗传算法 - 测试集 AUC: {auc_ga:.4f}")
print(f"遗传算法 - 测试集 F1-Score: {f1_ga:.4f}")
print(f"遗传算法 - 测试集 Accuracy: {accuracy_ga:.4f}")
print(f"遗传算法 - 使用的最佳阈值: {best_threshold:.4f}")


开始遗传算法优化
正在执行遗传算法搜索...
Iteration No: 1 started. Evaluating function at random point.
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 2.9073
Function value obtained: -0.0249
Current minimum: -0.0249
Iteration No: 2 started. Evaluating function at random point.
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 1.5759
Function value obtained: -0.0561
Current minimum: -0.0561
Iteration No: 3 started. Evaluating function at random point.
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 1.2424
Function value obtained: -0.1292
Current minimum: -0.1292
Iteration No: 4 started. Evaluating function at random point.
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 3.2466
Function value obtained: -0.0150
Current minimum: -0.1292
Iteration No: 5 started. Evaluating function at random point.
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 2.2687
Function value obtained: -0.1168
Current minimum: -0.1292
I

In [20]:
# ==================== 6. Particle Swarm Optimization ====================
start_time = time.time()  # wall-clock start for the timing report
banner_rule = "=" * 50
print("\n" + banner_rule)
print("开始粒子群优化")
print(banner_rule)

from sko.PSO import PSO
import numpy as np

# --- Objective (fitness) function ---
def objective_function(args):
    """Fitness of one particle: negated mean decision_function on the training data.

    ``args`` is indexed as
    ``[n_estimators, max_samples_idx, contamination, max_features]``.
    The first entry is truncated to int, the second is clamped and used as
    an index into the max_samples choices, and the last two are passed on
    unchanged. The model is fitted and scored on the same X_train_final.
    """
    raw_n_estimators, raw_choice = args[0], args[1]
    contamination, max_features = args[2], args[3]

    # Decode the max_samples choice from its (possibly fractional) index.
    max_samples_options = ['auto', 50, 100, 200]
    last_choice = len(max_samples_options) - 1
    max_samples_idx = int(np.clip(raw_choice, 0, last_choice))

    model = IsolationForest(
        n_estimators=int(raw_n_estimators),
        max_samples=max_samples_options[max_samples_idx],
        contamination=contamination,
        max_features=max_features,
        random_state=42,
        n_jobs=1,
    )
    model.fit(X_train_final)
    return -model.decision_function(X_train_final).mean()

# --- Search space ---
# Order: [n_estimators, max_samples_idx, contamination, max_features]
bounds = [
    [50, 0, 0.05, 0.3],   # lower bounds
    [200, 3, 0.3, 1.0]    # upper bounds
]

# Seed NumPy's global RNG so the swarm search can be repeated.
# NOTE(review): assumes scikit-opt's PSO draws from np.random - confirm.
PSO_RANDOM_SEED = 42
np.random.seed(PSO_RANDOM_SEED)

# --- Run particle swarm optimization ---
print("正在执行粒子群优化...")
pso = PSO(
    func=objective_function,
    n_dim=4,
    pop=20,                # swarm size
    max_iter=30,           # maximum number of iterations
    lb=bounds[0],
    ub=bounds[1],
    w=0.8,
    c1=0.5,
    c2=0.5
)

pso.run()

# --- Extract the best parameters ---
best_args = pso.gbest_x
best_score = pso.gbest_y

# Decode the best position with the same rules the objective uses. The
# floats are converted to plain Python floats so the printed parameters
# read as numbers rather than as np.float64(...) wrappers.
pso_max_samples_options = ['auto', 50, 100, 200]
best_params_pso = {
    'n_estimators': int(best_args[0]),
    'max_samples': pso_max_samples_options[int(np.clip(best_args[1], 0, 3))],
    'contamination': float(best_args[2]),
    'max_features': float(best_args[3])
}

print(f"\n最佳参数 (粒子群优化): {best_params_pso}")
print(f"最佳适应度值: {np.squeeze(best_score):.4f}")

end_time = time.time()
training_duration = end_time - start_time
print(f"✅ 训练耗时: {training_duration:.2f} 秒 ({training_duration / 60:.2f} 分钟)")

# ==================== Test-set evaluation ====================
# The best model is evaluated with continuous anomaly scores plus the
# threshold that maximises F1.
# NOTE(review): that threshold is picked with the test labels themselves,
# so the reported F1 / accuracy are optimistic.
from sklearn.metrics import roc_auc_score, f1_score, accuracy_score, precision_recall_curve

# Refit a model with the best PSO parameters on the training matrix.
best_model_pso = IsolationForest(random_state=42, **best_params_pso)
best_model_pso.fit(X_train_final)

# Negated decision_function: larger values mean "more anomalous".
anomaly_scores = -best_model_pso.decision_function(X_test_final)

# AUC is computed on the continuous scores, so it needs no threshold.
auc_pso = roc_auc_score(y_test_true, anomaly_scores)

# Scan every candidate threshold and keep the one with the highest F1.
precision, recall, thresholds = precision_recall_curve(y_test_true, anomaly_scores)
f1_numerator = 2 * (precision * recall)
f1_denominator = precision + recall + 1e-8   # epsilon avoids division by zero
f1_scores = f1_numerator / f1_denominator
best_idx = np.argmax(f1_scores)
best_threshold = thresholds[best_idx]

# Rows scoring at or above the threshold are flagged as anomalies (1).
y_pred_pso_binary = (anomaly_scores >= best_threshold).astype(int)

# Final metrics against the true labels.
f1_pso = f1_score(y_test_true, y_pred_pso_binary)
accuracy_pso = accuracy_score(y_test_true, y_pred_pso_binary)

print(f"粒子群优化 - 测试集 AUC: {auc_pso:.4f}")
print(f"粒子群优化 - 测试集 F1-Score: {f1_pso:.4f}")
print(f"粒子群优化 - 测试集 Accuracy: {accuracy_pso:.4f}")
print(f"粒子群优化 - 使用的最佳阈值: {best_threshold:.4f}")
# ==================== End of evaluation section ====================


开始粒子群优化
正在执行粒子群优化...

最佳参数 (粒子群优化): {'n_estimators': 186, 'max_samples': 50, 'contamination': np.float64(0.05), 'max_features': np.float64(0.7174926792636086)}
最佳适应度值: -0.1630
✅ 训练耗时: 1622.63 秒 (27.04 分钟)
粒子群优化 - 测试集 AUC: 0.9304
粒子群优化 - 测试集 F1-Score: 0.8675
粒子群优化 - 测试集 Accuracy: 0.8283
粒子群优化 - 使用的最佳阈值: -0.2000
