## 数据预处理与环境安装

In [None]:
!pip install -q bnlearn

In [None]:
import pandas as pd

def process_csv(file_path, output_path):
    """Min-max normalize the numeric columns of a CSV file and save the result.

    Each numeric column is rescaled independently to the range [0, 1] using
    that column's own minimum and maximum within this file. Non-numeric
    columns are written through unchanged. A numeric column whose range is
    zero (all values identical) becomes 0.0 instead of NaN.

    Args:
        file_path: Path of the CSV file to read.
        output_path: Path the normalized CSV is written to (without the
            DataFrame index).
    """
    df = pd.read_csv(file_path)

    # Only numeric columns can be rescaled; arithmetic on text columns raises.
    for col in df.select_dtypes(include="number").columns:
        col_min = df[col].min()
        col_max = df[col].max()
        span = col_max - col_min
        if span == 0:
            # Constant column: dividing by a zero range would turn every value
            # into NaN. Subtracting the minimum maps the values to 0 while
            # keeping any missing entries missing.
            df[col] = (df[col] - col_min).astype(float)
        else:
            df[col] = (df[col] - col_min) / span

    df.to_csv(output_path, index=False)


# Normalize the raw training file and the two raw test files, in this order.
# Each file is scaled using its own per-column minimum and maximum.
# NOTE(review): 'metrics_anomaly.csv' is written out as the *normal* test set
# ('data_test_normal.csv') - confirm this source/destination pairing.
for raw_csv, normalized_csv in (
    ('data_origin.csv', 'data_train.csv'),
    ('metrics_abnormal.csv', 'data_test_abnormal.csv'),
    ('metrics_anomaly.csv', 'data_test_normal.csv'),
):
    process_csv(raw_csv, normalized_csv)

## 数据加载与测试集合并

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Locations of the normalized CSV files.
data_train_path = 'data_train.csv'
data_test_abnormal_path = 'data_test_abnormal.csv'
data_test_normal_path = 'data_test_normal.csv'

# Load the training set, then each test set tagged with its ground-truth
# label: 0 for normal samples, 1 for abnormal samples.
train_df = pd.read_csv(data_train_path)
test_df_normal = pd.read_csv(data_test_normal_path).assign(label=0)
test_df_abnormal = pd.read_csv(data_test_abnormal_path).assign(label=1)

# Stack both test sets (normal rows first) under a fresh 0..n-1 index.
test_df = pd.concat([test_df_normal, test_df_abnormal], ignore_index=True)

## 相关性分析

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between all columns of the training data.
correlation_matrix = train_df.corr()

# Heatmap of the full correlation matrix (cell annotations are left off).
plt.figure(figsize=(12, 10))
sns.heatmap(
    correlation_matrix,
    cmap="coolwarm",
    linewidths=0.5,
    cbar=True,
    ax=plt.gca(),
)
plt.title("Feature Correlation Heatmap")
plt.show()

# The last column is assumed to be the target; take every other column's
# correlation with it.
target_feature = train_df.columns[-1]
target_column = correlation_matrix[target_feature]
feature_correlations = target_column.drop(target_feature)

# Rank features by absolute correlation, strongest first.
absolute_correlations = feature_correlations.abs()
feature_correlations_sorted = absolute_correlations.sort_values(ascending=False)

# Bar chart of the ranked feature-target correlations.
plt.figure(figsize=(10, 5))
feature_correlations_sorted.plot.bar(color="teal")
plt.xlabel("Feature")
plt.ylabel("Correlation with Target")
plt.title("Feature-Target Correlation")
plt.show()


## 训练贝叶斯网络

In [None]:
import bnlearn as bn

def construct_bayesian_network(train_df):
    """Learn a Bayesian network (structure first, then parameters).

    The structure is learned from ``train_df`` with ``methodtype='hc'`` and
    the parameters are then fitted on the same data. Both bnlearn calls run
    with ``verbose=0``.

    Args:
        train_df: Training data passed to both bnlearn fitting steps.

    Returns:
        The value returned by ``bn.parameter_learning.fit`` for the learned
        structure.
    """
    learned_structure = bn.structure_learning.fit(
        train_df, methodtype='hc', verbose=0
    )
    return bn.parameter_learning.fit(learned_structure, train_df, verbose=0)

import matplotlib.pyplot as plt

# Fit the Bayesian network on the training data.
model = construct_bayesian_network(train_df)

# Draw the learned network. Interactive mode is switched off first so that
# only the final figure is displayed, without extra text output.
plt.ioff()
bn.plot(model)
plt.show()

## 异常检测与可视化

In [None]:
def detect_anomalies(model, df, threshold=None):
    """Score rows by conditional log-likelihood and flag low-scoring rows.

    The last feature column of ``df`` is treated as the target variable and
    every other feature column as evidence. Each row's score is
    ``log P(target = observed value | evidence)``, taken from the query that
    ``bn.inference.fit`` returns. Rows scoring below ``threshold`` are flagged
    as anomalies.

    The bookkeeping columns ``'label'``, ``'log_likelihood'`` and
    ``'anomaly'`` are never used as target or evidence, so labelled test data
    and repeated calls on the same frame use the same target as the training
    data.

    Side effect: the columns ``'log_likelihood'`` and ``'anomaly'`` are added
    to (or overwritten in) ``df`` itself.

    Args:
        model: Fitted network passed through to ``bn.inference.fit``.
        df: Samples to score, one row per sample.
        threshold: Score below which a row is an anomaly. When ``None``, it is
            set to the mean minus three standard deviations of the finite
            scores computed for ``df``.

    Returns:
        A tuple ``(results, threshold)``. ``results`` holds the
        ``'log_likelihood'`` and ``'anomaly'`` columns; ``threshold`` is the
        value that was applied.

    Raises:
        ValueError: If ``df`` has no feature columns, or if ``threshold`` is
            ``None`` and no row produced a finite score.
    """
    # Imported locally because this notebook cell has no numpy import.
    import numpy as np

    bookkeeping = ('label', 'log_likelihood', 'anomaly')
    feature_cols = [col for col in df.columns if col not in bookkeeping]
    if not feature_cols:
        raise ValueError("df contains no feature columns to score")

    # The last feature column is assumed to be the target variable.
    target_variable = feature_cols[-1]

    log_likelihoods = []
    for i, sample in enumerate(df[feature_cols].to_dict('records')):
        observed = sample.pop(target_variable)  # what remains is the evidence

        try:
            query = bn.inference.fit(
                model,
                variables=[target_variable],
                evidence=sample,
                verbose=0,
            )
            # Assumes the query exposes pgmpy's DiscreteFactor attributes
            # `state_names` and `values` - TODO confirm for the installed
            # bnlearn/pgmpy versions.
            states = list(query.state_names[target_variable])
            if observed in states:
                prob = float(query.values[states.index(observed)])
            else:
                # A target value the network has no state for has zero
                # probability under the model.
                prob = 0.0
        except Exception as e:
            # Best effort: a row that cannot be scored is reported and gets a
            # missing score; a missing score is never flagged as an anomaly.
            print(f"⚠️ 计算第 {i} 行时发生错误: {e}")
            log_likelihoods.append(np.nan)
            continue

        log_likelihoods.append(np.log(prob) if prob > 0 else -np.inf)

    scores = np.asarray(log_likelihoods, dtype=float)
    df['log_likelihood'] = scores

    if threshold is None:
        # Missing (NaN) and zero-probability (-inf) scores would make the mean
        # and standard deviation meaningless, so only finite scores are used.
        finite_scores = scores[np.isfinite(scores)]
        if finite_scores.size == 0:
            raise ValueError(
                "cannot derive a threshold: no row produced a finite "
                "log-likelihood"
            )
        threshold = finite_scores.mean() - 3 * finite_scores.std()

    df['anomaly'] = df['log_likelihood'] < threshold  # below threshold => anomaly
    return df[['log_likelihood', 'anomaly']], threshold

# This cell uses numpy and the scikit-learn metric functions, neither of which
# is imported by any earlier cell.
import numpy as np
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Score the training set; with no threshold given, detect_anomalies derives
# the anomaly threshold from these scores.
train_results, threshold = detect_anomalies(model, train_df)

# Score the test set against the threshold obtained from the training set.
test_results, _ = detect_anomalies(model, test_df, threshold)

# Compare predictions with the ground-truth labels (1: anomaly, 0: normal).
y_true = test_df['label'].values
y_pred = test_results['anomaly'].astype(int).values

accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred)
recall = recall_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-score: {f1:.4f}")

# Histograms cannot bin missing or infinite values, so only finite scores are
# plotted.
train_scores = train_results['log_likelihood'].astype(float)
train_scores = train_scores[np.isfinite(train_scores)]
test_scores = test_results['log_likelihood'].astype(float)
test_scores = test_scores[np.isfinite(test_scores)]

# Score distributions of the training and test sets with the threshold marked.
plt.figure(figsize=(10, 5))
plt.hist(train_scores, bins=30, alpha=0.6, label="Train Log-Likelihood", color='blue')
plt.hist(test_scores, bins=30, alpha=0.6, label="Test Log-Likelihood", color='orange')
plt.axvline(threshold, color='red', linestyle='dashed', linewidth=2, label="Anomaly Threshold")
plt.xlabel("Log-Likelihood")
plt.ylabel("Frequency")
plt.title("Log-Likelihood Distribution (Train vs Test)")
plt.legend()
plt.show()

# Per-sample test scores, coloured by whether the sample was flagged.
plt.figure(figsize=(8, 4))
plt.scatter(range(len(test_results)), test_results['log_likelihood'], c=test_results['anomaly'], cmap="coolwarm", edgecolors='k')
plt.axhline(threshold, color='red', linestyle='dashed', linewidth=2, label="Anomaly Threshold")
plt.xlabel("Sample Index")
plt.ylabel("Log-Likelihood")
plt.title("Anomaly Detection in Test Data")
plt.legend()
plt.show()