In [55]:
import numpy as np
import pandas as pd
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import IsolationForest
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from pyod.models.abod import ABOD
from pyod.models.ecod import ECOD
import matplotlib.pyplot as plt

In [56]:
def box_plot_outliers(data):
    """Flag values lying outside the 1.5 * IQR box-plot whiskers.

    Quartiles are computed along axis 0, so every column gets its own
    lower and upper fence.

    Parameters
    ----------
    data : array-like
        Values to test.

    Returns
    -------
    numpy.ndarray of bool
        Element-wise mask, True where a value is below ``Q1 - 1.5 * IQR``
        or above ``Q3 + 1.5 * IQR`` of its column.
    """
    first_quartile = np.percentile(data, 25, axis=0)
    third_quartile = np.percentile(data, 75, axis=0)
    # Whisker length: 1.5 times the inter-quartile range.
    whisker = 1.5 * (third_quartile - first_quartile)
    below_fence = data < first_quartile - whisker
    above_fence = data > third_quartile + whisker
    return below_fence | above_fence

In [57]:
def label(data):
    """Build consensus anomaly labels for `data` from five outlier detectors.

    LOF, Isolation Forest, the IQR box-plot rule, ABOD and ECOD are each
    run on `data` (the model-based detectors with ``contamination=0.5``).
    A row is labelled normal (0) only when all five detectors agree it is
    normal; it is labelled anomalous (1) as soon as any detector flags it.

    Parameters
    ----------
    data : array-like, 2-D
        One sample per row. The detectors are fitted on exactly this
        argument (not on any module-level variable).

    Returns
    -------
    list of int
        One 0/1 label per row of `data`, in row order.
    """
    # Method 1: Local Outlier Factor (LOF).
    lof = LocalOutlierFactor(n_neighbors=100, contamination=0.5)
    y_pred_lof = lof.fit_predict(data)
    # LOF returns 1 for inliers and -1 for outliers; convert to 0/1.
    y_pred_lof = np.where(y_pred_lof == 1, 0, 1)

    # Method 2: Isolation Forest (IF).
    iforest = IsolationForest(contamination=0.5, random_state=42)
    y_pred_if = iforest.fit_predict(data)
    # IF returns 1 for inliers and -1 for outliers; convert to 0/1.
    y_pred_if = np.where(y_pred_if == 1, 0, 1)

    # Method 3: box-plot (IQR) rule; a row is flagged if any column is out of range.
    outliers = box_plot_outliers(data)
    y_pred_box_plot = np.any(outliers, axis=1).astype(int)

    # Method 4: ABOD.
    abod = ABOD(contamination=0.5)
    abod.fit(data)
    y_pred_abod = abod.labels_

    # Method 5: ECOD.
    ecod = ECOD(contamination=0.5)
    ecod.fit(data)
    y_pred_ecod = ecod.labels_

    # Union vote: exactly one label is emitted per row, so the result can
    # never get out of step with the rows of `data`.
    detector_votes = (y_pred_lof, y_pred_abod, y_pred_ecod, y_pred_box_plot, y_pred_if)
    return [int(any(vote == 1 for vote in row)) for row in zip(*detector_votes)]


In [58]:
# Load one saved embeddings array and view it as a 200 x 16 matrix.
# reshape raises ValueError if the file does not hold exactly 200 * 16 values.
file_path = 'embeddings.npy'
embeddings = np.load(file_path).reshape(200,16)

In [59]:
# Compute the consensus 0/1 labels for the `embeddings` array and display
# the whole list as the cell output.
a = label(embeddings)
a

[1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 1]

In [60]:
# Build the reference labels for datasets ACCESS1 .. ACCESS7 from their ETA
# embeddings: one list of per-row 0/1 labels per dataset.
Labels = []
for i in range (1,8):
    # Note: this loop rebinds the module-level names `file_path` and
    # `embeddings` on every iteration; after it finishes they refer to ACCESS7.
    file_path = f'Embeddings_ACCESS{i}_ETA.npy'
    embeddings = np.load(file_path).reshape(200,16)
    Label = label(embeddings)
    Labels.append(Label)
# Stack the per-dataset lists into a single 2-D array (one row per dataset).
Labels = np.array(Labels)


  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return np.nan_to_num(skew_sp(X, axis=axis))
  return np.nan_to_num(skew_sp(X, axis=axis))
  return np.nan_to_num(skew_sp(X, axis=axis))


In [61]:
Labels.shape

(7, 200)

In [62]:
def detect_anomalies_lof(data):
    """Detect anomalies in `data` with Local Outlier Factor.

    Parameters
    ----------
    data : array-like
        Samples handed to ``LocalOutlierFactor.fit_predict``.

    Returns
    -------
    numpy.ndarray of int
        1 where LOF reports an outlier (-1), 0 otherwise.
    """
    detector = LocalOutlierFactor(n_neighbors=100, contamination=0.5)
    raw_flags = detector.fit_predict(data)
    # fit_predict marks outliers with -1; turn that into a 0/1 indicator.
    is_outlier = raw_flags == -1
    return is_outlier.astype(int)

def detect_anomalies_if(data):
    """Detect anomalies in `data` with an Isolation Forest.

    Parameters
    ----------
    data : array-like
        Samples handed to ``IsolationForest.fit_predict``.

    Returns
    -------
    numpy.ndarray of int
        1 where the forest reports an outlier (-1), 0 otherwise.
    """
    forest = IsolationForest(contamination=0.5, random_state=42)
    raw_flags = forest.fit_predict(data)
    # fit_predict marks outliers with -1; turn that into a 0/1 indicator.
    is_outlier = raw_flags == -1
    return is_outlier.astype(int)

def detect_anomalies_abod(data):
    """Detect anomalies in `data` with pyod's ABOD detector.

    The detector is fitted on `data` and then asked to predict on the
    same `data`; the prediction is returned unchanged.

    Parameters
    ----------
    data : array-like
        Samples used both for fitting and for prediction.

    Returns
    -------
    The value returned by ``ABOD.predict(data)``.
    """
    detector = ABOD(contamination=0.5)
    detector.fit(data)
    return detector.predict(data)

def detect_anomalies_ecod(data):
    """Detect anomalies in `data` with pyod's ECOD detector.

    The detector is fitted on `data` and then asked to predict on the
    same `data`; the prediction is returned unchanged.

    Parameters
    ----------
    data : array-like
        Samples used both for fitting and for prediction.

    Returns
    -------
    The value returned by ``ECOD.predict(data)``.
    """
    detector = ECOD(contamination=0.5)
    detector.fit(data)
    return detector.predict(data)

def detect_anomalies_box_plot(data):
    """Detect anomalous rows with the 1.5 * IQR box-plot rule.

    Quartiles are taken per column (axis 0). A row is flagged when at
    least one of its values falls outside that column's fences.

    Parameters
    ----------
    data : array-like, 2-D
        One sample per row.

    Returns
    -------
    numpy.ndarray of int
        1 for rows with at least one out-of-fence value, 0 otherwise.
    """
    q25 = np.percentile(data, 25, axis=0)
    q75 = np.percentile(data, 75, axis=0)
    spread = q75 - q25
    low_fence = q25 - 1.5 * spread
    high_fence = q75 + 1.5 * spread
    out_of_range = (data < low_fence) | (data > high_fence)
    # Collapse the per-value mask to one flag per row.
    row_flags = np.any(out_of_range, axis=1)
    return row_flags.astype(int)

# Evaluation metrics
def evaluate_performance(y_true, y_pred):
    """Score predictions against reference labels.

    Parameters
    ----------
    y_true : array-like
        Reference labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    tuple
        ``(accuracy, precision, recall, f1)`` as computed by the
        corresponding scikit-learn metric functions with default settings.
    """
    # Order matters: callers unpack the tuple positionally.
    scorers = (accuracy_score, precision_score, recall_score, f1_score)
    return tuple(scorer(y_true, y_pred) for scorer in scorers)

In [63]:
def main():
    """Evaluate every detector on every dataset/embedding combination.

    For datasets ACCESS1..ACCESS7 and each embedding method, the embedding
    file is loaded as a 200 x 16 matrix, each of the five detectors is run
    on it, and its predictions are scored against the matching row of the
    module-level `Labels` array. All scores are collected in a DataFrame,
    written to 'Results.csv' and printed.
    """
    # Detector name -> function; insertion order fixes the evaluation order.
    detectors = {
        'LOF': detect_anomalies_lof,
        'IF': detect_anomalies_if,
        'BOX-plot': detect_anomalies_box_plot,
        'ABOD': detect_anomalies_abod,
        'ECOD': detect_anomalies_ecod,
    }
    embedding_methods = ['ETA', 'gcn', 'aernn', 'deepwork', 'node2vec', 'structure2vec']

    rows = []
    for dataset in range(1, 8):
        # Reference labels for this dataset (Labels is indexed from 0).
        reference = Labels[dataset-1]
        for method in embedding_methods:
            matrix = np.load(f'Embeddings_ACCESS{dataset}_{method}.npy').reshape(200,16)

            for detection_method, detect in detectors.items():
                predicted = detect(matrix)
                accuracy, precision, recall, f1 = evaluate_performance(reference, predicted)
                rows.append({
                    'Dataset': f'ACCESS{dataset}',
                    'Method': method,
                    'Detection Method': detection_method,
                    'Accuracy': accuracy,
                    'Precision': precision,
                    'Recall': recall,
                    'F1-score': f1
                })

    results_df = pd.DataFrame(rows)
    results_df.to_csv('Results.csv')
    print(results_df)

# Run the full evaluation only when this code is executed as the main module.
if __name__ == "__main__":
    main()

  _warn_prf(average, modifier, msg_start, len(result))
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  return np.nan_to_num(skew_sp(X, axis=axis))
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  return _methods._var(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  arrmean = um.true_divide(arrmean, div, out=arrmean,
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_star

     Dataset         Method Detection Method  Accuracy  Precision    Recall   
0    ACCESS1            ETA              LOF     0.935   1.000000  0.884956  \
1    ACCESS1            ETA               IF     0.435   0.000000  0.000000   
2    ACCESS1            ETA         BOX-plot     0.890   1.000000  0.805310   
3    ACCESS1            ETA             ABOD     0.435   0.000000  0.000000   
4    ACCESS1            ETA             ECOD     0.860   1.000000  0.752212   
..       ...            ...              ...       ...        ...       ...   
205  ACCESS7  structure2vec              LOF     0.455   0.610000  0.465649   
206  ACCESS7  structure2vec               IF     0.445   0.600000  0.458015   
207  ACCESS7  structure2vec         BOX-plot     0.475   0.861111  0.236641   
208  ACCESS7  structure2vec             ABOD     0.650   0.808081  0.610687   
209  ACCESS7  structure2vec             ECOD     0.405   0.560000  0.427481   

     F1-score  
0    0.938967  
1    0.000000  
2  