### Calculate MinN

In [None]:
import numpy as np
from sklearn.neighbors import BallTree, NearestNeighbors
from data_preprocess.config import bank,credit,census,meps
import joblib
import os
from sklearn.ensemble import IsolationForest
from data_preprocess.bank import bank_data
from data_preprocess.credit import credit_data
from data_preprocess.census import census_data
from data_preprocess.meps import meps_data
import pandas as pd
import miniball

# Datasets and generation methods compared in this experiment.
dataset_names = ["bank", "census", "credit", "meps"]
method_names = [
    "Cobweb_BT_random",
    "Cobweb_GAN_BT_random",
    "expga",
    "LIMI",
]
# CSV column headers: one column per method, in the same order.
column_names = list(method_names)

# Per-dataset config modules and data-loading callables, keyed by dataset name.
data_config = {
    "census": census,
    "credit": credit,
    "bank": bank,
    "meps": meps,
}
dataset_dict = {
    "census": census_data,
    "credit": credit_data,
    "bank": bank_data,
    "meps": meps_data,
}

# Summarise fixed-size row chunks of the data as balls (center + radius)
def get_leaf_nodes(tree, data, leaf_size):
    """Describe consecutive row chunks of ``data`` as (center, radius) balls.

    The rows are cut into consecutive slices of ``leaf_size`` rows (the last
    slice may be shorter). For each slice, the center is the per-column mean
    and the radius is the largest Euclidean distance from a row to that center.

    NOTE(review): ``tree`` is accepted but never read, so the chunks follow
    the row order of ``data`` rather than the tree's own leaf partition --
    confirm this is intended.

    Args:
        tree: Not used by this function.
        data: 2-D array with one sample per row.
        leaf_size: Number of rows per chunk.

    Returns:
        A tuple ``(centers, radii)`` of NumPy arrays with one entry per chunk.
    """
    chunks = [
        data[start:start + leaf_size]
        for start in range(0, data.shape[0], leaf_size)
    ]

    # Centroid of every chunk, then the farthest member from that centroid.
    chunk_centers = [np.mean(chunk, axis=0) for chunk in chunks]
    chunk_radii = [
        np.linalg.norm(chunk - center, axis=1).max()
        for chunk, center in zip(chunks, chunk_centers)
    ]

    return np.array(chunk_centers), np.array(chunk_radii)

# Greedily merge centers that lie within merge_radius of one another
def merge_nodes(centers, radii, merge_radius):
    """Count the groups left after greedily merging nearby centers.

    Centers are visited in index order. Every center that has not yet been
    absorbed opens a new group and absorbs all centers that
    ``NearestNeighbors.radius_neighbors`` reports within ``merge_radius``
    of it.

    Args:
        centers: Array of center coordinates, one center per row.
        radii: Not used by this function.
        merge_radius: Radius passed to ``NearestNeighbors``.

    Returns:
        The number of groups opened, as an int.
    """
    neighbor_index = NearestNeighbors(radius=merge_radius).fit(centers)
    absorbed = np.zeros(len(centers), dtype=bool)
    group_count = 0

    for position, center in enumerate(centers):
        if absorbed[position]:
            continue
        nearby = neighbor_index.radius_neighbors(
            [center], return_distance=False
        )[0]
        absorbed[nearby] = True
        group_count += 1

    return group_count

if __name__ == '__main__':
    # For every dataset / sensitive attribute, load each method's generated
    # samples, summarise them as chunk balls, greedily merge nearby balls and
    # record the resulting count per method in a CSV file.
    model = 'MLP'

    merge_radius = 1  # Sets the distance of the merged sphere
    leaf_size = 40  # Rows per chunk; also passed to BallTree
    max_samples = 10000  # Upper bound on rows sampled per result file

    log = []
    log_index = []
    log_path = f'./result/RQ3/{model}_MinN.csv'
    # Create the output directory up front so a missing folder cannot make
    # the final to_csv call fail after all the computation has been done.
    os.makedirs(os.path.dirname(log_path), exist_ok=True)

    for dataset_name in dataset_names:
        for sens_index in data_config[dataset_name].sensitive_param:
            print('==========================================')
            print(f'========{dataset_name}-{sens_index}-{model}=========')
            final_node_counts = []
            for method_name in method_names:
                # Result files of 'BF' methods use the raw index in their
                # name; all other methods use index + 1.
                sens = sens_index if 'BF' in method_name else sens_index + 1
                data_path = f'result/{method_name}/{dataset_name}{sens}_{model}_{method_name}.npy'

                data = np.load(data_path)
                # Random subsample without replacement (unseeded, so the
                # selection differs between runs).
                sampled_rows = np.random.choice(
                    data.shape[0], min(max_samples, len(data)), replace=False
                )
                data = data[sampled_rows]

                ball_tree = BallTree(data, leaf_size=leaf_size)

                # Get the center and radius of leaf nodes
                centers, radii = get_leaf_nodes(ball_tree, data, leaf_size)

                # Calculate the number of nodes after merging
                final_node_count = merge_nodes(centers, radii, merge_radius)
                print(method_name, np.round(np.mean(radii)), final_node_count)
                final_node_counts.append(final_node_count)
            log_index.append(f'{dataset_name} {sens_index}')
            log.append(final_node_counts)

    # The logged values are integer counts, so no rounding is needed.
    df = pd.DataFrame(log, columns=column_names, index=log_index)
    df.to_csv(log_path, index=True)


### TSNE

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec, GridSpecFromSubplotSpec
import os
from data_preprocess.config import bank, census, credit, meps
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
import seaborn as sns

if __name__ == '__main__':
    # Draw one 2x2 group of t-SNE scatter plots (one panel per generation
    # method) for every dataset / sensitive-attribute pair, then save the
    # combined figure.
    method_names = ['expga', 'LIMI', 'Cobweb_BT_random', 'Cobweb_GAN_BT_random']
    legend_names = ['Expga', 'LIMI', 'Cobweb', 'Cobweb GAN']
    dataset_names = ['bank', 'census', 'credit', 'meps']
    data_config = {"census": census, "credit": credit, "bank": bank, "meps": meps}
    max_samples = 10000  # Upper bound on rows embedded per method
    output_path = './result/RQ2TSNE/compare.png'
    # Create the output directory up front so the long t-SNE run cannot be
    # lost to a missing folder at save time.
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

    # Set the global font size
    plt.rcParams.update({'font.size': 42})
    fig = plt.figure(figsize=(24, 12))
    # NOTE(review): a 2x4 grid holds at most 8 dataset/attribute pairs;
    # indexing gs[count] beyond that raises IndexError.
    gs = GridSpec(nrows=2, ncols=4, figure=fig)

    # Seaborn styling does not depend on the loop variables; set it once.
    sns.set_theme(style="whitegrid")
    sns.set_context("notebook", rc={"font.size": 14,
                                    "axes.titlesize": 16,
                                    "axes.labelsize": 14})
    colors = sns.color_palette("Set2")

    count = 0
    for dataset_name in dataset_names:
        for sens_param_index in data_config[dataset_name].sensitive_param:
            print(f'-------------{dataset_name}-{sens_param_index}---------------------------')
            sens_name = data_config[dataset_name].sens_name[sens_param_index + 1]

            # Read each method's samples, delete the sensitive attribute
            # column and subsample.
            data_list = []
            for method_name in method_names:
                sens = sens_param_index if 'BF' in method_name else sens_param_index + 1
                data = np.load(f'./result/{method_name}/{dataset_name}{sens}_MLP_{method_name}.npy')
                # NOTE(review): the removed column is sens_param_index while
                # the name lookup above uses sens_param_index + 1 -- confirm
                # both refer to the same attribute.
                data = np.delete(data, [sens_param_index], axis=1)
                kept_rows = np.random.choice(
                    data.shape[0], min(len(data), max_samples), replace=False
                )
                data_list.append(data[kept_rows])

            # Standardization: fit on all methods together so every panel is
            # embedded from identically scaled features. (Previously the
            # scaled array was computed but never used.)
            scaler = StandardScaler().fit(np.vstack(data_list))

            # NOTE(review): scikit-learn 1.5 renamed `n_iter` to `max_iter`;
            # switch the keyword once the project's minimum version allows.
            tsne = TSNE(n_components=2, perplexity=50, n_iter=1000, random_state=0)
            # Each method is embedded by its own independent t-SNE fit.
            tsne_results = [
                tsne.fit_transform(scaler.transform(data)) for data in data_list
            ]

            sub_gs = GridSpecFromSubplotSpec(2, 2, subplot_spec=gs[count])
            count += 1

            main_ax = None
            for i, (tsne_result, legend_name, color) in enumerate(
                zip(tsne_results, legend_names, colors)
            ):
                row, col = divmod(i, 2)
                if main_ax is None:
                    ax = fig.add_subplot(sub_gs[row, col])
                    main_ax = ax
                else:
                    # Later panels share their axes with the first panel.
                    ax = fig.add_subplot(sub_gs[row, col], sharex=main_ax, sharey=main_ax)
                ax.scatter(tsne_result[:, 0], tsne_result[:, 1], label=legend_name, color=color)
                # add legend
                ax.legend(loc='upper right')

            # The group title goes on the first panel; set it once.
            if main_ax is not None:
                main_ax.set_title(f'{dataset_name} - {sens_name}')

    fig.suptitle('TSNE visualization')
    # Automatically adjust layout
    plt.tight_layout()
    plt.savefig(output_path)



### Filter real data and record the real values

In [None]:
from sklearn.ensemble import IsolationForest
import numpy as np
from data_preprocess.bank import bank_data
from data_preprocess.credit import credit_data
from data_preprocess.census import census_data
from data_preprocess.meps import meps_data
from data_preprocess.config import bank,credit,census,meps

import joblib
import os
import pandas as pd
from sklearn.decomposition import PCA

# Data-loading callables and config modules, keyed by dataset name.
dataset_dict = {
    "census": census_data,
    "credit": credit_data,
    "bank": bank_data,
    "meps": meps_data,
}
data_config = {
    "census": census,
    "credit": credit,
    "bank": bank,
    "meps": meps,
}

# Decide whether a new sample is real data or an outlier
def is_real_data(sample, iso_forest):
    """Classify one sample with a fitted outlier detector.

    Args:
        sample: Array holding a single sample; it is reshaped to one row
            before prediction.
        iso_forest: Object exposing ``predict`` (the script passes a fitted
            ``IsolationForest``).

    Returns:
        The element-wise comparison of the prediction with ``1``.
    """
    single_row = sample.reshape(1, -1)
    label = iso_forest.predict(single_row)
    # 1 means real data, -1 means outlier
    return label == 1

if __name__ == "__main__":
    # For every dataset / sensitive attribute / method, keep only the
    # generated samples that an IsolationForest fitted on the real dataset
    # predicts as inliers, save them, and record the inlier rate and the
    # number of samples kept.
    dataset_names = ['bank','census', 'credit', 'meps']
    # dataset_names = ['meps']
    method_names = ['Cobweb_BT_random','Cobweb_BT_nocluster_random','expga','LIMI']
    column_names = ['Cobweb_BT_random', 'Cobweb_BT_nocluster_random','expga','LIMI']
    # method_names = ['Cobweb_BT_random','Cobweb_BT_GR_random','Cobweb_LR_random','Cobweb_GAN_BT_random','Cobweb_GAN_BT_GR_random','Cobweb_GAN_LR_random', 'expga', 'LIMI']
    # column_names = ['Cobweb_BT_random','Cobweb_BT_GR_random','Cobweb_LR_random','Cobweb_GAN_BT_random','Cobweb_GAN_BT_GR_random','Cobweb_GAN_LR_random', 'expga', 'LIMI']
    # method_names = ['Cobweb_BT_random','Cobweb_GAN_BT_random', 'expga', 'LIMI']
    # column_names = ['Cobweb_BT_random','Cobweb_GAN_BT_random', 'expga', 'LIMI']

    model = "MLP"
    new_data_lens = []
    result_info = []
    row_index = []
    result_path = f'result/{model}_natural.csv'
    lens_path = f'natural_IDS/{model}_NID.csv'
    iso_dir = 'model_info/iso'

    # Create every output directory up front so no write fails halfway
    # through the run because a folder is missing.
    os.makedirs(iso_dir, exist_ok=True)
    os.makedirs(os.path.dirname(result_path), exist_ok=True)
    os.makedirs(os.path.dirname(lens_path), exist_ok=True)
    for method_name in method_names:
        os.makedirs(f'natural_IDS/{method_name}/', exist_ok=True)

    for dataset_name in dataset_names:

        real_data, Y, input_shape, nb_classes = dataset_dict[dataset_name]()

        # Reuse a cached detector when one exists; otherwise fit and cache it.
        iso_path = f'{iso_dir}/{dataset_name}.pkl'
        if os.path.exists(iso_path):
            iso_forest = joblib.load(iso_path)
        else:
            print("training")
            iso_forest = IsolationForest(contamination='auto', random_state=42).fit(real_data)
            joblib.dump(iso_forest, iso_path)
            print('train over')

        for sens_param_index in data_config[dataset_name].sensitive_param:
            row_index.append(f'{dataset_name}_{sens_param_index}')
            print(f'------------------------{dataset_name}-{sens_param_index}---------------------')
            temp_result = []
            temp_lens = []
            for method_name in method_names:
                # Result files of 'BF' methods use the raw index in their
                # name; all other methods use index + 1.
                sens = sens_param_index if 'BF' in method_name else sens_param_index + 1
                data_path = f'result/{method_name}/{dataset_name}{sens}_{model}_{method_name}.npy'
                data = np.load(data_path)

                if len(data) == 0:
                    # Nothing to classify: the rate is undefined and the
                    # saved subset is empty.
                    real_mask = np.zeros(0, dtype=bool)
                    suc_rate = float('nan')
                else:
                    # One batched predict call replaces a Python-level call
                    # per row; 1 means real data, -1 means outlier.
                    real_mask = iso_forest.predict(data) == 1
                    suc_rate = np.sum(real_mask) / len(data)
                temp_result.append(suc_rate)

                new_IDS = data[real_mask]
                temp_lens.append(len(new_IDS))
                np.save(f'natural_IDS/{method_name}/{dataset_name}{sens}_{model}_{method_name}.npy', new_IDS)

            new_data_lens.append(temp_lens)
            result_info.append(temp_result)
            print(method_names)
            print(temp_result)

    result_info = [[np.round(x, 3) for x in row] for row in result_info]
    df = pd.DataFrame(result_info, columns=column_names, index=row_index)
    df.to_csv(result_path, index=True)
    new_data_lens = pd.DataFrame(new_data_lens, columns=column_names, index=row_index)
    new_data_lens.to_csv(lens_path, index=True)


