# SHAP-FAIR 框架内部的处理类

本笔记本实现 `ShapFairFrameworkGroup` 类：在单个敏感属性分组内，基于 SHAP 值筛选出候选样本，并通过与"公平样本"近邻的标签比较找出可能不公平的样本。

## 数据预处理部分

In [1]:
# 重新加载模块
%load_ext autoreload
%autoreload 2
from exp.utils.DataPreProcessor_0605 import DataPreProcessor, Group 

# Sensitive attribute(s) used for grouping, and the mapping from raw income strings to 0/1 labels.
sensitive_names = ["sex"]
income_label_mapper = {"<=50K": 0, ">50K": 1}

# Load the Adult dataset and run the project-specific preprocessing.
processor = DataPreProcessor(
    label="income",
    label_mapper=income_label_mapper,
    sensitive_names=sensitive_names,
    data_path="../input/adult.csv",
)

{(0.703665851855809,): ('Male',), (-1.4211290733558493,): ('Female',)}


## 模型训练与预测

In [2]:
import xgboost

# Train / test splits exposed by the processor, plus the sensitive column(s) of the test split.
X_train, y_train = processor.X_train_label_scale, processor.y_train
X_test, y_test = processor.X_test_label_scale, processor.y_test
sensitive_indexes = processor.X_test[sensitive_names]

# Fit the baseline classifier with default hyper-parameters and predict on the test split.
model = xgboost.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [3]:
from exp.utils.FairMetric_0605 import FairMetric

# Score the baseline predictions; the last expression renders the metric Series.
fm = FairMetric(
    "XGBClassifier-baseline",
    y_test,
    y_pred,
    sensitive_features=sensitive_indexes,
)
fm.metrics

准确度(1)         0.871635
精确度(1)         0.772044
召回率(1)         0.658547
f1分数(1)        0.710793
AUC分数(1)       0.798650
TPR(1)      1541.000000
TNR(1)      6974.000000
FPR(0)       455.000000
FNR(0)       799.000000
DP差异(0)        0.186382
DP比率(1)        0.298818
EO差异(0)        0.069172
EO比率(1)        0.215589
Name: XGBClassifier-baseline, dtype: float64

In [4]:
import shap
import pandas as pd

def get_shapley_values(X, model):
    """Return the SHAP values of ``model`` on ``X`` as a DataFrame.

    The result reuses ``X``'s columns and index, so every cell lines up with
    the corresponding sample/feature of ``X``.
    """
    explanation = shap.Explainer(model)(X)
    return pd.DataFrame(
        data=explanation.values,
        columns=X.columns,
        index=X.index,
    )
# SHAP values for the training split (X_train is processor.X_train_label_scale).
shapley_values = get_shapley_values(X_train, model)
shapley_values.head()


IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html


Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
19749,0.494813,0.026526,-0.213601,-0.045938,-0.376381,0.463009,-0.021303,0.509124,0.029389,0.075267,-0.155306,-0.064937,-0.049951,0.02353
1216,0.500095,-0.002224,-0.207415,-0.189268,-0.220711,-0.490526,0.12301,-0.486618,0.022525,-0.38151,-0.13295,-2.609746,-0.836442,0.001223
27962,-0.201751,0.024517,-0.071968,0.033012,0.767833,0.451731,-0.474881,0.655555,-0.308522,0.038492,-0.198145,-0.099592,-0.055016,-0.609599
23077,-1.82628,0.035445,-0.060712,-0.215791,0.111234,-1.321697,0.256745,-0.766208,-0.008487,0.073554,-0.147942,-0.066919,-0.916901,0.023987
10180,-0.303005,0.05249,0.2859,-0.134943,-0.460785,0.547875,0.05778,1.367415,0.068808,-0.1713,-0.102331,-0.071025,-1.030011,0.015273


## SHAP-FAIR 框架

![图片](./assets/2024-05-27-流程图.drawio.svg)

In [5]:
# Pick one group to walk through the framework; entry 1 of `processor.grouped` is the male group here.
group: Group = processor.grouped[1]
X, y = group.X, group.y
(group.origin_name, X.shape, y.shape, sensitive_indexes)

(('Male',),
 (15244, 14),
 (15244,),
           sex
 14160    Male
 27048    Male
 28868  Female
 5667     Male
 7827   Female
 ...       ...
 32476  Female
 21100    Male
 27131    Male
 25526    Male
 21385    Male
 
 [9769 rows x 1 columns])

### 个体间-SHAP距离

Shapley Distance between Individual and Individual（$F$ 为参与比较的特征集合，$\phi_i^f$ 表示样本 $i$ 在特征 $f$ 上的 Shapley 值）

$$
{\rm SDII}(F, \phi_i, \phi_j) = \sum_{f\in F}\left|\phi_i^f-\phi_j^f\right|
$$

### 个体与组-SHAP距离 

Shapley Distance between Individual and Group（$\phi$ 为组内全部样本的 Shapley 值集合，$m=|\phi|$ 为组内样本数；注意：下方 `sdig` 的实现只在敏感特征上求和，并以 $m$ 而非 $m-1$ 作归一化）

$$
{\rm SDIG}(F, \phi_i, \phi) = \frac{1}{m-1}\sum_{\phi_j \in \phi,\ j\neq i}{\rm SDII}(F, \phi_i, \phi_j)
$$

In [324]:
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
import logging
import pandas as pd
import os
from sklearn.neighbors import NearestNeighbors

logging.basicConfig(level=logging.INFO)


class ShapFairFrameworkGroup:
    """SHAP-based screening of possibly unfair labels inside one sensitive group.

    Typical call order (every step stores its result on the instance):

    1. ``sdgg``                    -> ``df_sdgg``: per-sample SHAP distance to the group.
    2. ``get_candidates_fair_X``   -> ``X_candidates`` / ``X_fair``: threshold split.
    3. ``get_df_neighbors_by_knn`` -> ``df_neighbors``: fair neighbours of each candidate.
    4. ``get_unfair_idx``          -> ``unfair_idx``: candidates whose label disagrees
       with the most common label among their fair neighbours.
    """

    def __init__(
        self,
        X: pd.DataFrame,
        y: pd.Series,
        shapley_values: pd.DataFrame,
        sensitive_names: list[str],
    ):
        """Store the group's data.

        Args:
            X: Feature matrix of the group.
            y: Labels of the group, indexed like ``X``.
            shapley_values: SHAP values with the same shape as ``X``.
            sensitive_names: Column names of the sensitive features.

        Raises:
            ValueError: If ``X`` and ``shapley_values`` differ in shape.
        """
        if X.shape != shapley_values.shape:
            # ValueError (a BaseException subclass) so that ordinary
            # `except Exception` handlers can see the problem.
            raise ValueError("X 和 shapley values 的 shape 不匹配")

        self.X = X
        self.y = y
        self.shapley_values = shapley_values
        self.sensitive_names = sensitive_names
        # One row per sample, one column per sensitive feature; filled by sdig/sdgg.
        self.df_sdgg = pd.DataFrame(columns=sensitive_names)

    def sdii(self):
        """Placeholder for the individual-to-individual SHAP distance (not implemented)."""

    def sdig(self, xi_idx):
        """Compute the individual-to-group SHAP distance of one sample and store it.

        For every sensitive feature, sums the absolute difference between the
        sample's Shapley value and that of every sample in the group, then
        divides by the group size. The row is written to ``df_sdgg.loc[xi_idx]``.

        Note: the normalisation uses ``m`` (group size, the sample itself
        contributes 0 to the sum) rather than ``m - 1``.

        Args:
            xi_idx: Index label of the sample in ``shapley_values``.
        """
        sv = self.shapley_values
        sv_feat_i = sv.loc[xi_idx, self.sensitive_names]
        sv_feat_group = sv.loc[:, self.sensitive_names]
        # Series - DataFrame aligns on the column labels, i.e. broadcasts over rows.
        result = np.abs(sv_feat_i - sv_feat_group).sum(axis=0)
        self.df_sdgg.loc[xi_idx] = result / len(sv_feat_group)

    def sdgg(self, path: str = None):
        """Fill ``df_sdgg`` for every sample, optionally using a CSV cache.

        Args:
            path: Optional CSV cache file. If it exists, it is loaded instead
                of recomputing. Otherwise the distances are computed and, when
                ``path`` is given, written there (the parent directory is
                created if needed).
        """
        if path is not None and os.path.exists(path):
            self.df_sdgg = pd.read_csv(path, index_col=0)
            print("读取缓存: ", path)
            return

        for idx in tqdm(self.shapley_values.index):
            self.sdig(idx)

        # Only persist when a cache location was actually requested.
        if path is not None:
            cache_dir = os.path.dirname(path)
            if cache_dir:
                os.makedirs(cache_dir, exist_ok=True)
            self.df_sdgg.to_csv(path, index=True)

    def show_sigg_hist(self, sensitive_name):
        """Plot a histogram of the stored distances for one sensitive feature."""
        df = self.df_sdgg
        _, _, bars = plt.hist(df[sensitive_name], edgecolor="white")
        plt.bar_label(bars, fontsize=10, color="navy")
        plt.margins(x=0.01, y=0.1)
        plt.xlabel(f"SDIG ({sensitive_name})")
        plt.ylabel("count")
        plt.show()

    def get_candidates_fair_X(self, sensitive_name, threshold):
        """Split the group's samples into candidates and fair samples.

        Samples whose distance on ``sensitive_name`` is strictly greater than
        ``threshold`` become ``X_candidates``; those less than or equal to it
        become ``X_fair``.
        """
        df = self.df_sdgg
        candidates_idx = df[df[sensitive_name] > threshold].index
        fair_idx = df[df[sensitive_name] <= threshold].index
        # Use the instance's own data, not a notebook-level global.
        self.X_fair = self.X.loc[fair_idx]
        self.X_candidates = self.X.loc[candidates_idx]
        print(f"统计: 大于 threshold {len(self.X_candidates)} 个, 小于 {len(self.X_fair)} 个")

    def get_df_neighbors_by_knn(self, knn_k=7):
        """Find, for every candidate, its ``knn_k`` nearest fair samples.

        Stores ``df_neighbors``: indexed by candidate label, each row holding
        the index labels of that candidate's nearest neighbours in ``X_fair``.
        The raw positional result is kept in ``idxs`` for debugging.
        """
        if len(self.X_candidates) == 0:
            # kneighbors cannot be queried with zero samples; expose an empty result.
            self.idxs = np.empty((0, knn_k), dtype=int)
            self.df_neighbors = pd.DataFrame(self.idxs, index=self.X_candidates.index)
            return

        knn = NearestNeighbors(n_neighbors=knn_k)
        knn.fit(self.X_fair)
        # idxs holds positions into X_fair, shape (n_candidates, knn_k).
        _, idxs = knn.kneighbors(self.X_candidates)
        self.idxs = idxs  # kept for debugging
        # Translate positions to index labels with NumPy fancy indexing, which
        # keeps the (n_candidates, knn_k) layout and the candidate index.
        fair_labels = self.X_fair.index.to_numpy()
        self.df_neighbors = pd.DataFrame(fair_labels[idxs], index=self.X_candidates.index)

    def get_unfair_idx(self):
        """Collect candidates whose label differs from their neighbours' modal label.

        Stores the result in ``unfair_idx`` (a Series of index labels) and
        prints a short summary.
        """
        unfair_idx = []
        ne = self.df_neighbors
        for i in ne.index:
            neighbors = ne.loc[i].values
            # Label of the candidate itself.
            candidate_label = self.y.loc[i]
            # Most frequent label among its fair neighbours (ties: first value returned by mode()).
            neighbors_mode_label = self.y.loc[neighbors].mode().values[0]

            if candidate_label != neighbors_mode_label:
                unfair_idx.append(i)

        self.unfair_idx = pd.Series(unfair_idx)
        # Guard against an empty candidate set.
        ratio = len(unfair_idx) / len(ne) * 100 if len(ne) else 0.0
        print(f"候选者 {len(ne)} 个, 不公平者 {len(unfair_idx)} 个, 比例 {ratio:.2f}%")


In [325]:
## 无关小测试
def test_svgg_1():
    xi_idx = 19749
    sv_i_f = shapley_values_X.loc[xi_idx, sensitive_names]
    sv_f = shapley_values_X.loc[:, sensitive_names]
    np.abs(sv_i_f - sv_f).sum(axis=0)


In [328]:
# Cache file for the per-sample distances and the cut-off separating candidates from fair samples.
SDGG_CACHE_PATH = "pd-cache/2024-06-06-Male-test.csv"
SDIG_THRESHOLD = 0.1

# Restrict the SHAP values to the rows of the selected group.
shapley_values_X = shapley_values.loc[X.index]

sff = ShapFairFrameworkGroup(X, y, shapley_values_X, sensitive_names)
sff.sdgg(SDGG_CACHE_PATH)
sff.get_candidates_fair_X("sex", SDIG_THRESHOLD)
sff.get_df_neighbors_by_knn()
sff.get_unfair_idx()
sff.unfair_idx.values

读取缓存:  pd-cache/2024-06-06-Male-test.csv
统计: 大于 threshold 379 个, 小于 14865 个
候选者 379 个, 不公平者 99 个, 比例 26.12%


array([13857, 26890, 27533, 21707, 21737, 16266,  5333,  5088, 15917,
       28722,  3171, 11352,  4921, 20479, 16297, 31585,  7683, 31052,
       25056, 28753,   304, 14959,  8724, 14225,  8565, 10632,  6115,
       23543, 13881,   171, 26353, 23837, 11231, 21724,  9143,  4364,
       24391, 22709, 21183,    36, 15678,  6999,  1494,  7624, 27871,
       19466, 22667,  9386, 26401, 31323, 26692, 28255, 26996, 13436,
         524, 17458, 26185, 19289,   901, 27543,  6582,   917, 25613,
        8490,  7674, 20335,  1380, 23989, 14562, 24802,  4728,   159,
       32142, 32076, 28011, 29846, 21494,  6629, 14566, 19056,  3173,
        8712,  1007, 10745,  2343, 26361, 18449, 27632, 14153, 29721,
        8415, 23486,  7494, 21689, 12043, 20764, 25547,  7158,   197],
      dtype=int64)

In [304]:
# 无关小测试
def test_neighbors(ne):
    # ne = sff.df_neighbors
    cnt = 0
    for i in ne.index:
        neighbors = ne.loc[i].values
        candidate_label = y.loc[i]
        neighbors_mode_label = y.loc[neighbors].mode().values[0]
        if candidate_label != neighbors_mode_label:
            cnt += 1
    print(cnt)