# FairShap 框架

![svg](assets/2024-06-07-流程图-v2.drawio.svg)

## 数据预处理部分


In [5]:
from utils.dataPreProcessor_0608 import DataPreProcessor

        

In [6]:
# Experiment configuration for the "adult" dataset.
dataset_name = "adult"
data_path = "../input/adult.csv"
label="income"
sensitive_names = ["sex"]
# Per-column value mappings; presumably used by DataPreProcessor to encode
# the label and the sensitive attribute as 0/1 — confirm against its implementation.
columns_mapper = {
    "income": {"<=50K": 0, ">50K": 1},
    "sex": {"Male": 1, "Female": 0},
}
np_seed = 42
# NOTE(review): test_size is not passed to DataPreProcessor below, nor to the
# later p.get_split() call — confirm whether it is actually used.
test_size=0.3
# Data preprocessing


p = DataPreProcessor(
    dataset_name=dataset_name,
    data_path=data_path,
    sensitive_names=sensitive_names,
    label=label,
    columns_mapper=columns_mapper,
    np_seed=np_seed,
    log_level=None,
)

In [9]:
X_train, X_test, y_train, y_test = p.get_split()

In [11]:
X_train.head()

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
19749,58,290661,9,1,0,0,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1216,62,109463,10,0,0,1617,33,False,False,False,...,False,False,False,False,False,False,False,True,False,False
27962,33,137088,13,1,0,0,40,False,False,False,...,False,False,False,False,False,False,False,False,False,False
23077,24,117767,12,1,0,0,20,False,False,False,...,False,False,False,False,False,False,False,True,False,False
10180,67,431426,9,0,0,0,2,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [14]:
import xgboost
model = xgboost.XGBClassifier()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

In [28]:
sensitive_features = X_test["sex"]
sensitive_features

14160    1
27048    1
28868    0
5667     1
7827     0
        ..
32476    0
21100    1
27131    1
25526    1
21385    1
Name: sex, Length: 9769, dtype: int64

In [29]:
%reload_ext autoreload
from utils.FairMetric_0605 import FairMetric

fm = FairMetric("adult", y_true=y_test, y_pred=y_pred, sensitive_features=sensitive_features)
fm.eval_metrics()
fm.metrics

准确度(1)      0.870918
精确度(1)      0.779389
召回率(1)      0.643162
f1分数(1)     0.704753
AUC分数(1)    0.792910
DP差异(0)     0.185713
DP比率(1)     0.282786
EO差异(0)     0.099638
EO比率(1)     0.218000
Name: adult, dtype: float64

In [30]:
# Locate the advantaged set and the disadvantaged set in the training data.
# Note: despite the "_index" suffix, these are boolean masks aligned with X_train.
#   male_y1   — rows with sex == 1 and label == 1 (advantaged set)
#   female_y0 — rows with sex == 0 and label == 0 (disadvantaged set)
male_y1_index = ((X_train['sex'] == 1) & (y_train == 1))
male_y1 = X_train.loc[male_y1_index]

female_y0_index = ((X_train['sex'] == 0) & (y_train == 0))
female_y0 = X_train.loc[female_y0_index]

In [31]:
# Quick sanity check: each subset should show exactly one "sex" value and
# exactly one label value in the printed counts.
print(male_y1['sex'].value_counts(), y_train[male_y1_index].value_counts())
print(female_y0['sex'].value_counts(), y_train[female_y0_index].value_counts())

sex
1    4652
Name: count, dtype: int64 income
1    4652
Name: count, dtype: int64
sex
0    6699
Name: count, dtype: int64 income
0    6699
Name: count, dtype: int64


In [32]:
import shap
def get_shapley_values(X, model):
    """Return the SHAP values of ``model`` on ``X`` as a DataFrame.

    A ``shap.Explainer`` is built around ``model`` and evaluated on ``X``;
    the resulting value array is wrapped in a DataFrame that reuses the
    column labels and the row index of ``X``.
    """
    explanation = shap.Explainer(model)(X)
    return pd.DataFrame(
        explanation.values,
        index=X.index,
        columns=X.columns,
    )
shapley_values = get_shapley_values(X_train, model)

In [33]:
shapley_values.head()

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
19749,0.484842,-0.228446,-0.373211,0.066091,-0.1519,-0.064256,-0.026789,-0.029827,0.012762,0.0,...,0.0,0.001327,0.0,0.0,-0.00042,0.0,0.0,0.021753,0.001019,-0.00048
1216,0.478362,-0.138005,-0.213564,-0.603941,-0.130865,-2.648266,-0.882516,-0.023052,0.003886,0.0,...,0.0,0.001471,0.0,0.0,-0.00042,0.0,0.0,0.002083,0.000933,-0.000479
27962,-0.293473,-0.031952,0.700457,0.044355,-0.198254,-0.095064,-0.079898,-0.017346,0.00611,0.0,...,0.0,0.001892,0.0,0.0,-0.002735,0.0,0.0,-0.289931,0.001018,-0.000479
23077,-1.785897,-0.044517,0.085909,0.122137,-0.126202,-0.058107,-0.902482,-0.016421,-0.002605,0.0,...,0.0,0.001336,0.0,0.0,-9.6e-05,0.0,0.0,0.016154,0.001018,-0.000479
10180,-0.627592,0.178091,-0.516186,-0.28031,-0.129516,-0.08409,-0.577171,-0.015572,0.004121,0.0,...,0.0,0.001101,0.0,0.0,-0.000391,0.0,0.0,-0.003176,0.001018,-0.00048


In [34]:
male_y1_shap_values = shapley_values.loc[male_y1_index]
female_y0_shap_values = shapley_values.loc[female_y0_index]

In [35]:
X_train.loc[male_y1_index]['sex'].value_counts()

sex
1    4652
Name: count, dtype: int64

In [37]:
def f_select(
    *,
    percent=0.03,
    df_sv: pd.DataFrame = None,
    X: pd.DataFrame = None,
    y: pd.Series = None,
    ascending=True,
    changed_label=None,
    sensitive_name="sex",
):
    """Select rows ranked by a feature's SHAP value and relabel that feature.

    The rows of ``df_sv`` are sorted by the ``sensitive_name`` column, the
    first ``round(len(df_sv) * percent)`` of them are kept, and the matching
    rows of ``X`` and ``y`` are returned. In the returned copy of the feature
    rows, the ``sensitive_name`` column is overwritten with ``changed_label``.

    Args:
        percent: Fraction of the rows of ``df_sv`` to select.
        df_sv: SHAP values; must contain a ``sensitive_name`` column and share
            index labels with ``X`` and ``y``.
        X: Feature frame the selected rows are taken from. It is not modified.
        y: Labels the selected entries are taken from.
        ascending: Sort direction; ``True`` selects the smallest SHAP values,
            ``False`` the largest.
        changed_label: Value written into the ``sensitive_name`` column of the
            selected rows.
        sensitive_name: Column to rank by and to overwrite. Defaults to
            ``"sex"``.

    Returns:
        A ``(select_X, select_y)`` tuple holding the relabelled feature rows
        and their unchanged labels, both in ranking order.

    Raises:
        ValueError: If ``df_sv``, ``X`` or ``y`` is not provided.
    """
    if df_sv is None or X is None or y is None:
        raise ValueError("f_select requires df_sv, X and y")

    df_sorted = df_sv.sort_values(sensitive_name, ascending=ascending)
    num = round(df_sorted.shape[0] * percent)
    idx = df_sorted.head(num).index
    # Work on an explicit copy: assigning into the result of ``X.loc[idx]``
    # directly can raise SettingWithCopyWarning, and the caller's frame must
    # stay untouched.
    select_X = X.loc[idx].copy()
    select_X[sensitive_name] = changed_label
    select_y = y.loc[idx]
    return select_X, select_y

# Fraction of each group to select (overrides f_select's default of 0.03).
percent = 0.3
# From the sex == 1 / label == 1 group: take the rows with the smallest
# "sex" SHAP values (ascending sort is the default) and set their sex to 0.
X_origin_male, y_origin_male = f_select(
    percent=percent,
    df_sv=male_y1_shap_values, X=X_train, y=y_train, changed_label=0
)
# From the sex == 0 / label == 0 group: take the rows with the largest
# "sex" SHAP values (descending sort) and set their sex to 1.
X_origin_female, y_origin_female = f_select(
    percent=percent,
    df_sv=female_y0_shap_values, ascending=False, X=X_train, y=y_train, changed_label=1
)

In [38]:
X_train.shape, y_train.shape

((22792, 104), (22792,))

In [39]:
# Build the augmented training set by appending the relabelled rows to the
# original data. The appended rows keep their original index labels, so the
# concatenated index contains duplicate labels.
X_train_new = pd.concat([X_train, X_origin_male, X_origin_female])
y_train_new = pd.concat([y_train, y_origin_male, y_origin_female])
X_train_new.shape, y_train_new.shape

((26198, 104), (26198,))

In [40]:
X_train_new.shape

(26198, 104)

In [41]:
model_new = xgboost.XGBClassifier()
model_new.fit(X_train_new, y_train_new)
y_pred_new = model_new.predict(X_test)


In [44]:
fm_new = FairMetric("adult-new", y_true=y_test, y_pred=y_pred_new, sensitive_features=sensitive_features)
fm_new.eval_metrics()
fm_new.metrics

准确度(1)      0.872454
精确度(1)      0.781379
召回率(1)      0.649145
f1分数(1)     0.709150
AUC分数(1)    0.795969
DP差异(0)     0.182143
DP比率(1)     0.296990
EO差异(0)     0.071325
EO比率(1)     0.228215
Name: adult-new, dtype: float64

In [45]:
fm.metrics

准确度(1)      0.870918
精确度(1)      0.779389
召回率(1)      0.643162
f1分数(1)     0.704753
AUC分数(1)    0.792910
DP差异(0)     0.185713
DP比率(1)     0.282786
EO差异(0)     0.099638
EO比率(1)     0.218000
Name: adult, dtype: float64