# 主要是框架内部的处理类

写一个类

新的 idea：

1. 将 Male 样本分出来后继续划分：把 Male 且标签为 1 的样本单独分为一组；在该组中查找 sex 特征的 shapley_value，如果存在明显较大的值，则把对应样本的 Male 改为 Female，作为新增样本加入训练集后重新训练
2. 将 Female 样本分出来后继续划分：把 Female 且标签为 0 的样本单独分为一组；在该组中查找 sex 特征的 shapley_value，如果存在明显较小的值，则把对应样本的 Female 改为 Male，作为新增样本加入训练集后重新训练


![svg](assets/2024-06-07-流程图-v2.drawio.svg)

## 数据预处理部分


In [1]:
# %%
"""数据集预处理类"""

from collections import namedtuple
import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

import os
import logging
from rich.logging import RichHandler

Group = namedtuple("Group", ["scale_name", "origin_name", "X", "y"])


# %%
class DataPreProcessor:
    """Load a CSV dataset, clean it and expose one-hot encoded features.

    After construction the instance exposes:

    * ``df`` - the frame read from disk, with ``"?"`` replaced by NaN and the
      ``columns_mapper`` value mappings applied;
    * ``X``  - every column except ``label``, passed through ``pd.get_dummies``;
    * ``y``  - the ``label`` column.

    ``_split_Xy`` and ``_group_Xy`` are follow-up steps that ``__init__`` does
    not run; call them explicitly when a split or per-group view is needed.
    """

    def __init__(
        self,
        *,
        dataset_name: str,
        data_path: str,
        sensitive_names: list[str],
        label: str,
        columns_mapper: dict[str, dict],
        np_seed: int,
        log_level: int,
    ) -> None:
        """Read the CSV at ``data_path`` and build ``self.X`` / ``self.y``.

        Args:
            dataset_name: Free-form name stored on the instance.
            data_path: Path to a ``.csv`` file.
            sensitive_names: Columns used as group keys by ``_group_Xy``.
            label: Name of the target column.
            columns_mapper: ``{column: {raw_value: mapped_value}}``; each
                listed column is replaced by its mapped values.
            np_seed: Seed for NumPy's global RNG, also used by ``_split_Xy``.
            log_level: ``logging`` level; ``None`` falls back to ``INFO``.
        """
        self.dataset_name = dataset_name
        # Seed NumPy's global RNG and keep the seed for the train/test split.
        np.random.seed(np_seed)
        self.seed = np_seed
        self.sensitive_names = sensitive_names
        # The logger must exist before _load_df, which logs the frame shape.
        self._init_logger(level=log_level)
        self._load_df(data_path)
        self.label = label
        self.columns_mapper = columns_mapper
        # Clean + map the raw frame, then one-hot encode the features.
        self._init_df_Xy_mapper()
        self._init_Xy_dummy()

    def _init_Xy_dummy(self):
        """One-hot encode ``self.X`` in place of the raw feature frame."""
        self.X = pd.get_dummies(self.X)

    def _init_logger(self, logger_name="default_dataset", level=None) -> None:
        """Configure root logging with a ``RichHandler`` and create ``self.log``.

        ``level`` applies to the named logger only; ``None`` means ``INFO``.
        """
        if level is None:
            level = logging.INFO
        log_format = "%(message)s"
        logging.basicConfig(
            level="INFO", format=log_format, datefmt="[%X]", handlers=[RichHandler()]
        )
        self.log = logging.getLogger(logger_name)
        self.log.setLevel(level)

    def _load_df(self, data_path: str):
        """Read ``data_path`` into ``self.df``.

        Raises:
            FileNotFoundError: ``data_path`` does not exist.
            TypeError: ``data_path`` does not have a ``.csv`` extension.
        """
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"{data_path} 文件路径不存在")
        # Match the real extension, so that e.g. "datacsv" is rejected.
        if not data_path.endswith(".csv"):
            raise TypeError("文件类型错误，应该是 csv 文件")
        self.df = pd.read_csv(data_path, encoding="latin-1")
        self.log.debug(f"df 包含 {self.df.shape[0]} 行数据，{self.df.shape[1]} 列")

    def _init_df_Xy_mapper(self):
        """Clean ``self.df`` and derive ``self.X`` (features) and ``self.y`` (label).

        ``"?"`` cells become NaN, then every column listed in
        ``self.columns_mapper`` is replaced by its mapped values (values that
        are missing from the mapping become NaN, as with ``Series.map``).
        """
        label = self.label
        self.df.replace("?", np.nan, inplace=True)
        for column, mapper in self.columns_mapper.items():
            self.df[column] = self.df[column].map(mapper)
        self.X = self.df.drop(label, axis=1)
        self.y = self.df[label]

    def _split_Xy(self, X, y, test_size=0.3):
        """Split ``X`` / ``y`` into train and test parts stored on the instance.

        Sets ``X_train``, ``X_test``, ``y_train`` and ``y_test``.

        Raises:
            ValueError: ``self.seed`` is ``None``, so the split would not be
                reproducible.
        """
        if self.seed is None:
            # ValueError instead of a bare BaseException: it is still caught by
            # `except BaseException`, but also by ordinary `except Exception`.
            raise ValueError("self.seed 未定义")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            X, y, test_size=test_size, random_state=self.seed
        )

    def _group_Xy(self, X, y):
        """Group ``X`` by ``self.sensitive_names`` and pair each group with its labels.

        Returns a list of ``Group`` tuples (also stored in ``self.groups``);
        the group key is used for both ``scale_name`` and ``origin_name``.
        """
        # .loc makes the label-based lookup explicit for any index type.
        result = [
            Group(name, name, data, y.loc[data.index])
            for name, data in X.groupby(self.sensitive_names)
        ]
        self.groups = result
        return result

In [2]:
# Experiment configuration for the Adult income dataset.
np_seed = 42
test_size = 0.3
dataset_name = "adult"
data_path = "../input/adult.csv"
label = "income"
sensitive_names = ["sex"]
# Raw string categories -> integer codes applied by the preprocessor.
columns_mapper = {
    "income": {"<=50K": 0, ">50K": 1},
    "sex": {"Male": 1, "Female": 0},
}

# Load and preprocess the data (log_level=None means the INFO default).
p = DataPreProcessor(
    np_seed=np_seed,
    log_level=None,
    dataset_name=dataset_name,
    data_path=data_path,
    label=label,
    sensitive_names=sensitive_names,
    columns_mapper=columns_mapper,
)

In [3]:
import xgboost

# Pull the encoded features and the mapped label out of the preprocessor,
# then show their shapes as the cell output.
X, y = p.X, p.y
X.shape, y.shape

((32561, 104), (32561,))

In [4]:
# Preview the first rows of the one-hot encoded feature matrix.
X.head()

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
0,90,77053,9,0,0,4356,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1,82,132870,9,0,0,4356,18,False,False,False,...,False,False,False,False,False,False,False,True,False,False
2,66,186061,10,0,0,4356,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,54,140359,4,0,0,3900,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False
4,41,264663,10,0,0,3900,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [5]:
# Split the dataset: hold out `test_size` of the rows, shuffled with a fixed seed.
split_parts = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=np_seed,
    shuffle=True,
)
X_train, X_test, y_train, y_test = split_parts

In [6]:
# Preview the first rows of the training features (index labels are shuffled).
X_train.head()

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
19749,58,290661,9,1,0,0,40,False,False,False,...,False,False,False,False,False,False,False,True,False,False
1216,62,109463,10,0,0,1617,33,False,False,False,...,False,False,False,False,False,False,False,True,False,False
27962,33,137088,13,1,0,0,40,False,False,False,...,False,False,False,False,False,False,False,False,False,False
23077,24,117767,12,1,0,0,20,False,False,False,...,False,False,False,False,False,False,False,True,False,False
10180,67,431426,9,0,0,0,2,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [7]:
# Baseline: a default XGBoost classifier fitted on the training split,
# then used to predict the held-out test rows.
model = xgboost.XGBClassifier().fit(X_train, y_train)
y_pred = model.predict(X_test)

In [8]:
# Sensitive attribute ("sex" column) of the test rows, kept aligned with X_test's index.
sensitive_features = X_test["sex"]
sensitive_features

14160    1
27048    1
28868    0
5667     1
7827     0
        ..
32476    0
21100    1
27131    1
25526    1
21385    1
Name: sex, Length: 9769, dtype: int64

In [9]:
# IPython magic: load (or reload) the autoreload extension.
%reload_ext autoreload
from utils.FairMetric_0605 import FairMetric

# Score the baseline predictions; `fm.metrics` is displayed as the cell output.
fm = FairMetric("adult", y_true=y_test, y_pred=y_pred, sensitive_features=sensitive_features)
fm.eval_metrics()
fm.metrics

准确度(1)      0.870918
精确度(1)      0.779389
召回率(1)      0.643162
f1分数(1)     0.704753
AUC分数(1)    0.792910
DP差异(0)     0.185713
DP比率(1)     0.282786
EO差异(0)     0.099638
EO比率(1)     0.218000
Name: adult, dtype: float64

In [10]:
# Locate the advantaged (Male, label 1) and disadvantaged (Female, label 0)
# subsets of the training split.
# The masks are built from X_train / y_train so they share the training index
# exactly, instead of relying on pandas re-aligning a mask built on the full data.
male_y1_index = (X_train['sex'] == 1) & (y_train == 1)
male_y1 = X_train.loc[male_y1_index]

female_y0_index = (X_train['sex'] == 0) & (y_train == 0)
female_y0 = X_train.loc[female_y0_index]

In [11]:
# Quick sanity check: each subset should contain a single sex value and a single label.
for subset, mask in ((male_y1, male_y1_index), (female_y0, female_y0_index)):
    print(subset['sex'].value_counts(), y_train[mask].value_counts())

sex
1    4652
Name: count, dtype: int64 income
1    4652
Name: count, dtype: int64
sex
0    6699
Name: count, dtype: int64 income
0    6699
Name: count, dtype: int64


In [15]:
import shap
def get_shapley_values(X, model):
    """Return the SHAP values of ``model`` on ``X`` as a DataFrame.

    The result reuses ``X``'s index and column names, so it can be sliced
    with the same row selectors as ``X``. Assumes the explainer yields a
    2-D (rows x features) array of values.
    """
    explanation = shap.Explainer(model)(X)
    return pd.DataFrame(explanation.values, index=X.index, columns=X.columns)
# SHAP values of the fitted `model` for every training row.
shapley_values = get_shapley_values(X_train, model)

In [16]:
# Preview the per-feature SHAP values of the first training rows.
shapley_values.head()

Unnamed: 0,age,fnlwgt,education.num,sex,capital.gain,capital.loss,hours.per.week,workclass_Federal-gov,workclass_Local-gov,workclass_Never-worked,...,native.country_Portugal,native.country_Puerto-Rico,native.country_Scotland,native.country_South,native.country_Taiwan,native.country_Thailand,native.country_Trinadad&Tobago,native.country_United-States,native.country_Vietnam,native.country_Yugoslavia
19749,0.484842,-0.228446,-0.373211,0.066091,-0.1519,-0.064256,-0.026789,-0.029827,0.012762,0.0,...,0.0,0.001327,0.0,0.0,-0.00042,0.0,0.0,0.021753,0.001019,-0.00048
1216,0.478362,-0.138005,-0.213564,-0.603941,-0.130865,-2.648266,-0.882516,-0.023052,0.003886,0.0,...,0.0,0.001471,0.0,0.0,-0.00042,0.0,0.0,0.002083,0.000933,-0.000479
27962,-0.293473,-0.031952,0.700457,0.044355,-0.198254,-0.095064,-0.079898,-0.017346,0.00611,0.0,...,0.0,0.001892,0.0,0.0,-0.002735,0.0,0.0,-0.289931,0.001018,-0.000479
23077,-1.785897,-0.044517,0.085909,0.122137,-0.126202,-0.058107,-0.902482,-0.016421,-0.002605,0.0,...,0.0,0.001336,0.0,0.0,-9.6e-05,0.0,0.0,0.016154,0.001018,-0.000479
10180,-0.627592,0.178091,-0.516186,-0.28031,-0.129516,-0.08409,-0.577171,-0.015572,0.004121,0.0,...,0.0,0.001101,0.0,0.0,-0.000391,0.0,0.0,-0.003176,0.001018,-0.00048


In [20]:
# Restrict the SHAP values to the two subgroups using the boolean masks.
male_y1_shap_values = shapley_values.loc[male_y1_index]
female_y0_shap_values = shapley_values.loc[female_y0_index]

In [45]:
# Sanity check: every training row selected by male_y1_index should have sex == 1.
X_train.loc[male_y1_index]['sex'].value_counts()

sex
1    4652
Name: count, dtype: int64

In [100]:
def f_select(
    *,
    percent=0.03,
    df_sv: pd.DataFrame = None,
    X: pd.DataFrame = None,
    y: pd.Series = None,
    ascending=True,
    changed_label=None,
    feature="sex",
):
    """Pick the rows ranked first by one feature's SHAP value and flip that feature.

    Args:
        percent: Fraction of ``df_sv``'s rows to select; the count is
            ``round(len(df_sv) * percent)``.
        df_sv: SHAP values of the candidate rows, indexed like ``X`` / ``y``.
        X: Feature frame the selected rows are taken from.
        y: Labels, indexed like ``X``.
        ascending: ``True`` selects the smallest SHAP values of ``feature``,
            ``False`` the largest.
        changed_label: Value written into ``feature`` for every selected row.
        feature: Column used both for ranking in ``df_sv`` and for the
            overwrite in ``X``; defaults to ``"sex"``.

    Returns:
        ``(select_X, select_y)``: modified copies of the selected feature rows
        and their unchanged labels. ``X`` itself is never modified.
    """
    ranked = df_sv.sort_values(feature, ascending=ascending)
    num = round(ranked.shape[0] * percent)
    idx = ranked.head(num).index
    # Copy explicitly so the overwrite below can neither touch the caller's
    # frame nor trigger a SettingWithCopyWarning.
    select_X = X.loc[idx].copy()
    select_X[feature] = changed_label
    select_y = y.loc[idx]
    return select_X, select_y

percent = 0.5
# Arguments shared by both selections.
shared_kwargs = dict(percent=percent, X=X_train, y=y_train)

# NOTE(review): with the default ascending=True the male call keeps the rows with the
# *smallest* "sex" SHAP values, while ascending=False keeps the *largest* for females —
# confirm this is the intended selection rule.
# Male / label-1 rows, re-labelled as Female (sex = 0).
X_origin_male, y_origin_male = f_select(
    df_sv=male_y1_shap_values,
    changed_label=0,
    **shared_kwargs,
)
# Female / label-0 rows, re-labelled as Male (sex = 1).
X_origin_female, y_origin_female = f_select(
    df_sv=female_y0_shap_values,
    ascending=False,
    changed_label=1,
    **shared_kwargs,
)

In [101]:
# Size of the original training split, for comparison with the augmented one.
X_train.shape, y_train.shape

((22792, 104), (22792,))

In [102]:
# Append the flipped-sex copies to the training split. The copies keep their
# original index labels, so the combined index contains duplicates.
augmented_X_parts = [X_train, X_origin_male, X_origin_female]
augmented_y_parts = [y_train, y_origin_male, y_origin_female]
X_train_new = pd.concat(augmented_X_parts)
y_train_new = pd.concat(augmented_y_parts)
X_train_new.shape, y_train_new.shape

((28468, 104), (28468,))

In [103]:
# Shape of the augmented training features.
X_train_new.shape

(28468, 104)

In [104]:
# Retrain a default XGBoost classifier on the augmented training set and
# predict the same held-out test rows as the baseline.
model_new = xgboost.XGBClassifier().fit(X_train_new, y_train_new)
y_pred_new = model_new.predict(X_test)


In [105]:
# Score the retrained model on the same test labels and sensitive attribute as the baseline.
fm_new = FairMetric("adult", y_true=y_test, y_pred=y_pred_new, sensitive_features=sensitive_features)
fm_new.eval_metrics()
fm_new.metrics

准确度(1)      0.871635
精确度(1)      0.778747
召回率(1)      0.648291
f1分数(1)     0.707556
AUC分数(1)    0.795137
DP差异(0)     0.183217
DP比率(1)     0.294923
EO差异(0)     0.077386
EO比率(1)     0.229350
Name: adult, dtype: float64

In [106]:
# Show the baseline metrics again for side-by-side comparison with fm_new.metrics.
fm.metrics

准确度(1)      0.870918
精确度(1)      0.779389
召回率(1)      0.643162
f1分数(1)     0.704753
AUC分数(1)    0.792910
DP差异(0)     0.185713
DP比率(1)     0.282786
EO差异(0)     0.099638
EO比率(1)     0.218000
Name: adult, dtype: float64