In [21]:
"""数据集预处理

主要是为了 2024-05-08-shap-knn-v1.0.ipynb 准备数据。
"""

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
from pathlib import Path

In [3]:
# Resolve the location of the input CSV.
# When this code runs as a script, `__file__` exists and the path is built
# relative to it; inside a notebook kernel `__file__` is normally undefined,
# so the lookup raises NameError and the path is built relative to the
# current working directory instead.
# NOTE(review): the ".." segments are appended to the file path itself (not
# to its parent directory) - confirm this resolves as intended on the target OS.
try:
    file_path = Path(__file__)
    data_path = file_path.joinpath("..", "..", "..", "input", "adult.csv")
except NameError:
    # Only the missing `__file__` case is expected here; any other error
    # should surface instead of being silently swallowed.
    file_path = Path(".")
    data_path = file_path.joinpath("..", "input", "adult.csv")
# Load the raw data
df = pd.read_csv(data_path, encoding="latin-1")
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,income
0,90,?,77053,HS-grad,9,Widowed,?,Not-in-family,White,Female,0,4356,40,United-States,<=50K
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States,<=50K
2,66,?,186061,Some-college,10,Widowed,?,Unmarried,Black,Female,0,4356,40,United-States,<=50K
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States,<=50K
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States,<=50K


In [4]:
# Build the binary target: 1 where income is '>50K', 0 otherwise.
# Multiplying the boolean mask by 1 turns False/True into 0/1.
y = df['income'].eq('>50K') * 1
# A possible alternative (assuming the column only holds these two values):
# df['income'].map({"<=50K": 0, ">50K": 1})
y

0        0
1        0
2        0
3        0
4        0
        ..
32556    0
32557    0
32558    1
32559    0
32560    0
Name: income, Length: 32561, dtype: int32

In [5]:
# Feature matrix: every column except the 'income' label.
X_v0 = df.drop(columns=['income'])

In [6]:
# Per-column count of cells that hold the '?' placeholder.
X_v0.eq('?').sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
dtype: int64

In [7]:
# Replace the '?' placeholder with NaN and report the total count of '?'
# cells before and after the replacement.
print('替换 ? 前：', X_v0.eq('?').sum().sum())
X_v1 = X_v0.replace(to_replace='?', value=np.nan)
print('替换 ? 后：', X_v1.eq('?').sum().sum())

替换 ? 前： 4262
替换 ? 后： 0


In [8]:
# List the object-dtype (string) columns of the cleaned feature frame.
X_v1.select_dtypes(include='object').columns

Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country'],
      dtype='object')

In [9]:
# NOTE(review): the original comment here said "fill with the mode", but no
# imputation is performed - X_v2 is a plain copy of X_v1 and still holds NaN.
X_v2 = X_v1.copy()
# Show the first rows that contain at least one NaN
X_v2.loc[X_v2.isna().any(axis=1)].head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,
14,51,,172175,Doctorate,16,Never-married,,Not-in-family,White,Male,0,2824,40,United-States
18,22,Private,119592,Assoc-acdm,12,Never-married,Handlers-cleaners,Not-in-family,Black,Male,0,2824,40,


In [10]:
# Preview the first ten rows of the working feature frame.
X_v2.iloc[:10]

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,90,,77053,HS-grad,9,Widowed,,Not-in-family,White,Female,0,4356,40,United-States
1,82,Private,132870,HS-grad,9,Widowed,Exec-managerial,Not-in-family,White,Female,0,4356,18,United-States
2,66,,186061,Some-college,10,Widowed,,Unmarried,Black,Female,0,4356,40,United-States
3,54,Private,140359,7th-8th,4,Divorced,Machine-op-inspct,Unmarried,White,Female,0,3900,40,United-States
4,41,Private,264663,Some-college,10,Separated,Prof-specialty,Own-child,White,Female,0,3900,40,United-States
5,34,Private,216864,HS-grad,9,Divorced,Other-service,Unmarried,White,Female,0,3770,45,United-States
6,38,Private,150601,10th,6,Separated,Adm-clerical,Unmarried,White,Male,0,3770,40,United-States
7,74,State-gov,88638,Doctorate,16,Never-married,Prof-specialty,Other-relative,White,Female,0,3683,20,United-States
8,68,Federal-gov,422013,HS-grad,9,Divorced,Prof-specialty,Not-in-family,White,Female,0,3683,40,United-States
9,41,Private,70037,Some-college,10,Never-married,Craft-repair,Unmarried,White,Male,0,3004,60,


In [11]:
# Object-dtype columns of the raw frame once the 'income' label is removed.
df.drop(columns='income').select_dtypes(include="object").columns

Index(['workclass', 'education', 'marital.status', 'occupation',
       'relationship', 'race', 'sex', 'native.country'],
      dtype='object')

In [41]:
def m1_label_scaler(X: pd.DataFrame, y: pd.Series, sensi_names: list[str]):
    """Split the data set, label-encode object columns, then standardize.

    Steps:
      1. ``train_test_split`` with ``test_size=0.3`` and ``random_state=0``.
      2. Every object-dtype column of ``X`` is label-encoded; the encoder is
         fitted on the training split and applied to the test split.
      3. All columns (including the encoded ones) are standardized with a
         ``StandardScaler`` fitted on the training split.

    Parameters
    ----------
    X : pd.DataFrame
        Feature frame.
    y : pd.Series
        Labels, passed to ``train_test_split`` together with ``X``.
    sensi_names : list[str]
        Names of the sensitive columns, e.g. ``["sex"]``.

    Returns
    -------
    X_train, X_test : pd.DataFrame
        Encoded and standardized splits; columns and index are preserved.
    y_train, y_test : pd.Series
        Label splits as returned by ``train_test_split``.
    names_map_scaler : dict
        Maps each raw combination of sensitive values found in the training
        split (a tuple, one entry per name in ``sensi_names``) to the tuple of
        its standardized values, ordered by descending frequency.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )
    # Work on explicit copies: columns are assigned below, and assigning into
    # the frames returned by the split triggers SettingWithCopyWarning.
    X_train = X_train.copy()
    X_test = X_test.copy()
    # Raw sensitive values, row-aligned with X_train, kept for the mapping.
    origin_sensi = X_train[sensi_names].copy()

    # Label-encode the categorical columns.
    # NOTE(review): LabelEncoder.transform raises ValueError when the test
    # split contains a category that never appears in the training split.
    categorical = X.select_dtypes("object").columns
    for feature in categorical:
        le = preprocessing.LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_test[feature] = le.transform(X_test[feature])

    # Standardize; columns and index are passed explicitly so the resulting
    # frames keep the layout of the inputs.
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )

    # Pair raw and standardized sensitive values row by row. Counting the
    # combined (raw, scaled) rows keeps every pair consistent even when a
    # sensitive column contains NaN, which zipping two independent
    # value_counts() results does not guarantee.
    paired = pd.concat(
        [origin_sensi, X_train[sensi_names].add_suffix("__scaled")], axis=1
    )
    n_keys = len(sensi_names)
    names_map_scaler = {
        combo[:n_keys]: combo[n_keys:]
        for combo in paired.value_counts(dropna=False).index
    }

    return X_train, X_test, y_train, y_test, names_map_scaler


# Sensitive attribute(s) used throughout the rest of the notebook.
sensi_names = ["sex"]
# Split, encode and standardize the working frame.
(
    X_train,
    X_test,
    y_train,
    y_test,
    name_map_scaler,
) = m1_label_scaler(X=X_v2, y=y, sensi_names=sensi_names)

In [42]:
def group_df(X, y, features):
    """Split ``X`` into groups by ``features`` and attach each group's labels.

    Parameters
    ----------
    X : pd.DataFrame
        Frame to group.
    y : pd.Series
        Labels, indexed with the row index of each group of ``X``.
    features : list[str]
        Column name(s) handed to ``X.groupby``.

    Returns
    -------
    list of (name, data, label) tuples, one per group, where ``name`` is the
    group key yielded by ``groupby``, ``data`` the rows of that group and
    ``label`` the entries of ``y`` selected by ``data.index``.
    """
    return [
        (name, data, y[data.index])
        for name, data in X.groupby(features)
    ]

# Group the training split by the sensitive attribute(s).
result = group_df(X=X_train, y=y, features=sensi_names)
# Key of the first group
result[0][0]

<class 'tuple'>
<class 'tuple'>


(-1.4304699646272252,)

## 单个函数

In [49]:
from collections import namedtuple
import logging
from rich.logging import RichHandler

def logger_factory(logger_name="rich"):
    """Configure root logging with a ``RichHandler`` and return a named logger.

    ``logging.basicConfig`` is invoked on every call with level ``NOTSET``,
    a message-only format and the ``[%X]`` date format; the logger named
    ``logger_name`` is then fetched and returned.
    """
    rich_handler = RichHandler()
    logging.basicConfig(
        handlers=[rich_handler],
        datefmt="[%X]",
        format="%(message)s",
        level="NOTSET",
    )
    logger = logging.getLogger(logger_name)
    return logger


def handle(
    *,
    data_path: str | Path,
    label_name: str,
    label_mapper: dict,

    sensi_names: list[str],
    missing_marker: str | None = "?",
) -> tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series, dict]:
    """Load a CSV data set, build labels and return standardized splits.

    Parameters
    ----------
    data_path : str | Path
        Location of the CSV file (read with ``latin-1`` encoding).
    label_name : str
        Name of the label column.
    label_mapper : dict
        Mapping applied to the label column via ``Series.map``; label values
        without an entry become NaN (a warning is logged when that happens).
    sensi_names : list[str]
        Names of the sensitive columns, forwarded to ``m1_label_scaler``.
    missing_marker : str | None
        Placeholder that marks missing cells and is replaced with NaN.
        Defaults to ``"?"``; pass ``None`` to skip the replacement.

    Returns
    -------
    (X_train, X_test, y_train, y_test, name_map_scaler)
        Exactly what ``m1_label_scaler`` returns.

    Raises
    ------
    FileNotFoundError
        If ``data_path`` does not exist.
    """
    log = logger_factory()
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"{data_path} 文件不存在")
    log.info(f"{data_path}文件导入成功")
    df = pd.read_csv(data_path, encoding="latin-1")
    log.info("文件读取成功，输出基本信息")
    log.info(f"df 包含 {df.shape[0]} 行数据，{df.shape[1]} 列")
    log.info("开始处理标签")
    y = df[label_name].map(label_mapper)
    # Series.map yields NaN for values absent from the mapping; make that
    # visible instead of silently passing NaN labels downstream.
    unmapped = df.loc[y.isna() & df[label_name].notna(), label_name].unique()
    if len(unmapped) > 0:
        log.warning(
            f"label_mapper 未覆盖以下标签值，对应行的标签为 NaN：{list(unmapped)}"
        )
    log.info(y.value_counts())
    log.info("开始处理数据集")
    features = df.drop(label_name, axis=1)
    # The missing-value placeholder can differ between data sets, hence the
    # `missing_marker` parameter.
    if missing_marker is not None:
        features = features.replace(missing_marker, np.nan)
    # Split, encode and standardize
    X_train, X_test, y_train, y_test, name_map_scaler = m1_label_scaler(
        features, y, sensi_names
    )
    log.info("标准化处理成功")
    log.info(f"X_train 包含 {X_train.shape[0]} 行数据，{X_train.shape[1]} 列")
    log.info(f"X_test 包含 {X_test.shape[0]} 行数据，{X_test.shape[1]} 列")
    log.info(f"敏感属性对应值 \n{name_map_scaler}" )
    return X_train, X_test, y_train, y_test, name_map_scaler


# Run the whole pipeline on the adult data set.
# The label mapper must cover both income values: with an empty mapping
# `Series.map` turns every label into NaN.
handle(
    data_path="../input/adult.csv",
    label_name="income",
    label_mapper={"<=50K": 0, ">50K": 1},
    sensi_names=["sex"],
)

(            age  workclass    fnlwgt  education  education.num  \
 32098  0.101484   1.645998 -1.494279  -0.332263       1.133894   
 25206  0.028248  -1.501124  0.438778   0.184396      -0.423425   
 23491  0.247956  -0.242275  0.045292   1.217715      -0.034095   
 12367 -0.850587  -1.501124  0.793152   0.184396      -0.423425   
 7054  -0.044989  -2.130548 -0.853275   0.442726       1.523223   
 ...         ...        ...       ...        ...            ...   
 13123  3.763293   1.016573  0.870243   1.217715      -0.034095   
 19648 -0.191461  -0.242275  0.847831   0.184396      -0.423425   
 9845  -0.923823  -0.242275 -1.302317  -2.140570      -0.812755   
 10799  0.394429  -0.242275 -0.704154   0.442726       1.523223   
 2732   0.028248  -0.242275  0.326815   0.184396      -0.423425   
 
        marital.status  occupation  relationship      race       sex  \
 32098       -0.402341   -0.782920      2.214196  0.392980 -1.430470   
 25206       -0.402341   -0.090314     -0.899410  