In [1]:
"""数据集预处理类"""

import numpy as np
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os
from pathlib import Path
import logging
from rich.logging import RichHandler

In [2]:
def normalize_label_scaler(X: pd.DataFrame, y: pd.Series, sensi_names: list[str]):
    """将数据集先分割，然后数据缩放
    sensi_feat_names: list[str]
    比如 sensi_feat_names = ["sex"]
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, random_state=0
    )
    origin_names = X_train[sensi_names].value_counts().index

    # 数据分类
    # 类别映射关系
    categorical_map = {}
    categorical = X.select_dtypes("object").columns
    for feature in categorical:
        le = preprocessing.LabelEncoder()
        X_train[feature] = le.fit_transform(X_train[feature])
        X_test[feature] = le.transform(X_test[feature])
        for cl in le.classes_:
            categorical_map.update({cl: le.transform([cl])[0]})
    # 映射关系
    # print(categorical_map)
    # 数据缩放（需要加上 columns 和 index 参数，这样才可以保证和原来的一样）
    scaler = StandardScaler()
    X_train = pd.DataFrame(
        scaler.fit_transform(X_train), columns=X.columns, index=X_train.index
    )
    X_test = pd.DataFrame(
        scaler.transform(X_test), columns=X.columns, index=X_test.index
    )
    handle_names = X_train[sensi_names].value_counts().index
    names_map_scaler = {}
    for key, value in zip(origin_names, handle_names):
        names_map_scaler[key] = value
    names_map_scaler
    # 统计保护属性的 index
    # index_attrs = list(map(lambda f: X.columns.get_loc(f), sensi_feat_names))
    # extra["sensi_feat_indexes"] = index_attrs

    return X_train, X_test, y_train, y_test, names_map_scaler


def logger_factory(logger_name="rich", level=logging.INFO):
    FORMAT = "%(message)s"
    logging.basicConfig(
        level="NOSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
    )
    logger = logging.getLogger(logger_name)
    logger.setLevel(level)
    return logger


def handle(
    *,
    data_path: str | Path,
    label_name: str,
    label_mapper: dict,
    sensi_names: list[str],
) -> list[tuple[tuple, pd.DataFrame, pd.Series]]:
    log = logger_factory()
    if not os.path.exists(data_path):
        raise FileNotFoundError(f"{data_path} 文件不存在")
    log.debug(f"{data_path}文件导入成功")
    df = pd.read_csv(data_path, encoding="latin-1")
    log.info("文件读取成功，输出基本信息")
    log.info(f"df 包含 {df.shape[0]} 行数据，{df.shape[1]} 列")
    log.info("开始处理标签")
    y = df[label_name].map(label_mapper)
    log.info(y.value_counts())
    log.info("开始处理数据集")
    X_v0 = df.drop(label_name, axis=1)
    # todo 这里不同的数据集可能不一样
    X_v1 = X_v0.replace("?", np.nan)
    X_v2 = X_v1.copy()
    # 标准化处理数据集
    X_train, X_test, y_train, y_test, name_map_scaler = normalize_label_scaler(
        X_v2, y, sensi_names
    )
    log.info("标准化处理成功")
    log.info(f"X_train 包含 {X_train.shape[0]} 行数据，{X_train.shape[1]} 列")
    log.info(f"X_test 包含 {X_test.shape[0]} 行数据，{X_test.shape[1]} 列")
    log.info(f"敏感属性对应值 \n{name_map_scaler}")
    # 数据集分组
    return X_train, X_test, y_train, y_test, name_map_scaler


handle(
    data_path="../input/adult.csv",
    label_name="income",
    label_mapper={},
    sensi_names=["sex"],
)
None

ValueError: Unknown level: 'NOSET'

In [24]:
class DataProcessorV1:
    def __init__(
        self,
        data_path: str,
        *,
        sensi_names: list[str] = None,
        np_seed: int = 42,
        level: int = None,
    ) -> None:
        # 初始化随机数种子
        np.random.seed(np_seed)
        self.seed = np_seed
        self.sensi_names = sensi_names
        # 初始化日志
        self.init_logger(level=level)
        # 初始化 df
        self.init_df(data_path)

    def init_logger(self, logger_name="default_dataset", level=None) -> None:
        if level is None:
            level = logging.INFO
        FORMAT = "%(message)s"
        logging.basicConfig(
            level="NOSET", format=FORMAT, datefmt="[%X]", handlers=[RichHandler()]
        )
        self.logger = logging.getLogger(logger_name)
        self.logger.setLevel(level)

    def init_df(self, data_path: str):
        if not os.path.exists(data_path):
            raise FileNotFoundError(f"{data_path} 文件路径不存在")
        if not data_path.endswith("csv"):
            raise TypeError("文件类型错误，应该是 csv 文件")
        self.df = pd.read_csv(data_path, encoding="latin-1")
        self.logger.debug(f"df 包含 {self.df.shape[0]} 行数据，{self.df.shape[1]} 列")

    def init_Xy(self, label, mapper):
        self.X = self.df.drop(label, axis=1)
        self.y = self.df[label].map(mapper)
        self.X.replace("?", np.nan)
        self.split_Xy()
        self.label_X()

    def split_Xy(self, test_size=0.3):
        if self.seed is None:
            raise BaseException("self.seed 未定义")
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(
            self.X, self.y, test_size=test_size, random_state=self.seed
        )

    def label_X(self):
        X_object_columns = self.X.select_dtypes("object").columns
        self.X_train_label = self.X_train.copy()
        self.X_test_label = self.X_test.copy()
        for feature in X_object_columns:
            le = preprocessing.LabelEncoder()
            self.X_train_label[feature] = le.fit_transform(self.X_train[feature])
            self.X_test_label[feature] = le.transform(self.X_test[feature])

    def scaler_X(self):
        if self.sensi_names is None:
            raise BaseException("self.sensi_names 未定义")
        # 获得原始名字
        origin_names = self.X_train[self.sensi_names].value_counts().index

        X_columns = self.X_train.columns
        # 数据缩放（需要加上 columns 和 index 参数，这样才可以保证和原来的一样）
        scaler = StandardScaler()
        self.X_train_label_scale = pd.DataFrame(
            scaler.fit_transform(self.X_train_label),
            columns=X_columns,
            index=self.X_train.index,
        )
        self.X_test_label_scale = pd.DataFrame(
            scaler.transform(self.X_test_label),
            columns=X_columns,
            index=self.X_test.index,
        )
        # 映射 原始名 -> 数字
        handle_names = self.X_train_label_scale[self.sensi_names].value_counts().index
        self.names_map_scaler = {}
        for key, value in zip(origin_names, handle_names):
            self.names_map_scaler[key] = value


processor = DataProcessorV1("../input/adult.csv", sensi_names=["sex", "race"])
processor.init_Xy("income", {"<=50K": 0, ">50K": 1})
processor.X_train_label
processor.scaler_X()
processor.X_train_label_scale
processor.X_test_label_scale
processor.names_map_scaler

int