In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import confusion_matrix, roc_curve, auc, ConfusionMatrixDisplay
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset


In [3]:

def load_adult_data():
    # 列名
    col_names = [
        "age", "workclass", "fnlwgt", "education", "education-num", "marital-status",
        "occupation", "relationship", "race", "sex", "capital-gain", "capital-loss",
        "hours-per-week", "native-country", "income"
    ]

    # 训练集
    train_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data"
    df_train = pd.read_csv(train_url, header=None, names=col_names, na_values=" ?", skipinitialspace=True)

    # 测试集
    test_url = "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test"
    df_test = pd.read_csv(test_url, header=0, names=col_names, na_values=" ?", skipinitialspace=True)

    # 去掉标签里的 "." 符号
    df_test["income"] = df_test["income"].str.replace(".", "", regex=False).str.strip()

    return df_train, df_test

In [4]:
df_train, df_test = load_adult_data()


In [5]:
df_train.sample(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
30547,18,Private,137646,11th,7,Never-married,Other-service,Own-child,White,Female,0,0,15,United-States,<=50K
29887,25,Private,180778,HS-grad,9,Never-married,Craft-repair,Not-in-family,White,Male,0,0,75,United-States,<=50K
3416,39,Local-gov,305597,HS-grad,9,Separated,Exec-managerial,Unmarried,White,Male,0,0,40,United-States,<=50K
5419,28,Private,183780,HS-grad,9,Married-civ-spouse,Transport-moving,Husband,White,Male,0,1740,40,United-States,<=50K
23744,55,?,170994,HS-grad,9,Never-married,?,Not-in-family,White,Female,0,0,40,United-States,<=50K


In [7]:
def preprocess_data(df_train, df_test):
    # 分离特征与标签
    X_train, y_train = df_train.drop("income", axis=1), df_train["income"]
    X_test, y_test = df_test.drop("income", axis=1), df_test["income"]

    # 转换标签为 0/1
    y_train = y_train.apply(lambda v: 1 if v.strip() == ">50K" else 0).values
    y_test = y_test.apply(lambda v: 1 if v.strip() == ">50K" else 0).values

    # 数值/类别特征
    numeric_features = ["age", "fnlwgt", "education-num", "capital-gain", "capital-loss", "hours-per-week"]
    categorical_features = ["workclass", "education", "marital-status", "occupation",
                            "relationship", "race", "sex", "native-country"]

    # 预处理管道
    numeric_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler())
    ])
    categorical_transformer = Pipeline([
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore"))
    ])
    preprocessor = ColumnTransformer([
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

    # 拟合 & 转换
    X_train_processed = preprocessor.fit_transform(X_train)
    X_test_processed = preprocessor.transform(X_test)

    return X_train_processed, y_train, X_test_processed, y_test

In [8]:
X_train_processed, y_train, X_test_processed, y_test = preprocess_data(df_train, df_test)

In [9]:
X_train_processed.shape, y_train.shape, X_test_processed.shape, y_test.shape

((32561, 108), (32561,), (16281, 108), (16281,))

In [10]:
from sklearn.model_selection import train_test_split

X_train_processed, X_val_processed, y_train, y_val = train_test_split(X_train_processed, y_train, test_size=0.2, random_state=42)


In [11]:
X_val_processed.shape, y_val.shape

((6513, 108), (6513,))