## 导包

In [21]:
import warnings

warnings.filterwarnings("ignore")

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

## 数据分析

In [26]:
# The file is read with header=None, so pandas labels the columns 0..14;
# the list below supplies the name for each position, in order.
column_names = [
    'age',
    'workclass',
    'fnlwgt',
    'education',
    'education-num',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
    'native-country',
    'income',
]

# Map each positional label to its descriptive name right after loading.
train_df = pd.read_csv('adult.data', header=None).rename(
    columns=dict(enumerate(column_names))
)
train_df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [29]:
# Collect the names of all object-dtype (text) columns ...
object_columns = [
    name for name, dtype in train_df.dtypes.items() if dtype == 'object'
]
# ... and trim leading/trailing whitespace from each of their values.
for column in object_columns:
    train_df[column] = train_df[column].str.strip()

In [31]:
# Data cleaning
# 1. Drop the columns that are not used as features.
#    NOTE(review): 'education' appears to be duplicated by the numeric
#    'education-num' column - confirm before relying on that.
train_df = train_df.drop(columns=['fnlwgt', 'education'])

In [32]:
# Encode the income label as a binary target: '<=50K' -> 0, '>50K' -> 1.
# Series.map turns every value missing from the mapping into NaN, so the
# result is checked before it overwrites the column. Without the check an
# unexpected label - or simply re-running this cell on the already-encoded
# column - would silently replace the target with NaN.
income_mapping = {'<=50K': 0, '>50K': 1}
income_encoded = train_df['income'].map(income_mapping)
unmapped_labels = train_df.loc[income_encoded.isna(), 'income'].unique()
if len(unmapped_labels) > 0:
    raise ValueError(
        "Unexpected 'income' values (was this cell already run?): "
        f"{list(unmapped_labels)}"
    )
train_df['income'] = income_encoded

In [33]:
# Keep only the numeric columns (this includes 'income', which holds 0/1
# after the label mapping in the preceding cell).
numeric_features = train_df.select_dtypes(include='number')
# Pairwise correlation between every pair of numeric columns.
correlation_matrix = numeric_features.corr()
correlation_matrix

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income
age,1.0,0.036527,0.077674,0.057775,0.068756,0.234037
education-num,0.036527,1.0,0.12263,0.079923,0.148123,0.335154
capital-gain,0.077674,0.12263,1.0,-0.031615,0.078409,0.223329
capital-loss,0.057775,0.079923,-0.031615,1.0,0.054256,0.150526
hours-per-week,0.068756,0.148123,0.078409,0.054256,1.0,0.229689
income,0.234037,0.335154,0.223329,0.150526,0.229689,1.0


In [34]:
# Integer-encode every remaining object-dtype (categorical text) column.
object_columns = train_df.select_dtypes(include=['object']).columns.tolist()

# One fitted encoder is kept per column so the integer codes can later be
# mapped back to their labels (inverse_transform) or applied to other data
# (transform). Refitting a single shared encoder would discard the mapping
# of every column except the last one.
label_encoders = {}
for label in object_columns:
    # `le` is still bound on each iteration, so after the loop it refers to
    # the encoder of the last column, as before.
    le = LabelEncoder()
    train_df[label] = le.fit_transform(train_df[label])
    label_encoders[label] = le

In [35]:
# Display the dtype of every column to check the result of the encoding step.
train_df.dtypes

age               int64
workclass         int64
education-num     int64
marital-status    int64
occupation        int64
relationship      int64
race              int64
sex               int64
capital-gain      int64
capital-loss      int64
hours-per-week    int64
native-country    int64
income            int64
dtype: object

## 模型训练

In [36]:
# Separate the feature matrix from the 'income' target column.
X = train_df.drop(columns=['income'])
y = train_df['income']

# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [40]:
# Fit a logistic-regression classifier on the training split; fit() returns
# the estimator itself, so construction and training are chained.
model = LogisticRegression(C=1.0, max_iter=1000).fit(X_train, y_train)

# Predict on the held-out validation split and report accuracy to 4 decimals.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f'{accuracy:.4f}')

0.8250
