## 导包

In [26]:
import warnings

warnings.filterwarnings("ignore")

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


In [27]:
def result_show(model, X_test, y_test, y_pred):
    """Print the confusion matrix and AUC of a classifier and plot its ROC curve.

    Args:
        model: Fitted estimator exposing ``predict_proba``.
        X_test: Feature matrix handed to ``model.predict_proba``.
        y_test: True labels corresponding to ``X_test``.
        y_pred: Predicted labels for ``X_test``; only used for the confusion matrix.

    Returns:
        None. Results are printed and the figure is displayed via ``plt.show()``.
    """
    # Confusion matrix of true vs. predicted labels.
    print("混淆矩阵:", confusion_matrix(y_test, y_pred), sep="\n")

    # The second predict_proba column is taken as the score for the ROC curve.
    positive_scores = model.predict_proba(X_test)[:, 1]
    false_pos_rate, true_pos_rate = roc_curve(y_test, positive_scores)[:2]
    roc_auc = auc(false_pos_rate, true_pos_rate)
    print(f'{roc_auc:.4f}')

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, 'b', label='AUC = %0.2f' % roc_auc)
    plt.legend(loc='lower right')
    plt.plot([0, 1], [0, 1], 'r--')  # diagonal reference line
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()


## 数据分析

In [28]:
# Read the training and test splits from the P2 data directory.
train_df, test_df = map(pd.read_csv, ('data/P2/train.csv', 'data/P2/test.csv'))

In [29]:
# Data cleaning
# 1. Drop the unused columns (the same two columns in both splits).
unused_columns = ['人口普查员序号', '教育程度']
train_df = train_df.drop(columns=unused_columns)
test_df = test_df.drop(columns=unused_columns)

In [32]:
# Encode the income label as 0/1.
# The dtype guard keeps this cell re-runnable: pushing an already-numeric column
# through the string->int dict a second time would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(train_df['收入']):
    train_df['收入'] = train_df['收入'].map({'<=50K': 0, '>50K': 1})
# Select the numeric features (this now includes the encoded income label).
numeric_features = train_df.select_dtypes(include=[np.number])
# Compute the pairwise correlation matrix of the numeric columns.
correlation_matrix = numeric_features.corr()
correlation_matrix

Unnamed: 0,年龄,教育年数,资本收益,资本支出,每周工作小时数,收入
年龄,1.0,0.038138,0.077736,0.05738,0.066875,0.231879
教育年数,0.038138,1.0,0.128177,0.078855,0.147866,0.336221
资本收益,0.077736,0.128177,1.0,-0.03131,0.080102,0.223558
资本支出,0.05738,0.078855,-0.03131,1.0,0.055381,0.149318
每周工作小时数,0.066875,0.147866,0.080102,0.055381,1.0,0.229942
收入,0.231879,0.336221,0.223558,0.149318,0.229942,1.0


In [None]:
# Encode every categorical (object-dtype) column as integer codes.
#
# Each encoder is fitted on the training frame only, and its category->code table
# is reused for the test frame, so a given category maps to the same integer in
# both frames. Fitting a separate encoder on the test frame numbers the categories
# independently, which shifts the codes whenever the two splits do not contain
# exactly the same set of values.
object_columns = train_df.select_dtypes(include=['object']).columns.tolist()
test_object_columns = set(test_df.select_dtypes(include=['object']).columns)
for label in object_columns:
    le = LabelEncoder()
    train_df[label] = le.fit_transform(train_df[label])
    if label in test_object_columns:
        code_by_category = {category: code for code, category in enumerate(le.classes_)}
        # Categories that never occurred in the training data get the sentinel code -1.
        test_df[label] = test_df[label].map(code_by_category).fillna(-1).astype('int64')

# Object columns that remain in the test frame have no training encoder to reuse,
# so they are encoded on their own, as before.
object_columns = test_df.select_dtypes(include=['object']).columns.tolist()
for label in object_columns:
    le = LabelEncoder()
    test_df[label] = le.fit_transform(test_df[label])

## 模型训练

In [None]:
# Separate the feature columns from the income label.
# (train_test_split is imported in the notebook's top import cell.)
X = train_df.drop(columns=['收入'])
y = train_df['收入']

# Hold out 20% of the rows for validation; the fixed random_state makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Fit a logistic-regression classifier on the training split.
model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Score the validation split, then print the confusion matrix / AUC and plot the ROC curve.
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f'{accuracy:.4f}')
result_show(model, X_val, y_val, y_pred)

## 测试

In [None]:
# Predict the test frame and turn the 0/1 predictions back into income label strings.
y_pred = model.predict(test_df)
y_pred_labels = np.where(y_pred == 0, '<=50K', '>50K').tolist()

# Reference labels are read from the first column of data/P2/true.csv.
true_label = pd.read_csv('data/P2/true.csv')
true_label_list = true_label.iloc[:, 0].to_list()
len(true_label_list), len(y_pred_labels)

# The metric functions are imported in the notebook's top import cell.

# Make sure the predicted and true label lists have the same length.
assert len(y_pred_labels) == len(true_label_list), "预测标签和真实标签的长度不匹配"

# Compute the metrics on the 1-D label list extracted from the first column of
# true.csv rather than on the whole DataFrame, so the scores do not depend on
# scikit-learn flattening a 2-D frame. '>50K' is treated as the positive class.
accuracy = accuracy_score(true_label_list, y_pred_labels)
precision = precision_score(true_label_list, y_pred_labels, pos_label='>50K')
recall = recall_score(true_label_list, y_pred_labels, pos_label='>50K')
f1 = f1_score(true_label_list, y_pred_labels, pos_label='>50K')

# Print the metrics.
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

| 准确率 | Precision |
| --- | --- |
| 0.8146 | 0.6912 |