In [None]:
# Step 1: 导入常用库
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.metrics import recall_score, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Step 2: load the data
# The credit-card default CSV is read straight from its public GitHub location.
url = "https://raw.githubusercontent.com/JamesLo94/schulich_data_science/main/credit_card_default/UCI_Credit_Card.csv"
df = pd.read_csv(filepath_or_buffer=url)


In [15]:
# Show the loaded frame; as the cell's last expression it is rendered by the notebook.
df

Unnamed: 0,ID,LIMIT_BAL,SEX,EDUCATION,MARRIAGE,AGE,PAY_0,PAY_2,PAY_3,PAY_4,...,BILL_AMT4,BILL_AMT5,BILL_AMT6,PAY_AMT1,PAY_AMT2,PAY_AMT3,PAY_AMT4,PAY_AMT5,PAY_AMT6,default.payment.next.month
0,1,20000.0,2,2,1,24,2,2,-1,-1,...,0.0,0.0,0.0,0.0,689.0,0.0,0.0,0.0,0.0,1
1,2,120000.0,2,2,2,26,-1,2,0,0,...,3272.0,3455.0,3261.0,0.0,1000.0,1000.0,1000.0,0.0,2000.0,1
2,3,90000.0,2,2,2,34,0,0,0,0,...,14331.0,14948.0,15549.0,1518.0,1500.0,1000.0,1000.0,1000.0,5000.0,0
3,4,50000.0,2,2,1,37,0,0,0,0,...,28314.0,28959.0,29547.0,2000.0,2019.0,1200.0,1100.0,1069.0,1000.0,0
4,5,50000.0,1,2,1,57,-1,0,-1,0,...,20940.0,19146.0,19131.0,2000.0,36681.0,10000.0,9000.0,689.0,679.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,29996,220000.0,1,3,1,39,0,0,0,0,...,88004.0,31237.0,15980.0,8500.0,20000.0,5003.0,3047.0,5000.0,1000.0,0
29996,29997,150000.0,1,3,2,43,-1,-1,-1,-1,...,8979.0,5190.0,0.0,1837.0,3526.0,8998.0,129.0,0.0,0.0,0
29997,29998,30000.0,1,2,2,37,4,3,2,-1,...,20878.0,20582.0,19357.0,0.0,0.0,22000.0,4200.0,2000.0,3100.0,1
29998,29999,80000.0,1,3,1,41,1,-1,0,0,...,52774.0,11855.0,48944.0,85900.0,3409.0,1178.0,1926.0,52964.0,1804.0,1


In [16]:
# Print the column names, non-null counts and dtypes of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30000 entries, 0 to 29999
Data columns (total 25 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   ID                          30000 non-null  int64  
 1   LIMIT_BAL                   30000 non-null  float64
 2   SEX                         30000 non-null  int64  
 3   EDUCATION                   30000 non-null  int64  
 4   MARRIAGE                    30000 non-null  int64  
 5   AGE                         30000 non-null  int64  
 6   PAY_0                       30000 non-null  int64  
 7   PAY_2                       30000 non-null  int64  
 8   PAY_3                       30000 non-null  int64  
 9   PAY_4                       30000 non-null  int64  
 10  PAY_5                       30000 non-null  int64  
 11  PAY_6                       30000 non-null  int64  
 12  BILL_AMT1                   30000 non-null  float64
 13  BILL_AMT2                   300

In [17]:
# Step 3: first look at the data
# Preview the first rows, then report the frame's dimensions and the number of
# missing values in each column.
print("数据基本情况：", df.head(), sep="\n")
print("数据维度：", df.shape)
print("每列缺失值：\n", df.isna().sum())

数据基本情况：
   ID  LIMIT_BAL  SEX  EDUCATION  MARRIAGE  AGE  PAY_0  PAY_2  PAY_3  PAY_4  \
0   1    20000.0    2          2         1   24      2      2     -1     -1   
1   2   120000.0    2          2         2   26     -1      2      0      0   
2   3    90000.0    2          2         2   34      0      0      0      0   
3   4    50000.0    2          2         1   37      0      0      0      0   
4   5    50000.0    1          2         1   57     -1      0     -1      0   

   ...  BILL_AMT4  BILL_AMT5  BILL_AMT6  PAY_AMT1  PAY_AMT2  PAY_AMT3  \
0  ...        0.0        0.0        0.0       0.0     689.0       0.0   
1  ...     3272.0     3455.0     3261.0       0.0    1000.0    1000.0   
2  ...    14331.0    14948.0    15549.0    1518.0    1500.0    1000.0   
3  ...    28314.0    28959.0    29547.0    2000.0    2019.0    1200.0   
4  ...    20940.0    19146.0    19131.0    2000.0   36681.0   10000.0   

   PAY_AMT4  PAY_AMT5  PAY_AMT6  default.payment.next.month  
0       0.0     

In [5]:
# 3. Data cleaning
# Work on a copy of the loaded frame so `df` stays unmodified and re-running
# this cell always starts from the same input.
data = df.copy()

# EDUCATION: keep codes 1-4 and fold 0, 5 and 6 into category 4.
# Any code not listed in the mapping becomes NaN.
data['EDUCATION'] = data['EDUCATION'].map({1:1, 2:2, 3:3, 4:4, 5:4, 6:4, 0:4})
# MARRIAGE: keep codes 1-3 and fold 0 into category 3 (unlisted codes become NaN).
data['MARRIAGE'] = data['MARRIAGE'].map({1:1, 2:2, 3:3, 0:3})
# PAY_X: replace every negative status code with 0; non-negative codes are kept.
pay_cols = ['PAY_0','PAY_2','PAY_3','PAY_4','PAY_5','PAY_6']
data[pay_cols] = data[pay_cols].clip(lower=0)

In [6]:
# 4. Feature engineering
# Financial feature: BILL_AMT1 divided by the credit limit. A limit of 0 is
# swapped for 1 so the division cannot produce inf.
data['UTILIZATION_RATIO'] = data['BILL_AMT1'] / data['LIMIT_BAL'].replace(0, 1)
# Behavioural feature: largest value across the repayment-status columns.
data['MAX_DELAY'] = data[pay_cols].max(axis=1)
# Trend feature: slope of a degree-1 least-squares fit through the six bill
# amounts, with BILL_AMT1 at x=0 and BILL_AMT6 at x=5.
bill_cols = ['BILL_AMT1','BILL_AMT2','BILL_AMT3','BILL_AMT4','BILL_AMT5','BILL_AMT6']
x = np.arange(len(bill_cols))
# np.polyfit accepts a 2-D `y` holding one dataset per column, so all rows are
# fitted in a single call instead of one Python-level polyfit call per row.
# Row 0 of the returned coefficients is the slope for each dataset.
data['BILL_TREND'] = np.polyfit(x, data[bill_cols].to_numpy(dtype=float).T, 1)[0]

In [7]:
# 5. Preprocessing
# NOTE(review): the scaler below is fitted on every row, and the train/test
# split only happens in a later cell, so test-set statistics influence the
# scaling. Consider fitting the scaler on the training split only.
cat_cols = ['SEX','EDUCATION','MARRIAGE']
# Drop the ID column, then one-hot encode the categorical columns
# (the first level of each is dropped).
data = pd.get_dummies(data.drop(columns='ID'), columns=cat_cols, drop_first=True)
# Standardize the credit limit, age, bill amounts and payment amounts.
num_cols = [
    'LIMIT_BAL', 'AGE',
    *bill_cols,
    'PAY_AMT1', 'PAY_AMT2', 'PAY_AMT3', 'PAY_AMT4', 'PAY_AMT5', 'PAY_AMT6',
]
scaler = StandardScaler()
data[num_cols] = scaler.fit_transform(data[num_cols])

In [8]:
# 6. Train/test split
# Separate the target column from the feature columns.
X = data.drop('default.payment.next.month', axis=1)
y = data['default.payment.next.month']
# Hold out 20% of the rows, stratified on the target. The fixed random_state
# makes the split, and every metric computed from it, reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [9]:
# 7. Modelling and evaluation (example: logistic regression)
# LogisticRegression, roc_auc_score and recall_score come from the notebook's
# top import cell.
lr = LogisticRegression(class_weight='balanced', max_iter=1000)
lr.fit(X_train, y_train)
# AUC is computed from the predicted probability of class 1; recall is
# computed from the hard class predictions.
probs = lr.predict_proba(X_test)[:,1]
preds = lr.predict(X_test)
print(f"AUC: {roc_auc_score(y_test, probs):.4f}")
print(f"Recall: {recall_score(y_test, preds):.4f}")


AUC: 0.7701
Recall: 0.6240


In [10]:
# 8. Comparison model: random forest
# RandomForestClassifier is already imported in the notebook's first cell.
# The fixed random_state makes the fitted forest, and the feature importances
# read from it, reproducible across runs.
rf = RandomForestClassifier(class_weight='balanced_subsample', random_state=42)
rf.fit(X_train, y_train)

In [11]:
# 9. Feature importance analysis
# Label the forest's importances with the feature names and list them from
# most to least important.
pd.Series(
    data=rf.feature_importances_,
    index=X.columns,
).sort_values(ascending=False)

UTILIZATION_RATIO    0.061174
MAX_DELAY            0.060505
PAY_0                0.060439
AGE                  0.057080
BILL_TREND           0.056479
BILL_AMT1            0.051083
LIMIT_BAL            0.050883
PAY_AMT1             0.047094
BILL_AMT2            0.046128
PAY_AMT2             0.045379
BILL_AMT3            0.045166
BILL_AMT4            0.045151
PAY_AMT3             0.042963
BILL_AMT5            0.042855
PAY_AMT6             0.042828
BILL_AMT6            0.042610
PAY_AMT4             0.041794
PAY_AMT5             0.038801
PAY_2                0.026898
PAY_3                0.020168
PAY_4                0.014880
PAY_5                0.010897
SEX_2                0.010214
MARRIAGE_2           0.009917
EDUCATION_2          0.009663
PAY_6                0.008326
EDUCATION_3          0.007186
MARRIAGE_3           0.001727
EDUCATION_4          0.001709
dtype: float64