In [4]:
!pip install skrub
!pip install xgboost
!pip install catboost




In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from skrub import TableVectorizer
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_auc_score, recall_score, precision_score, confusion_matrix
import joblib
import warnings

# Notebook-wide setup: silence every warning and use seaborn's white-grid theme.
warnings.filterwarnings('ignore')
sns.set_style("whitegrid")

# --- Configuration ---
# Consultant upgrade: explicitly list the columns that must be removed
# before modelling, grouped by the reason for removal.
LEAKAGE_COLS = [
    'grade',
    'sub_grade',
    'int_rate',
    'Predictions',
    'Predicted probabilities',
]
SENSITIVE_COLS = [
    'zip_code',
    'Pct_afro_american',
]
METADATA_COLS = [
    'issue_d',
    'emp_title',
    'Unnamed: 0',
]

# Load the raw dataset and report its (rows, columns) shape.
df = pd.read_csv('../data/raw/dataproject2025.csv')
print(f"Original Shape: {df.shape}")

Original Shape: (1086236, 39)


In [6]:
# 1. Convert emp_length to a number (keeps the natural ordering)
def parse_emp_length(x, default=0):
    """Convert an employment-length label into an integer number of years.

    Parameters
    ----------
    x : any
        Raw value. It is stringified before parsing, so NaN / None are accepted.
    default : int, optional
        Value returned when no leading integer can be parsed from ``x``
        (missing or malformed input). Defaults to 0, which matches the
        historical behaviour of this function.

    Returns
    -------
    int
        0 if the text contains ``'< 1'``, 10 if it contains ``'10+'``,
        otherwise the first whitespace-separated token converted with
        ``int``; ``default`` if that conversion is not possible.

    Notes
    -----
    NOTE(review): with the default of 0, missing values are indistinguishable
    from '< 1' entries — pass a different ``default`` if that matters.
    """
    text = str(x)
    if '< 1' in text:
        return 0
    if '10+' in text:
        return 10
    try:
        return int(text.split()[0])
    except (ValueError, IndexError):
        # ValueError: first token is not an integer (e.g. 'nan', 'None').
        # IndexError: empty / whitespace-only text has no first token.
        # Only these are caught so that KeyboardInterrupt and genuine bugs
        # are no longer swallowed silently.
        return default

# Add the numeric employment-length feature alongside the raw text column.
df['emp_length_num'] = df['emp_length'].apply(parse_emp_length)

# 2. Drop the blacklisted features
# The raw 'emp_length' column is dropped as well, now that 'emp_length_num'
# has been derived from it. Only names actually present in the frame are
# kept, so drop() cannot fail on a missing column.
cols_to_drop = [
    name
    for name in (*LEAKAGE_COLS, *SENSITIVE_COLS, *METADATA_COLS, 'emp_length')
    if name in df.columns
]

df_clean = df.drop(columns=cols_to_drop)
print(f"Cleaned Shape: {df_clean.shape}")
print(f"Columns Removed: {cols_to_drop}")

# 3. Prepare the label (y) and the feature matrix (X)
y = df_clean['target']
X = df_clean.drop(columns=['target'])

Cleaned Shape: (1086236, 29)
Columns Removed: ['grade', 'sub_grade', 'int_rate', 'Predictions', 'Predicted probabilities', 'zip_code', 'Pct_afro_american', 'issue_d', 'emp_title', 'Unnamed: 0', 'emp_length']


In [None]:
# Extract the issue year used for the time-based split.
# errors='coerce' turns any value that is not numeric into NaN.
issue_years = pd.to_numeric(df['issue_d'], errors='coerce')

# Years held out as the out-of-time (OOT) test set
TEST_YEARS = [2018, 2019, 2020]

# Train strictly on years before the first test year. Defining the training
# mask as "not a test year" would also pull in rows whose year is NaN
# (unparseable issue_d) or later than the last test year, which breaks the
# out-of-time guarantee the printed labels promise. Comparisons against NaN
# are False, so such rows end up in neither set.
test_mask = issue_years.isin(TEST_YEARS)
train_mask = issue_years < min(TEST_YEARS)

# Make excluded rows visible instead of silently training on them.
n_unassigned = int((~(train_mask | test_mask)).sum())
if n_unassigned:
    print(f"Excluded {n_unassigned} rows with missing or out-of-range issue_d")

# X and y share df's index, so the boolean masks align row-for-row.
X_train, y_train = X.loc[train_mask], y.loc[train_mask]
X_test, y_test = X.loc[test_mask], y.loc[test_mask]

print(f"Train Set (Pre-2018): {X_train.shape}")
print(f"Test Set (2018-2020): {X_test.shape} (OOT Validation)")

Train Set (Pre-2018): (957145, 28)
Test Set (2018-2020): (129091, 28) (OOT Validation)


: 