In [None]:
# 环境说明：在 conda 环境 DataMining 中运行
import os
import gc
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import roc_auc_score
import joblib

# Matplotlib 3.6 renamed the bundled seaborn style sheets to 'seaborn-v0_8*'
# and 3.8 removed the old 'seaborn' name, so pick whichever this install has.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

In [None]:
# Load the pre-built feature table. If the parquet file is missing, run
# src.data.make_dataset and src.features.build_features first.
processed_path = Path('data/processed')
features_path = processed_path.joinpath('train_features.parquet')
assert features_path.exists(), f'features not found: {features_path}'

df = pd.read_parquet(features_path)
print('loaded', df.shape)
# Rich preview of the first rows.
display(df.head(5))

In [None]:
# 简单 EDA：缺失率与目标分布（若有）
def missing_stats(df):
    """Return the per-column fraction of missing values, largest first.

    Columns that contain no missing values are left out of the result.
    """
    ranked = df.isna().mean().sort_values(ascending=False)
    has_missing = ranked.gt(0)
    return ranked.loc[has_missing]

# Show the 30 columns with the highest share of missing values.
print('missing features:')
print(missing_stats(df).iloc[:30])

# The label column is optional at this stage; report its class shares if present.
if 'isDefault' in df:
    print('Target distribution:')
    print(df['isDefault'].value_counts(normalize=True))

In [None]:
# Prepare the training data (simplified): separate features from the label
# and make a single stratified hold-out split.
target_col = 'isDefault'
assert target_col in df.columns, 'target column not found in features'
y = df[target_col].astype(int)
X = df.drop(columns=[target_col])

# Train on a subsample or on everything (adjust to the available memory).
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print('train shape', X_train.shape, 'val shape', X_val.shape)

In [None]:
# Train a LightGBM baseline through the scikit-learn interface.
from lightgbm import LGBMClassifier, early_stopping, log_evaluation

clf = LGBMClassifier(n_estimators=1000, learning_rate=0.05, n_jobs=-1, random_state=42)
# LightGBM 4.0 removed the `early_stopping_rounds` and `verbose` keyword
# arguments of fit(); early stopping and periodic evaluation logging are
# requested through callbacks instead (same 50-round settings as before).
clf.fit(
    X_train,
    y_train,
    eval_set=[(X_val, y_val)],
    eval_metric='auc',
    callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=50)],
)

# Validation AUC, scored on column 1 of predict_proba.
pred_val = clf.predict_proba(X_val)[:, 1]
auc = roc_auc_score(y_val, pred_val)
print('Validation AUC:', auc)

# Persist the fitted model. Only the model is written here; this cell does
# not compute or save out-of-fold predictions.
model_path = Path('experiments/results/baseline/model.joblib')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(clf, model_path)
print('Saved model to', model_path)

## 后续改进建议
- 用 StratifiedKFold 做 OOF 并记录 CV AUC
- 做更多特征（目标编码、时间衍生、交互项）
- 使用 Optuna 搜索参数并做模型融合
- 用 SHAP 做特征解释与异常检测