## Model ensembling with weighted average

This is a much-simplified model ensemble, as it only includes three single models.
In the full solution, with 27 single models and 5 open solution models, ensembling can achieve a boost of ~0.002.

private LB: 0.8004, public LB: 0.8046, local CV: 0.8028

In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score

Read single model predictions

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
import gc

# ======================================================================================
# Step 1: set the file paths and load the data
# ======================================================================================
print("Step 1: Loading and preparing data...")

# Input files, one per single model, listed in model order.
# NOTE(review): the training paths are absolute and machine-specific while the
# test paths are relative ('../output/...') -- confirm whether both are meant
# to point at the same directory and, if so, make them consistent.
train_files = ['/Users/chenzeyu/Documents/GitHub/home-credit-default-risk/output/train_pred_lgb1.csv',
               '/Users/chenzeyu/Documents/GitHub/home-credit-default-risk/output/train_pred_lgb2.csv',
               '/Users/chenzeyu/Documents/GitHub/home-credit-default-risk/output/train_pred_lgb3.csv']
test_files = ['../output/test_pred_lgb1.csv',
              '../output/test_pred_lgb2.csv',
              '../output/test_pred_lgb3.csv']

n_models = len(train_files)


def _load_predictions(files, pred_col):
    """Merge per-model prediction files into one frame keyed by SK_ID_CURR.

    The first file is kept whole, so any extra columns it carries survive the
    merge; from every later file only ``SK_ID_CURR`` and the prediction column
    are taken and left-joined onto the first file's rows. The prediction
    column of the k-th file (1-based) is renamed to ``lgb{k}_prob``.

    Args:
        files: CSV paths, ordered by model.
        pred_col: Name of the prediction column inside every file.

    Returns:
        DataFrame holding the first file's columns plus one ``lgb{k}_prob``
        column per file.

    Raises:
        ValueError: If any ``lgb{k}_prob`` column contains missing values
            after merging (for example because a later file lacks some of
            the first file's IDs).
    """
    merged = pd.read_csv(files[0]).rename(columns={pred_col: 'lgb1_prob'})
    for k, path in enumerate(files[1:], start=2):
        prob_col = f'lgb{k}_prob'
        preds = pd.read_csv(path).rename(columns={pred_col: prob_col})
        merged = pd.merge(merged, preds[['SK_ID_CURR', prob_col]], on='SK_ID_CURR', how='left')

    # A left merge fills unmatched IDs with NaN; fail here with a clear message
    # instead of letting the NaN reach the metric / meta-model further down.
    prob_cols = [f'lgb{k}_prob' for k in range(1, len(files) + 1)]
    incomplete = [c for c in prob_cols if merged[c].isna().any()]
    if incomplete:
        raise ValueError(f"Missing predictions after merging on SK_ID_CURR in columns: {incomplete}")
    return merged


# --- Training data: the prediction column is named 'prob' in the train files ---
train_df = _load_predictions(train_files, 'prob')

# Features (X) are the per-model probabilities; the label (y) is the 'target' column.
feature_cols = [f'lgb{i+1}_prob' for i in range(n_models)]
train_x = train_df[feature_cols]
train_y = train_df['target']

# --- Test data: the prediction column is named 'TARGET' in the test files ---
test_df = _load_predictions(test_files, 'TARGET')

# Test-set features and the IDs needed for the submission file.
test_x = test_df[feature_cols]
test_id = test_df['SK_ID_CURR']

print(f"Training features shape: {train_x.shape}")
print(f"Test features shape: {test_x.shape}")


# ======================================================================================
# Step 2: report the AUC of each single model on the training labels
# ======================================================================================
print("\nStep 2: Evaluating individual model performance...")
# DataFrame.items() yields (column name, column Series) pairs, one per model.
for col, scores in train_x.items():
    auc = roc_auc_score(train_y, scores)
    print(f"Model '{col}' single AUC: {auc:.6f}")


# ======================================================================================
# Step 3: stack the single models with a linear meta-model
# ======================================================================================
print("\nStep 3: Stacking models using Logistic Regression with Cross-Validation...")

# Out-of-fold predictions for the training rows, and the running fold-average
# of the test-set predictions.
oof_preds = np.zeros(train_x.shape[0])
test_preds = np.zeros(test_x.shape[0])

# Shuffled, seeded, stratified 5-fold split used to fit the meta-model.
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
n_splits = kf.get_n_splits()

for n_fold, (trn_idx, val_idx) in enumerate(kf.split(train_x, train_y)):
    print(f"--- Training Fold {n_fold + 1} ---")

    # Meta-model: logistic regression over the single-model probabilities.
    # In scikit-learn C is the inverse regularization strength, so C=0.1
    # regularizes more strongly than the default C=1.0.
    meta_model = LogisticRegression(C=0.1, random_state=42)

    # Fit on this fold's training rows only.
    meta_model.fit(train_x.iloc[trn_idx], train_y.iloc[trn_idx])

    # Positive-class probability for the held-out rows goes into the OOF array.
    val_proba = meta_model.predict_proba(train_x.iloc[val_idx])
    oof_preds[val_idx] = val_proba[:, 1]

    # Each fold contributes 1/n_splits of its test prediction, so the sum over
    # all folds is the fold average.
    test_proba = meta_model.predict_proba(test_x)
    test_preds += test_proba[:, 1] / n_splits

# AUC of the combined out-of-fold predictions.
oof_auc = roc_auc_score(train_y, oof_preds)
print("\n" + "="*50)
print(f"Stacking Model OOF AUC: {oof_auc:.6f}")
print("="*50)


# ======================================================================================
# Step 4: write the submission file
# ======================================================================================
print("\nStep 4: Generating submission file...")

# Build the two submission columns one at a time, then write without the index.
submission = pd.DataFrame()
submission['SK_ID_CURR'] = test_id
submission['TARGET'] = test_preds
submission.to_csv('submission_stacking_linear.csv', index=False)

print("Submission file 'submission_stacking_linear.csv' created successfully.")
gc.collect()

Check the correlation between the single-model predictions. Ideally we want low correlation, which indicates greater diversity between the single models.

In [3]:
# Print the pairwise correlation matrix of the single-model predictions,
# first for the training set and then for the test set.
for heading, frame in (('correlation train:', train_x), ('correlation test:', test_x)):
    print(heading)
    print(frame.corr())

correlation train:
          lgb1      lgb2      lgb3
lgb1  1.000000  0.990643  0.981924
lgb2  0.990643  1.000000  0.984013
lgb3  0.981924  0.984013  1.000000
correlation test:
          lgb1      lgb2      lgb3
lgb1  1.000000  0.993755  0.990406
lgb2  0.993755  1.000000  0.992537
lgb3  0.990406  0.992537  1.000000


Blend the single models and check the blended model's local CV

In [4]:
# Weighted-average blend of the single-model predictions (equal weights here).
weights = [1.0/3, 1.0/3, 1.0/3]

# One weight per prediction column; a mismatch would otherwise silently drop
# models or index past the last column.
if len(weights) != train_x.shape[1]:
    raise ValueError('expected %d weights, got %d' % (train_x.shape[1], len(weights)))

train_pred = pd.Series(np.zeros([train_x.shape[0]]))
test_pred = pd.Series(np.zeros([test_x.shape[0]]))

# Iterate over the weights themselves: the previous loop bound, `n_model`, is
# not defined anywhere in the script (the model count is stored as `n_models`),
# so the cell raised NameError before blending anything.
for i, weight in enumerate(weights):
    # .values drops the pandas index so the addition is purely positional.
    train_pred += weight * train_x.iloc[:, i].values
    test_pred += weight * test_x.iloc[:, i].values
    print('%25s, auc %.6f   weight: %.4f' % (train_x.columns.values[i], roc_auc_score(train_y, train_x.iloc[:, i]), weight))

print('stacking model auc: train %.6f' % (roc_auc_score(train_y, train_pred)))

                     lgb1, auc 0.802002   weight: 0.3333
                     lgb2, auc 0.801779   weight: 0.3333
                     lgb3, auc 0.801157   weight: 0.3333
stacking model auc: train 0.802830


Save test prediction to disk. This will be our final submission.

In [5]:
# Pair each test ID with its blended prediction and write the submission CSV
# without the index column.
sub = pd.DataFrame({'SK_ID_CURR': test_id, 'TARGET': test_pred})
sub.to_csv('../output/stacked_sub.csv', index=False)