In [1]:
# KAGGLE NOVEMBER 2025 - IMPROVED PIPELINE
# Multi-Seed + External Data Augmentation

import numpy as np
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import roc_auc_score
import warnings
warnings.filterwarnings('ignore')


In [2]:
# CONFIGURATION


# File paths
TRAIN_PATH = 'train.csv'
TEST_PATH = 'test.csv'
ORIG_PATH = 'loan_dataset_20000.csv'

# Training configuration
SEEDS = [42, 43, 44, 45, 46]  # Multi-seed for variance reduction
N_SPLITS = 5  # Stratified K-Fold
TARGET = 'loan_paid_back'

# LightGBM parameters (from your Optuna optimization)
LGBM_PARAMS = {
    'objective': 'binary',
    'metric': 'auc',
    'boosting_type': 'gbdt',
    'n_estimators': 1000,
    'learning_rate': 0.04619852582842627,
    'num_leaves': 57,
    'max_depth': 12,
    'min_child_samples': 190,
    'subsample': 0.7152363987011763,
    'colsample_bytree': 0.6517074545892816,
    'lambda_l1': 4.023591669670226,
    'lambda_l2': 3.995624195713791,
    'min_split_gain': 0.1165571634393688,
    'verbosity': -1
}

print(f"   Seeds: {SEEDS}")
print(f"   CV folds: {N_SPLITS}")
print(f"   LightGBM learning_rate: {LGBM_PARAMS['learning_rate']:.4f}")

   Seeds: [42, 43, 44, 45, 46]
   CV folds: 5
   LightGBM learning_rate: 0.0462


In [3]:
# LOAD DATA

# Load train and test
train = pd.read_csv(TRAIN_PATH)
test = pd.read_csv(TEST_PATH)
orig = pd.read_csv(ORIG_PATH)

print(f" Train shape: {train.shape}")
print(f" Test shape: {test.shape}")
print(f" Original dataset shape: {orig.shape}")

print(f"\nTarget distribution (train):")
print(train[TARGET].value_counts(normalize=True))

print(f"\nTarget distribution (original):")
print(orig[TARGET].value_counts(normalize=True))

 Train shape: (593994, 13)
 Test shape: (254569, 12)
 Original dataset shape: (20000, 22)

Target distribution (train):
loan_paid_back
1.0    0.79882
0.0    0.20118
Name: proportion, dtype: float64

Target distribution (original):
loan_paid_back
1    0.7999
0    0.2001
Name: proportion, dtype: float64


In [4]:
# EXTERNAL DATA AUGMENTATION
# Categorical features to augment
cat_features = ['gender', 'marital_status', 'education_level',
                'employment_status', 'loan_purpose', 'grade_subgrade']

external_features = []

for col in cat_features:
    print(f"\nProcessing {col}...")

    # Calculate statistics from original dataset
    orig_stats = orig.groupby(col)[TARGET].agg(['mean', 'std', 'count'])

    # Create feature names
    mean_col = f'orig_mean_{col}'
    std_col = f'orig_std_{col}'
    count_col = f'orig_count_{col}'

    # Map to train
    train[mean_col] = train[col].map(orig_stats['mean'])
    train[std_col] = train[col].map(orig_stats['std'])
    train[count_col] = train[col].map(orig_stats['count'])

    # Map to test
    test[mean_col] = test[col].map(orig_stats['mean'])
    test[std_col] = test[col].map(orig_stats['std'])
    test[count_col] = test[col].map(orig_stats['count'])

    # Fill NaN with global statistics
    global_mean = orig[TARGET].mean()
    global_std = orig[TARGET].std()

    for df in [train, test]:
        df[mean_col].fillna(global_mean, inplace=True)
        df[std_col].fillna(global_std, inplace=True)
        df[count_col].fillna(0, inplace=True)

    external_features.extend([mean_col, std_col, count_col])
    print(f"   Created: {mean_col}, {std_col}, {count_col}")

print(f" Created {len(external_features)} external features")

# Correlation check
print("\n Correlation check (external vs base features):")
base_numerical = ['annual_income', 'loan_amount', 'credit_score',
                  'debt_to_income_ratio', 'interest_rate']

for i, ext_feat in enumerate(external_features[:6]):  # Show first 6
    max_corr = train[[ext_feat] + base_numerical].corr()[ext_feat][1:].abs().max()
    status = "" if max_corr < 0.9 else ""
    print(f"  {status} {ext_feat[:35]:35s} max_r = {max_corr:.4f}")

print("\nExternal data augmentation complete!")


Processing gender...
   Created: orig_mean_gender, orig_std_gender, orig_count_gender

Processing marital_status...
   Created: orig_mean_marital_status, orig_std_marital_status, orig_count_marital_status

Processing education_level...
   Created: orig_mean_education_level, orig_std_education_level, orig_count_education_level

Processing employment_status...
   Created: orig_mean_employment_status, orig_std_employment_status, orig_count_employment_status

Processing loan_purpose...
   Created: orig_mean_loan_purpose, orig_std_loan_purpose, orig_count_loan_purpose

Processing grade_subgrade...
   Created: orig_mean_grade_subgrade, orig_std_grade_subgrade, orig_count_grade_subgrade
 Created 18 external features

 Correlation check (external vs base features):
   orig_mean_gender                    max_r = 0.0049
   orig_std_gender                     max_r = 0.0050
   orig_count_gender                   max_r = 0.0049
   orig_mean_marital_status            max_r = 0.0051
   orig_std_mar

In [5]:

# PREPROCESSING

# Create copies
train_df = train.copy()
test_df = test.copy()

# Save ID for submission
test_ids = test_df['id'].copy()

# Drop ID
train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

# Encode categorical features
cat_cols = ['gender', 'marital_status', 'education_level',
            'employment_status', 'loan_purpose', 'grade_subgrade']

for col in cat_cols:
    le = LabelEncoder()
    combined = pd.concat([train_df[col], test_df[col]]).astype(str)
    le.fit(combined)
    train_df[col] = le.transform(train_df[col])
    test_df[col] = le.transform(test_df[col])
    print(f"Encoded {col}")

# Create feature matrix and target
X = train_df.drop(columns=[TARGET])
y = train_df[TARGET]
X_test = test_df.copy()


print("Preprocessing complete")
print(f"Total features: {X.shape[1]}")
print(f"  - Base features: 11")
print(f"  - External features: {len(external_features)}")
print(f"Samples: {X.shape[0]} train, {X_test.shape[0]} test")

Encoded gender
Encoded marital_status
Encoded education_level
Encoded employment_status
Encoded loan_purpose
Encoded grade_subgrade
Preprocessing complete
Total features: 29
  - Base features: 11
  - External features: 18
Samples: 593994 train, 254569 test


In [6]:
# TRAINING FUNCTION

def train_lgbm_single_seed(X, y, X_test, params, seed, n_splits=5):
    """
    Train LightGBM with a single seed using K-Fold CV

    Returns:
        oof_preds: Out-of-fold predictions on train
        test_preds: Predictions on test
        cv_score: Cross-validation AUC score
    """
    # Update seed
    params_seed = params.copy()
    params_seed['random_state'] = seed

    # Initialize predictions
    oof_preds = np.zeros(len(X))
    test_preds = np.zeros(len(X_test))

    # Stratified K-Fold
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)

    fold_scores = []
    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        # Split data
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        # Train model
        model = lgb.LGBMClassifier(**params_seed)
        model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],
            callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
        )

        # Predict
        oof_preds[val_idx] = model.predict_proba(X_val)[:, 1]
        test_preds += model.predict_proba(X_test)[:, 1] / n_splits

        # Score
        fold_auc = roc_auc_score(y_val, oof_preds[val_idx])
        fold_scores.append(fold_auc)

    # Calculate overall CV
    cv_score = roc_auc_score(y, oof_preds)

    return oof_preds, test_preds, cv_score


In [7]:

# MULTI-SEED TRAINING


print("\nTraining LightGBM with multi-seed averaging")

all_oof = []
all_test = []
all_cv = []

for i, seed in enumerate(SEEDS):
    print(f"\n Seed {i+1}/{len(SEEDS)}: {seed}")
    oof, test, cv = train_lgbm_single_seed(
        X, y, X_test,
        LGBM_PARAMS,
        seed,
        n_splits=N_SPLITS
    )

    all_oof.append(oof)
    all_test.append(test)
    all_cv.append(cv)

    print(f"   CV AUC: {cv:.6f}")

# Average predictions
oof_final = np.mean(all_oof, axis=0)
test_final = np.mean(all_test, axis=0)
cv_final = roc_auc_score(y, oof_final)

print("multi seed results")
print(f"\nIndividual CVs:")
for i, cv in enumerate(all_cv):
    print(f"  Seed {SEEDS[i]}: {cv:.6f}")

print(f"\nMean of individuals: {np.mean(all_cv):.6f} ± {np.std(all_cv):.6f}")
print(f"Multi-seed ensemble: {cv_final:.6f}")
print(f"Gain from averaging: +{cv_final - np.mean(all_cv):.6f}")



Training LightGBM with multi-seed averaging

 Seed 1/5: 42
   CV AUC: 0.923178

 Seed 2/5: 43
   CV AUC: 0.923035

 Seed 3/5: 44
   CV AUC: 0.923323

 Seed 4/5: 45
   CV AUC: 0.923196

 Seed 5/5: 46
   CV AUC: 0.923245
multi seed results

Individual CVs:
  Seed 42: 0.923178
  Seed 43: 0.923035
  Seed 44: 0.923323
  Seed 45: 0.923196
  Seed 46: 0.923245

Mean of individuals: 0.923195 ± 0.000094
Multi-seed ensemble: 0.923682
Gain from averaging: +0.000487


In [8]:
# CREATE SUBMISSION


submission = pd.DataFrame({
    'id': test_ids,
    TARGET: test_final
})

submission.to_csv('submission_improved.csv', index=False)
print(f"File: submission_improved.csv")
print(f"Rows: {len(submission)}")
print(f"\nSample predictions:")
print(submission.head())

File: submission_improved.csv
Rows: 254569

Sample predictions:
       id  loan_paid_back
0  593994        0.932328
1  593995        0.978760
2  593996        0.529594
3  593997        0.908904
4  593998        0.968299


In [9]:
# COMPARISON WITH BASELINE

baseline_cv = 0.92321  # Your Phase 1 baseline


print("improvement summary")
print(f"\nBaseline (single LGB):           {baseline_cv:.6f}")
print(f"New pipeline (multi-seed + ext): {cv_final:.6f}")
print(f"Improvement:                     +{cv_final - baseline_cv:.6f} ({((cv_final/baseline_cv - 1)*100):.2f}%)")

print(f"\nFeature breakdown:")
print(f"  Base features:      11")
print(f"  External features:  {len(external_features)}")
print(f"  Total features:     {X.shape[1]}")




improvement summary

Baseline (single LGB):           0.923210
New pipeline (multi-seed + ext): 0.923682
Improvement:                     +0.000472 (0.05%)

Feature breakdown:
  Base features:      11
  External features:  18
  Total features:     29
