In [None]:
# Install necessary libraries
!pip3 install xgboost lightgbm catboost hillclimbers

# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
import lightgbm as lgb
from catboost import CatBoostClassifier
from hillclimbers import climb_hill, partial
import warnings

# Silence every Python warning so the training log stays readable
warnings.filterwarnings('ignore')

# Data Loading
# The training file is extended with the rows of 'original.csv'; ids are then
# regenerated so that each row of the enlarged training frame has a unique one.
train = pd.read_csv('train.csv')
original = pd.read_csv('original.csv')
train = pd.concat((train, original), ignore_index=True)
train = train.assign(id=np.arange(len(train)))

test = pd.read_csv('test.csv')
submission = pd.read_csv('sample_submission.csv')

target = 'loan_status'

# Stack train features (target removed) on top of test so that both receive
# exactly the same preprocessing; train rows come first, test rows after them.
combined = pd.concat(
    (train.drop(columns=[target]), test),
    ignore_index=True,
)

# Data Preprocessing
# Fill missing values with the column median (taken over the combined
# train + test rows). The result is assigned back to the column instead of
# calling fillna(inplace=True) on `combined[col]`: that chained in-place form
# acts on an intermediate object, emits a FutureWarning on recent pandas and
# leaves `combined` untouched once Copy-on-Write is active.
for col in ('person_emp_length', 'loan_int_rate'):
    combined[col] = combined[col].fillna(combined[col].median())

# Encode categorical variables as integer codes; each encoder is fitted on the
# combined frame so train and test share one mapping per column.
categorical_cols = ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']
for col in categorical_cols:
    le = LabelEncoder()
    combined[col] = le.fit_transform(combined[col])

# Split combined data back into train and test.
# The first len(train) rows are the training rows, the rest are test rows;
# 'id' is dropped so it is not used as a feature.
X = combined.iloc[:len(train), :].drop(columns=['id'])
X_test = combined.iloc[len(train):, :].drop(columns=['id'])
y = train[target]

# Evaluation metric handed to the blending step (ROC AUC)
eval_metric = partial(roc_auc_score)

# Cross-validation strategy: shuffled, stratified folds with a fixed seed
n_splits = 5
skf = StratifiedKFold(
    n_splits=n_splits,
    shuffle=True,
    random_state=42,
)

# Per-model containers, keyed by model name, for the out-of-fold predictions
# on the training rows and the fold-averaged predictions on the test rows
oof_preds, test_preds = {}, {}

# Model parameters
# XGBoost settings. The deprecated `use_label_encoder` flag is intentionally
# not passed: recent XGBoost releases no longer use it and only report it as
# an unused parameter.
xgb_params = {
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'random_state': 42,
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
}

# LightGBM settings.
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'random_state': 42,
    'n_estimators': 1000,
    'learning_rate': 0.05,
    'max_depth': 6,
    'num_leaves': 31,
    'subsample': 0.8,
    # Row subsampling is only applied when the bagging frequency is non-zero;
    # without this entry the 'subsample' value above has no effect.
    'subsample_freq': 1,
    'colsample_bytree': 0.8,
    # Silence the per-fit "[LightGBM] [Info]" messages
    'verbose': -1,
}

# CatBoost settings (passed unchanged to the CatBoostClassifier constructor)
cat_params = dict(
    iterations=891,
    depth=7,
    learning_rate=0.0448563477253477,
    l2_leaf_reg=8.309884320742215,
    border_count=161,
    random_strength=5.297381835241815,
    bagging_temperature=0.10498408144882451,
    scale_pos_weight=2.3753101495123747,
    loss_function='Logloss',
    min_data_in_leaf=5,
    verbose=False,
)

# Random forest settings.
rfc_params = {
    'max_features': 2,
    'n_estimators': 2000,
    'min_samples_leaf': 3,
    'criterion': 'entropy',
    'random_state': 0,
    # Build the 2000 trees on all available cores instead of a single one;
    # the fixed random_state keeps the fitted forest reproducible.
    'n_jobs': -1,
}

# Define models: short name -> (estimator class, constructor keyword arguments).
# Insertion order is the order in which the models are trained.
models = dict(
    XGB=(XGBClassifier, xgb_params),
    LGB=(lgb.LGBMClassifier, lgb_params),
    CAT=(CatBoostClassifier, cat_params),
    RFC=(RandomForestClassifier, rfc_params),
)

# Perform cross-validation for each model.
# For every model this fills one out-of-fold prediction per training row and
# a test prediction averaged over the n_splits fold models.
for model_name, (ModelClass, params) in models.items():
    print(f"Training {model_name}...")
    oof_pred = np.zeros(X.shape[0])
    test_pred = np.zeros(X_test.shape[0])

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, y_tr = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        # Extra fit() arguments: CatBoost and LightGBM monitor the validation
        # fold and stop early after 50 rounds without improvement; every other
        # model is fitted on the training fold with no extra arguments.
        if model_name == 'CAT':
            fit_kwargs = dict(
                eval_set=(X_val, y_val),
                use_best_model=True,
                early_stopping_rounds=50,
            )
        elif model_name == 'LGB':
            fit_kwargs = dict(
                eval_set=[(X_val, y_val)],
                eval_names=['valid'],
                # Callbacks are created anew for each fold's fit
                callbacks=[
                    lgb.early_stopping(stopping_rounds=50),
                    lgb.log_evaluation(period=0),
                ],
            )
        else:
            fit_kwargs = {}

        # Train the model
        model = ModelClass(**params)
        model.fit(X_tr, y_tr, **fit_kwargs)

        # Generate predictions: positive-class probability when the estimator
        # exposes predict_proba, otherwise its raw predict() output
        if hasattr(model, 'predict_proba'):
            val_scores = model.predict_proba(X_val)[:, 1]
            test_scores = model.predict_proba(X_test)[:, 1]
        else:
            val_scores = model.predict(X_val)
            test_scores = model.predict(X_test)

        oof_pred[val_idx] = val_scores
        # Each fold contributes an equal share of the final test prediction
        test_pred += test_scores / n_splits

    oof_preds[model_name] = oof_pred
    test_preds[model_name] = test_pred

# Convert predictions to DataFrames (one column per model name)
oof_pred_df = pd.DataFrame.from_dict(oof_preds)
test_pred_df = pd.DataFrame.from_dict(test_preds)

# Prepare train DataFrame with target: a single 'loan_status' column
train_df = y.to_frame(name='loan_status')

# Use the provided climb_hill function for blending.
# The settings are collected first and then passed as keyword arguments; with
# return_oof_preds=False the call's result is used below as the blended test
# predictions.
hill_climb_settings = dict(
    train=train_df,
    oof_pred_df=oof_pred_df,
    test_pred_df=test_pred_df,
    target='loan_status',
    objective='maximize',
    eval_metric=eval_metric,
    negative_weights=False,
    precision=0.01,
    plot_hill=True,
    plot_hist=True,
    return_oof_preds=False,
)
blended_test_preds = climb_hill(**hill_climb_settings)

# Prepare submission: store the blended predictions in the target column of
# the sample-submission frame and write it out without the index
submission = submission.assign(**{target: blended_test_preds})
submission.to_csv('submission.csv', index=False)
print("\nSubmission file created: 'submission.csv'")


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.3.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Training XGB...
Training LGB...
[LightGBM] [Info] Number of positive: 12366, number of negative: 60614
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000809 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 885
[LightGBM] [Info] Number of data points in the train set: 72980, number of used features: 11
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.169444 -> initscore=-1.589575
[LightGBM] [Info] Start training from score -1.589575
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[790]	valid's auc: 0.962916
[LightGBM] [Info] Number of positi