In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    balanced_accuracy_score, average_precision_score, confusion_matrix,
    matthews_corrcoef, cohen_kappa_score, log_loss
)

import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier


In [8]:
# Load data
df = pd.read_csv('credit_risk_dataset.csv')
X = df.drop('loan_status', axis=1)
y = df['loan_status']
class_ratio = y.value_counts()[0]/y.value_counts()[1]

# Define columns
numerical_cols = [
    'person_age', 'person_income', 'person_emp_length', 'loan_amnt',
    'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length'
]
categorical_cols = [
    'person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file'
]

# Handle missing values
missing_info = X.isnull().sum()
missing_cols = missing_info[missing_info > 0]

for col in missing_cols.index:
    if col in numerical_cols:
        X[col] = X[col].fillna(X[col].median())
    else:
        X[col] = X[col].fillna(X[col].mode()[0])


In [10]:
def create_ensemble_features(X, numerical_cols, categorical_cols):
    X_enhanced = X.copy()
    
    # Numerical features
    for col in numerical_cols:
        if col in X_enhanced.columns:
            try:
                X_enhanced[f'{col}_binned'] = pd.cut(X_enhanced[col], bins=5, labels=False, duplicates='drop')
            except:
                X_enhanced[f'{col}_binned'] = X_enhanced[col]
            
            if col in ['person_income', 'loan_amnt']:
                X_enhanced[f'{col}_log'] = np.log1p(X_enhanced[col])
    
    # Categorical encoding
    le_dict = {}
    for col in categorical_cols:
        if col in X_enhanced.columns:
            le = LabelEncoder()
            X_enhanced[f'{col}_encoded'] = le.fit_transform(X_enhanced[col].astype(str))
            le_dict[col] = le
    
    # Credit risk features
    X_enhanced['debt_to_income_risk'] = pd.cut(
        X_enhanced['loan_percent_income'], bins=[0, 0.1, 0.2, 0.3, 1.0], labels=False
    ).fillna(0).astype(int)
    
    X_enhanced['age_group'] = pd.cut(
        X_enhanced['person_age'], bins=[0, 25, 35, 50, 100], labels=False
    ).fillna(0).astype(int)
    
    X_enhanced['emp_stability'] = X_enhanced['person_emp_length'] / np.maximum(X_enhanced['person_age'], 1)
    X_enhanced['income_age_ratio'] = X_enhanced['person_income'] / np.maximum(X_enhanced['person_age'], 1)
    X_enhanced['loan_income_ratio'] = X_enhanced['loan_amnt'] / np.maximum(X_enhanced['person_income'], 1)
    X_enhanced['high_dti'] = (X_enhanced['loan_percent_income'] > 0.3).astype(int)
    X_enhanced['young_borrower'] = (X_enhanced['person_age'] < 25).astype(int)
    X_enhanced['high_interest'] = (X_enhanced['loan_int_rate'] > 15.0).astype(int)
    X_enhanced['short_employment'] = (X_enhanced['person_emp_length'] < 2.0).astype(int)
    
    return X_enhanced, le_dict

X_enhanced, label_encoders = create_ensemble_features(X, numerical_cols, categorical_cols)


In [11]:
# Split data
X_train, X_test, y_train, y_test = train_test_split(
    X_enhanced, y, test_size=0.2, random_state=42, stratify=y
)

# Create numeric versions
X_train_numeric = X_train.copy()
X_test_numeric = X_test.copy()

for col in X_train_numeric.columns:
    if X_train_numeric[col].dtype == 'object':
        le = LabelEncoder()
        X_train_numeric[col] = le.fit_transform(X_train_numeric[col].astype(str))
        X_test_numeric[col] = le.transform(X_test_numeric[col].astype(str))


In [12]:
# XGBoost - YOUR PARAMETERS
xgb_model = xgb.XGBClassifier(
    # objective='binary:logistic', scale_pos_weight=5, max_depth=5, min_child_weight=3,
    # gamma=0.2, subsample=0.9, colsample_bytree=0.9, learning_rate=0.05, n_estimators=600,
    # reg_alpha=0.1, reg_lambda=1.0, max_delta_step=1, random_state=42,
    # eval_metric="logloss", use_label_encoder=False
    objective='binary:logistic',
    scale_pos_weight=5,
    max_depth=5,
    min_child_weight=3,
    gamma=0.2,
    subsample=0.9,
    colsample_bytree=0.9,
    learning_rate=0.05,
    n_estimators=600,
    reg_alpha=0.1,
    reg_lambda=1.0,
    max_delta_step=1,
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False
)
xgb_model.fit(X_train_numeric.fillna(0), y_train)

# Random Forest - YOUR PARAMETERS
rf_model = RandomForestClassifier(
    # n_estimators=200, class_weight='balanced', max_depth=10,
    # min_samples_split=5, min_samples_leaf=2, random_state=42
    n_estimators=200,
    class_weight='balanced',
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)
rf_model.fit(X_train_numeric.fillna(0), y_train)

# CatBoost - YOUR PARAMETERS
categorical_indices = [X_train.columns.get_loc(col) for col in categorical_cols if col in X_train.columns]
cat_model = CatBoostClassifier(
    # iterations=1000, learning_rate=0.1, depth=6, loss_function='Logloss',
    # class_weights={0: 1, 1: 12}, nan_mode='Min', cat_features=categorical_indices,
    # l2_leaf_reg=3, eval_metric='AUC', verbose=False, random_seed=42
    iterations=600,
    learning_rate=0.1,
    depth=6,
    loss_function='Logloss',
    class_weights={0: 1, 1: 5},
    nan_mode='Min',
    cat_features=categorical_indices,
    l2_leaf_reg=8,
    task_type='CPU',
    thread_count=-1,
    eval_metric='AUC',
    verbose=100,
    random_seed=42
)
cat_model.fit(X_train.fillna('missing'), y_train)

# Optimized LightGBM
lgb_model = lgb.LGBMClassifier(
    objective='binary', boosting_type='gbdt', class_weight='balanced', is_unbalance=True,
    num_leaves=50, max_depth=6, min_child_samples=15, min_child_weight=0.01,
    learning_rate=0.06, n_estimators=600, reg_alpha=0.3, reg_lambda=2.0, min_split_gain=0.05,
    feature_fraction=0.95, feature_fraction_bynode=0.8, bagging_fraction=0.8, bagging_freq=3,
    max_bin=511, pos_bagging_fraction=0.7, neg_bagging_fraction=0.9,
    random_state=42, n_jobs=-1, verbose=-1
)

lgb_model.fit(
    X_train_numeric.fillna(0), y_train,
    eval_set=[(X_test_numeric.fillna(0), y_test)],
    eval_metric='auc',
    callbacks=[lgb.early_stopping(stopping_rounds=100, verbose=False), lgb.log_evaluation(0)]
)


0:	total: 179ms	remaining: 1m 47s
100:	total: 4.06s	remaining: 20.1s
200:	total: 7.84s	remaining: 15.6s
300:	total: 11.5s	remaining: 11.4s
400:	total: 15.2s	remaining: 7.53s
500:	total: 18.8s	remaining: 3.71s
599:	total: 23.1s	remaining: 0us


In [15]:
def optimize_threshold_your_way(prob, y_true):
    thresholds = np.arange(0.1, 0.9, 0.05)
    best_thresh = 0.5
    best_recall = 0
    
    for t in thresholds:
        y_pred_thresh = (prob >= t).astype(int)
        rec = recall_score(y_true, y_pred_thresh)
        prec = precision_score(y_true, y_pred_thresh)
        
        if rec > best_recall and prec >= 0.7:
            best_recall = rec
            best_thresh = t
    
    return best_thresh

# Get probabilities
xgb_prob = xgb_model.predict_proba(X_test_numeric.fillna(0))[:, 1]
lgb_prob = lgb_model.predict_proba(X_test_numeric.fillna(0))[:, 1]
cat_prob = cat_model.predict_proba(X_test.fillna('missing'))[:, 1]
rf_prob = rf_model.predict_proba(X_test_numeric.fillna(0))[:, 1]

# Optimize thresholds
xgb_thresh = optimize_threshold_your_way(xgb_prob, y_test)
lgb_thresh = optimize_threshold_your_way(lgb_prob, y_test)
cat_thresh = optimize_threshold_your_way(cat_prob, y_test)
rf_thresh = 0.5

# Create predictions
xgb_pred = (xgb_prob >= xgb_thresh).astype(int)
lgb_pred = (lgb_prob >= lgb_thresh).astype(int)
cat_pred = (cat_prob >= cat_thresh).astype(int)
rf_pred = (rf_prob >= rf_thresh).astype(int)


In [19]:
def create_smart_ensemble(probabilities_dict, predictions_dict, y_test):
    weights = {}
    model_scores = {}
    
    for name, prob in probabilities_dict.items():
        pred = predictions_dict[name]
        
        auc = roc_auc_score(y_test, prob)
        f1 = f1_score(y_test, pred)
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        mcc = matthews_corrcoef(y_test, pred)
        
        # CHANGED: Higher weight on recall for credit risk
        credit_score = (auc * 0.25) + (recall * 0.40) + (f1 * 0.20) + (precision * 0.10) + (mcc * 0.05)
        model_scores[name] = credit_score
    
    # Calculate weights (same logic)
    total_score = sum(model_scores.values())
    for name in model_scores:
        base_weight = model_scores[name] / total_score
        weights[name] = base_weight * 1.1 if model_scores[name] >= max(model_scores.values()) * 0.95 else base_weight
    
    total_weight = sum(weights.values())
    for name in weights:
        weights[name] = weights[name] / total_weight
    
    # Create ensemble
    ensemble_prob = np.zeros(len(y_test))
    for name, prob in probabilities_dict.items():
        ensemble_prob += prob * weights[name]
    
    return ensemble_prob, weights

probabilities_dict = {'XGBoost': xgb_prob, 'LightGBM': lgb_prob, 'CatBoost': cat_prob, 'RandomForest': rf_prob}
predictions_dict = {'XGBoost': xgb_pred, 'LightGBM': lgb_pred, 'CatBoost': cat_pred, 'RandomForest': rf_pred}

ensemble_prob, model_weights = create_smart_ensemble(probabilities_dict, predictions_dict, y_test)

# CHANGED: Lower threshold for better recall
def optimize_ensemble_threshold_recall_focused(ensemble_prob, y_test):
    best_thresh = 0.4  # Start lower
    best_score = 0
    
    for threshold in np.arange(0.25, 0.55, 0.02):  # Lower range
        pred = (ensemble_prob >= threshold).astype(int)
        f1 = f1_score(y_test, pred)
        precision = precision_score(y_test, pred)
        recall = recall_score(y_test, pred)
        accuracy = accuracy_score(y_test, pred)
        
        # CHANGED: Prioritize recall more
        ensemble_score = (accuracy * 0.20) + (f1 * 0.30) + (precision * 0.15) + (recall * 0.35)
        
        # CHANGED: Lower precision requirement, higher recall requirement
        if ensemble_score > best_score and precision >= 0.60 and recall >= 0.82:
            best_score = ensemble_score
            best_thresh = threshold
    
    return best_thresh

ensemble_thresh = optimize_ensemble_threshold_recall_focused(ensemble_prob, y_test)
ensemble_pred = (ensemble_prob >= ensemble_thresh).astype(int)


In [21]:
def calculate_comprehensive_metrics(y_true, y_pred, y_prob):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'balanced_accuracy': balanced_accuracy_score(y_true, y_pred),
        'precision': precision_score(y_true, y_pred),
        'recall': recall_score(y_true, y_pred),
        'specificity': tn / (tn + fp),
        'f1_score': f1_score(y_true, y_pred),
        'roc_auc': roc_auc_score(y_true, y_prob),
        'pr_auc': average_precision_score(y_true, y_prob),
        'mcc': matthews_corrcoef(y_true, y_pred),
        'gini': 2 * roc_auc_score(y_true, y_prob) - 1,
        'log_loss': log_loss(y_true, y_prob)
    }

# Evaluate all models
models_results = {
    'XGBoost': calculate_comprehensive_metrics(y_test, xgb_pred, xgb_prob),
    'LightGBM': calculate_comprehensive_metrics(y_test, lgb_pred, lgb_prob),
    'CatBoost': calculate_comprehensive_metrics(y_test, cat_pred, cat_prob),
    'RandomForest': calculate_comprehensive_metrics(y_test, rf_pred, rf_prob),
    'Ensemble': calculate_comprehensive_metrics(y_test, ensemble_pred, ensemble_prob)
}

# Create results dataframe
performance_df = pd.DataFrame(models_results).T.round(4)

print("Performance Comparison:")
print(performance_df)

# Find best performers
key_metrics = ['accuracy', 'f1_score', 'roc_auc', 'precision', 'recall']
overall_scores = {}

for model in models_results.keys():
    score = sum(performance_df.loc[model, metric] for metric in key_metrics) / len(key_metrics)
    overall_scores[model] = score

best_overall = max(overall_scores, key=overall_scores.get)
print(f"\nBest Overall Model: {best_overall}")
print(f"Combined Score: {overall_scores[best_overall]:.4f}")


Performance Comparison:
              accuracy  balanced_accuracy  precision  recall  specificity  \
XGBoost         0.8935             0.8815     0.7119  0.8601       0.9028   
LightGBM        0.8900             0.8774     0.7041  0.8551       0.8997   
CatBoost        0.8989             0.8770     0.7353  0.8383       0.9158   
RandomForest    0.9084             0.8443     0.8292  0.7307       0.9580   
Ensemble        0.9124             0.8814     0.7839  0.8263       0.9364   

              f1_score  roc_auc  pr_auc     mcc    gini  log_loss  
XGBoost         0.7790   0.9514  0.9086  0.7152  0.9027    0.2477  
LightGBM        0.7723   0.9505  0.9087  0.7064  0.9010    0.2526  
CatBoost        0.7834   0.9519  0.9106  0.7205  0.9039    0.2365  
RandomForest    0.7768   0.9220  0.8666  0.7218  0.8439    0.2889  
Ensemble        0.8045   0.9517  0.9094  0.7485  0.9033    0.2470  

Best Overall Model: Ensemble
Combined Score: 0.8558
