In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [None]:
# Load the data
df = pd.read_csv("../data/train.csv")

# Split features and target
X = df.drop(['loan_status', 'id'], axis=1)
y = df['loan_status']

# Automatically identify categorical and numerical columns based on dtype
# Numerical columns: int64, float64
# Categorical columns: object, category, bool
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
categorical_cols = X.select_dtypes(include=['object', 'category', 'bool']).columns.tolist()

print(f"Numerical columns: {numerical_cols}")
print(f"Categorical columns: {categorical_cols}")

# Create preprocessor
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ]
)

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

Numerical columns: ['person_age', 'person_income', 'person_emp_length', 'loan_amnt', 'loan_int_rate', 'loan_percent_income', 'cb_person_cred_hist_length']
Categorical columns: ['person_home_ownership', 'loan_intent', 'loan_grade', 'cb_person_default_on_file']


# 1. Scikit-learn implementation
## Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score

# Create pipeline with preprocessing and logistic regression
log_reg_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(random_state=21, max_iter=1000))
])

# Train the model
log_reg_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = log_reg_pipeline.predict(X_test)
y_pred_proba = log_reg_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("Logistic Regression Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Logistic Regression Results:
Accuracy: 0.9141
ROC AUC: 0.7615
              precision    recall  f1-score   support

           0       0.93      0.98      0.95     10054
           1       0.79      0.55      0.65      1675

    accuracy                           0.91     11729
   macro avg       0.86      0.76      0.80     11729
weighted avg       0.91      0.91      0.91     11729



## Scikit-learn: Random Forest

In [8]:
from sklearn.ensemble import RandomForestClassifier

# Create pipeline with preprocessing and Random Forest
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=21))
])

# Train the model
rf_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = rf_pipeline.predict(X_test)
y_pred_proba = rf_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("Random Forest Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

# Feature importance
feature_names = (
    numerical_cols + 
    rf_pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_cols).tolist()
)
importances = rf_pipeline.named_steps['classifier'].feature_importances_
indices = np.argsort(importances)[::-1]

print("Feature ranking:")
for f in range(min(10, len(feature_names))):
    print(f"{f+1}. {feature_names[indices[f]]} ({importances[indices[f]]:.4f})")

Random Forest Results:
Accuracy: 0.9513
ROC AUC: 0.8527
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     10054
           1       0.93      0.71      0.81      1675

    accuracy                           0.95     11729
   macro avg       0.94      0.85      0.89     11729
weighted avg       0.95      0.95      0.95     11729

Feature ranking:
1. loan_percent_income (0.2358)
2. loan_int_rate (0.1186)
3. person_income (0.1065)
4. loan_grade_D (0.0916)
5. loan_amnt (0.0727)
6. person_emp_length (0.0629)
7. person_home_ownership_RENT (0.0531)
8. person_age (0.0455)
9. cb_person_cred_hist_length (0.0362)
10. person_home_ownership_MORTGAGE (0.0277)


## Scikit-learn: Gradient Boosting

In [9]:
from sklearn.ensemble import GradientBoostingClassifier

# Create pipeline with preprocessing and Gradient Boosting
gb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', GradientBoostingClassifier(n_estimators=100, random_state=21))
])

# Train the model
gb_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = gb_pipeline.predict(X_test)
y_pred_proba = gb_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("Gradient Boosting Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Gradient Boosting Results:
Accuracy: 0.9517
ROC AUC: 0.8566
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     10054
           1       0.92      0.72      0.81      1675

    accuracy                           0.95     11729
   macro avg       0.94      0.86      0.89     11729
weighted avg       0.95      0.95      0.95     11729



## Scikit-learn: Support Vector Machine

In [10]:
from sklearn.svm import SVC

# Create pipeline with preprocessing and SVM
svm_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', SVC(probability=True, random_state=21))
])

# Train the model
svm_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = svm_pipeline.predict(X_test)
y_pred_proba = svm_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("SVM Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

SVM Results:
Accuracy: 0.9439
ROC AUC: 0.8312
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     10054
           1       0.91      0.67      0.77      1675

    accuracy                           0.94     11729
   macro avg       0.93      0.83      0.87     11729
weighted avg       0.94      0.94      0.94     11729



## Scikit-learn: K-Nearest Neighbors

In [14]:
from sklearn.neighbors import KNeighborsClassifier

# Create pipeline with preprocessing and KNN
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', KNeighborsClassifier(n_neighbors=5))
])

# Train the model
knn_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = knn_pipeline.predict(X_test)
y_pred_proba = knn_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("KNN Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

SVM Results:
Accuracy: 0.9318
ROC AUC: 0.8129
              precision    recall  f1-score   support

           0       0.94      0.98      0.96     10054
           1       0.84      0.65      0.73      1675

    accuracy                           0.93     11729
   macro avg       0.89      0.81      0.85     11729
weighted avg       0.93      0.93      0.93     11729



## Scikit-learn: Decision Tree

In [15]:
from sklearn.tree import DecisionTreeClassifier

# Create pipeline with preprocessing and KNN
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', DecisionTreeClassifier(random_state=21))
])

# Train the model
dt_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = dt_pipeline.predict(X_test)
y_pred_proba = dt_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("KNN Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

KNN Results:
Accuracy: 0.9143
ROC AUC: 0.8351
              precision    recall  f1-score   support

           0       0.95      0.95      0.95     10054
           1       0.69      0.72      0.71      1675

    accuracy                           0.91     11729
   macro avg       0.82      0.84      0.83     11729
weighted avg       0.92      0.91      0.92     11729



## Model Comparison and Hyperparameter Tuning

In [16]:
from sklearn.model_selection import cross_val_score

# List of models to compare
models = [
    ('Logistic Regression', LogisticRegression(random_state=21, max_iter=1000)),
    ('Random Forest', RandomForestClassifier(random_state=21)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=21)),
    ('SVM', SVC(probability=True, random_state=21)),
    ('KNN', KNeighborsClassifier(n_neighbors=5)),
    ('Decision Tree', DecisionTreeClassifier(random_state=21))
]

# Compare models using cross-validation
print("Cross-Validation Results:")
for name, model in models:
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('classifier', model)
    ])
    scores = cross_val_score(pipeline, X, y, cv=5, scoring='roc_auc')
    print(f"{name}: Mean ROC AUC = {scores.mean():.4f}, Std = {scores.std():.4f}")

Cross-Validation Results:
Logistic Regression: Mean ROC AUC = 0.9020, Std = 0.0021
Random Forest: Mean ROC AUC = 0.9343, Std = 0.0023
Gradient Boosting: Mean ROC AUC = 0.9398, Std = 0.0027
SVM: Mean ROC AUC = 0.8916, Std = 0.0064
KNN: Mean ROC AUC = 0.8773, Std = 0.0048
Decision Tree: Mean ROC AUC = 0.8325, Std = 0.0083


## Hyperparameter Tuning with GridSearchCV

In [18]:
from sklearn.model_selection import GridSearchCV

# Tuning Random Forest
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
    'classifier__min_samples_leaf': [1, 2, 4]
}

rf_model = RandomForestClassifier(random_state=21)
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', rf_model)
])

grid_search = GridSearchCV(
    rf_pipeline,
    param_grid,
    cv=5,
    scoring='roc_auc',
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train, y_train)
print(f"Best parameters: {grid_search.best_params_}")
print(f"Best ROC AUC score: {grid_search.best_score_:.4f}")

# Use the best model
best_rf_model = grid_search.best_estimator_

# Make predictions
y_pred = best_rf_model.predict(X_test)
y_pred_proba = best_rf_model.predict_proba(X_test)[:, 1]

# Evaluate 
print("Best RF Model Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

Fitting 5 folds for each of 108 candidates, totalling 540 fits
Best parameters: {'classifier__max_depth': 20, 'classifier__min_samples_leaf': 1, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Best ROC AUC score: 0.9376
Best RF Model Results:
Accuracy: 0.9513
ROC AUC: 0.8512
              precision    recall  f1-score   support

           0       0.95      0.99      0.97     10054
           1       0.93      0.71      0.81      1675

    accuracy                           0.95     11729
   macro avg       0.94      0.85      0.89     11729
weighted avg       0.95      0.95      0.95     11729



## Stacking Ensemble

In [19]:
from sklearn.ensemble import StackingClassifier

# Define base models
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=21)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=21)),
    ('lr', LogisticRegression(random_state=21))
]

# Define meta-learner
meta_learner = LogisticRegression(random_state=21)

# Create stacking ensemble
stacking_clf = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_learner,
    cv=5,
    stack_method='predict_proba'
)

# Create pipeline with preprocessing and stacking ensemble
stack_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', stacking_clf)
])

# Train the model
stack_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = stack_pipeline.predict(X_test)
y_pred_proba = stack_pipeline.predict_proba(X_test)[:, 1]

# Evaluate 
print("Stacking Ensemble Results:")
print(f"Accuracy: {accuracy_score(y_test, y_pred):.4f}")
print(f"ROC AUC: {roc_auc_score(y_test, y_pred):.4f}")
print(classification_report(y_test, y_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Stacking Ensemble Results:
Accuracy: 0.9518
ROC AUC: 0.8565
              precision    recall  f1-score   support

           0       0.96      0.99      0.97     10054
           1       0.92      0.72      0.81      1675

    accuracy                           0.95     11729
   macro avg       0.94      0.86      0.89     11729
weighted avg       0.95      0.95      0.95     11729



## Scikit-learn: Multi Layer Perceptron (Neural Network)

In [20]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report

# Create pipeline with preprocessing and MLP classifier
mlp_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', MLPClassifier(hidden_layer_sizes=(100, 50),
                                max_iter=1000,
                                activation='relu',
                                solver='adam',
                                random_state=21))
])

# Train the model
mlp_pipeline.fit(X_train, y_train)

# Make predictions
y_pred = mlp_pipeline.predict(X_test)

# Evaluate
print(f"Scikit-learn MLP Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

Scikit-learn MLP Accuracy: 0.9279563475147071
              precision    recall  f1-score   support

           0       0.95      0.96      0.96     10054
           1       0.77      0.71      0.74      1675

    accuracy                           0.93     11729
   macro avg       0.86      0.84      0.85     11729
weighted avg       0.93      0.93      0.93     11729

