# Loan Approval Prediction

Welcome to the 2024 Kaggle Playground Series! We plan to continue in the spirit of previous playgrounds, providing interesting and approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

Your Goal: The goal for this competition is to predict whether an applicant is approved for a loan.
Current Score: 0.81795
Current rank: 3514/3859 (top 91%)

### Columns investigation

- id - row identifier
- person_age - age 
- person_income - the person's income (hypothesis: the higher the income, the lower the chance of a loan?)
- person_home_ownership - type of ownership (OWN, MORTGAGE, RENT, OTHER)
- person_emp_length - length of employment in years (new feature percent_person_emp_length = person_emp_length/person_age)
- loan_intent - purpose of the loan (DEBTCONSOLIDATION, EDUCATION, HOMEIMPROVEMENT, MEDICAL, PERSONAL, VENTURE)
- loan_grade - grade of loan (A, B, C, D, E, F, G) A - the best, G - the worst
- loan_amnt - amount of loan
- loan_int_rate - interest rate of the loan (hypothesis: the higher the interest rate, the higher the chance of a loan?)
- loan_percent_income - loan amount as a fraction of the person's income (recomputed as the feature loan_percent_income = loan_amnt/person_income)
- cb_person_default_on_file - whether the person has a default on file (Y/N)
- cb_person_cred_hist_length - credit history length (new feature percent_cb_person_cred_hist_length = cb_person_cred_hist_length/person_age)
- loan_status - target variable (1 - loan approved, 0 - loan not approved)

In [24]:
import numpy as np
import pandas as pd
from sklearn.model_selection import cross_val_score
import multiprocessing
from joblib import Parallel, delayed
import time
import warnings
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import (Ridge,
                                  LinearRegression,
                                  Lasso,
                                  ElasticNet,
                                  BayesianRidge,
                                  SGDRegressor,
                                  HuberRegressor)
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB

In [25]:
# Input data: local copies of the competition files. The Kaggle-hosted
# equivalents are kept in the trailing comments.
train_path = './data/loan/train.csv'  # '/kaggle/input/playground-series-s4e10/train.csv'
test_path = './data/loan/test.csv'  # '/kaggle/input/playground-series-s4e10/test.csv'

# Output: destination path for the submission CSV.
submission_path = '/kaggle/working/submission.csv'

In [26]:
# Read both competition files, using the `id` column as the row index.
test, train = (
    pd.read_csv(csv_path, index_col='id') for csv_path in (test_path, train_path)
)

In [27]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


#### Possible improvements:
- Check for outliers and align / remove them - To Do
- Check for missing values and fill them - **Done**
- Check for categorical features and encode them - **Done**
- Check for numerical features and scale them - **Done**
- Check for correlation between features and target variable - To Do
- Check for multicollinearity between features - To Do
- Check for feature importance and drop unimportant features - To Do

In [28]:
# The training set has one implausible row (person_age = 123); keep only the
# rows with an age below 100 so that outlier is discarded.
train = train.loc[train['person_age'] < 100]

In [29]:
# Columns routed to each branch of the preprocessing step.
num_cols = [
    'person_age',
    'person_income',
    'person_emp_length',
    'loan_amnt',
    'loan_int_rate',
    'loan_percent_income',
    'cb_person_cred_hist_length',
]
cat_cols = [
    'loan_grade',
    'cb_person_default_on_file',
    'person_home_ownership',
    'loan_intent',
]
# `is_young` is an engineered flag (see FeatureTransformer); it is forwarded
# by the preprocessor without imputation, scaling or encoding.
pass_through = ['is_young']

# Numeric branch: KNN imputation (5 neighbours), then standardisation.
# (A MinMaxScaler was considered as an alternative 'scale' step.)
num_pipeline = Pipeline([
    ('impute', KNNImputer(n_neighbors=5)),
    ('scale', StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
cat_pipeline = Pipeline([
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('encode', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Apply each branch to its columns; any column not listed above is dropped.
preprocessor = ColumnTransformer(
    [
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
        ('passthrough', 'passthrough', pass_through),
    ],
    n_jobs=-1,
    remainder='drop',
)

In [30]:
class ThresholdTransformer(BaseEstimator, TransformerMixin):
    """Drop columns that are weakly correlated with the target column.

    The frame passed to ``transform`` must still contain the target column;
    it is located by position through ``status_column_index``.

    Parameters
    ----------
    threshold : float, default=0.1
        Columns whose absolute correlation with the target is strictly below
        this value are dropped.
    status_column_index : int, default=-1
        Positional index of the target column within the input frame.
    """

    def __init__(self, threshold=0.1, status_column_index=-1):
        self.threshold = threshold
        self.status_column_index = status_column_index

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the selection happens in ``transform``."""
        return self

    def transform(self, X):
        """Return a new frame without the weakly correlated columns and the target.

        The input frame is left unmodified.
        """
        # Resolve the target's *label* from its position once, up front:
        # DataFrame.drop works on labels, so the raw positional index
        # (e.g. -1) cannot be handed to it directly.
        status_column = X.columns[self.status_column_index]

        corr = X.corr()[status_column]
        columns_to_drop = list(corr[corr.abs() < self.threshold].index)

        # The target is removed as well; avoid listing it twice in case it
        # already fell below the threshold.
        if status_column not in columns_to_drop:
            columns_to_drop.append(status_column)

        # Non-inplace drop returns a new frame, so the caller's data is kept.
        return X.drop(columns=columns_to_drop)

In [31]:
class FeatureTransformer(BaseEstimator, TransformerMixin):
    """Add ratio and flag features derived from the raw loan columns.

    Produced columns:

    - ``percent_person_emp_length``: ``person_emp_length / person_age``
    - ``percent_cb_person_cred_hist_length``:
      ``cb_person_cred_hist_length / person_age``
    - ``loan_percent_income``: ``loan_amnt / person_income`` (replaces the
      column of the same name when the input already has one)
    - ``is_young``: 1 where ``person_age < 18``, else 0
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return ``self``; this transformer is stateless."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the engineered columns added.

        The input frame is not modified, so fitting or transforming does not
        leak the new columns back into the caller's data.
        """
        X = X.copy()
        age = X['person_age']
        X['percent_person_emp_length'] = X['person_emp_length'] / age
        X['percent_cb_person_cred_hist_length'] = X['cb_person_cred_hist_length'] / age
        X['loan_percent_income'] = X['loan_amnt'] / X['person_income']
        X['is_young'] = (age < 18).astype(int)
        return X

    def get_feature_names_out(self, input_features=None):
        """Return the column names produced by ``transform`` as an array.

        Parameters
        ----------
        input_features : sequence of str, optional
            Names of the input columns. When omitted, a default list of raw
            column names is assumed.
        """
        if input_features is None:
            input_features = [
                'person_age', 'person_income', 'person_emp_length', 'loan_amnt',
                'loan_int_rate', 'cb_person_cred_hist_length', 'loan_grade',
                'cb_person_default_on_file', 'person_home_ownership', 'loan_intent'
            ]
        names = list(input_features)
        engineered = [
            'percent_person_emp_length',
            'percent_cb_person_cred_hist_length',
            'loan_percent_income',
            'is_young',
        ]
        # An engineered column that already exists in the input is overwritten
        # in place by `transform`, not appended, so it must not be listed twice.
        new_names = [name for name in engineered if name not in names]
        return np.array(names + new_names)

In [32]:
# Separate the features from the `loan_status` target and hold out 20% of the
# rows, stratified on the target, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='loan_status'),
    train.loan_status,
    stratify=train.loan_status,
    random_state=42,
    test_size=0.2,
)

In [33]:
# Row labels of both splits.
train_index, test_index = X_train.index, X_test.index

# Full feature pipeline: engineered features -> column-wise preprocessing ->
# a final standardisation over every output column.
# NOTE(review): the numeric columns are already standardised inside
# `preprocessor`, so this last step mainly rescales the one-hot and
# passthrough columns; confirm that is intended.
transformer = Pipeline([
    ('feature_engineering', FeatureTransformer()),
    # ('low_threshold_drop', ThresholdTransformer(threshold=0.1, status_column_index=-1)),
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler()),
])

In [34]:
# Column labels of the raw training features.
import_features = X_train.columns

# Fit the pipeline, then read the output column names from its
# 'preprocessor' step.
feature_names = transformer.fit(X_train)['preprocessor'].get_feature_names_out()

# Wrap the transformed training data in a DataFrame that carries the original
# row labels and the preprocessor's column names.
transformed_train_df = pd.DataFrame(
    data=transformer.transform(X_train),
    columns=feature_names,
    index=train_index,
)

In [35]:
# Preview the first ten rows of the transformed training features.
transformed_train_df.head(10)

Unnamed: 0_level_0,num__person_age,num__person_income,num__person_emp_length,num__loan_amnt,num__loan_int_rate,num__loan_percent_income,num__cb_person_cred_hist_length,cat__loan_grade_A,cat__loan_grade_B,cat__loan_grade_C,...,cat__person_home_ownership_OTHER,cat__person_home_ownership_OWN,cat__person_home_ownership_RENT,cat__loan_intent_DEBTCONSOLIDATION,cat__loan_intent_EDUCATION,cat__loan_intent_HOMEIMPROVEMENT,cat__loan_intent_MEDICAL,cat__loan_intent_PERSONAL,cat__loan_intent_VENTURE,passthrough__is_young
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
29484,2.725815,0.310281,-0.434877,1.94319,0.820132,1.12794,2.515263,-0.748599,-0.729362,2.079711,...,-0.038099,-0.236698,0.959367,-0.427354,-0.51584,-0.346271,-0.479323,-0.45132,2.191839,0.0
50310,-0.755626,-0.830673,0.592685,-0.755766,2.752315,-0.144922,-0.946463,-0.748599,-0.729362,-0.480836,...,-0.038099,-0.236698,-1.042354,-0.427354,-0.51584,2.887912,-0.479323,-0.45132,-0.456238,0.0
51881,0.073289,0.730315,-0.948658,2.303051,-0.249646,0.893597,0.042602,-0.748599,1.371062,-0.480836,...,-0.038099,-0.236698,0.959367,-0.427354,1.938585,-0.346271,-0.479323,-0.45132,-0.456238,0.0
47114,-0.092494,-0.109754,-0.434877,1.043538,-1.039636,0.952183,0.042602,1.335829,-0.729362,-0.480836,...,-0.038099,-0.236698,-1.042354,-0.427354,-0.51584,2.887912,-0.479323,-0.45132,-0.456238,0.0
3780,-0.42406,0.030258,-1.205548,-0.575836,-0.137731,-0.710753,-0.451931,-0.748599,1.371062,-0.480836,...,-0.038099,-0.236698,-1.042354,-0.427354,-0.51584,-0.346271,-0.479323,2.215724,-0.456238,0.0
32718,-0.092494,-1.005828,-0.434877,-1.475488,-1.161426,-1.307556,0.289868,1.335829,-0.729362,-0.480836,...,-0.038099,-0.236698,0.959367,-0.427354,-0.51584,-0.346271,2.086274,-0.45132,-0.456238,0.0
17787,-0.092494,1.161551,-0.948658,2.662912,0.928755,0.717061,1.031666,-0.748599,-0.729362,2.079711,...,-0.038099,-0.236698,0.959367,2.339979,-0.51584,-0.346271,-0.479323,-0.45132,-0.456238,0.0
41074,-1.087192,-0.949823,-1.205548,-0.935697,-0.68414,-0.278119,-0.946463,1.335829,-0.729362,-0.480836,...,-0.038099,-0.236698,0.959367,-0.427354,1.938585,-0.346271,-0.479323,-0.45132,-0.456238,0.0
16302,0.073289,-0.075703,1.620247,1.043538,0.342846,0.899814,0.537134,-0.748599,1.371062,-0.480836,...,-0.038099,-0.236698,-1.042354,-0.427354,-0.51584,-0.346271,2.086274,-0.45132,-0.456238,0.0
45487,0.073289,0.730315,-0.691767,2.842842,0.678592,1.245112,0.042602,-0.748599,-0.729362,2.079711,...,-0.038099,-0.236698,0.959367,2.339979,-0.51584,-0.346271,-0.479323,-0.45132,-0.456238,0.0


In [36]:
# Estimator classes (not instances) to benchmark; each one is instantiated
# with its default arguments when it is evaluated.
regression_models = [
    # linear models
    LinearRegression, Ridge, Lasso, ElasticNet,
    BayesianRidge, SGDRegressor, HuberRegressor,
    # tree-based models and ensembles
    DecisionTreeRegressor, RandomForestRegressor, GradientBoostingRegressor,
    # support vector machine and naive Bayes
    SVR, GaussianNB,
]

In [37]:
# Number of CPU cores; used as the joblib worker count for the model runs.
num_cores = multiprocessing.cpu_count()
print(f'Number of cores: {num_cores}')

# Silence all warnings for the runs that follow.
warnings.filterwarnings('ignore')

def fit_and_evaluate_model(model, transformer, X_train, y_train, X_test, y_test):
    """Fit one model behind the shared transformer and collect its metrics.

    Parameters
    ----------
    model : type
        Estimator class; it is instantiated with its default arguments.
    transformer : transformer object
        Preprocessing step placed in front of the model in the pipeline.
    X_train, y_train
        Data the pipeline is fitted on.
    X_test, y_test
        Hold-out data used for ``pipeline.score`` and for the 5-fold
        cross-validated RMSE / MAE.

    Returns
    -------
    dict
        Keys ``'Model'``, ``'Mean RMSE'``, ``'Std RMSE'``, ``'Training Time'``
        (seconds, covering the fit and all scoring), ``'R2 Score'`` (the value
        of ``pipeline.score`` on the hold-out data) and ``'MAE'``. If anything
        fails, the failure is printed and every metric is ``None``, so one
        broken model does not abort the whole comparison.
    """
    try:
        start = time.time()
        reg = model()
        pipeline = make_pipeline(transformer, reg)
        pipeline.fit(X_train, y_train)

        # NOTE(review): cross-validation runs on the hold-out split, i.e. the
        # pipeline is re-fitted on folds of X_test and not on X_train;
        # confirm this is the intended protocol.
        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        neg_mse = cross_val_score(pipeline, X_test, y_test, scoring='neg_mean_squared_error', cv=cv)
        rmse_scores = np.sqrt(-neg_mse)  # per-fold root mean squared errors
        holdout_score = pipeline.score(X_test, y_test)
        mae_score = -cross_val_score(pipeline, X_test, y_test, scoring='neg_mean_absolute_error', cv=cv).mean()

        elapsed = time.time() - start
        return {'Model': model.__name__,
                'Mean RMSE': round(rmse_scores.mean(), 3),
                'Std RMSE': round(rmse_scores.std(), 3),
                # Keep millisecond precision; rounding a second time to whole
                # seconds would make fast models indistinguishable.
                'Training Time': round(elapsed, 3),
                'R2 Score': round(holdout_score, 3),
                'MAE': round(mae_score, 3)
                }

    except Exception as e:
        # Best-effort evaluation: report the failure (do not hide it) and
        # return a placeholder row so the remaining models still run.
        print(f'{model.__name__} failed: {e!r}')
        return {'Model': model.__name__,
                'Mean RMSE': None,
                'Std RMSE': None,
                'Training Time': None,
                'R2 Score': None,
                'MAE': None
                }

Number of cores: 8


In [38]:
# Evaluate every candidate model with joblib's Parallel (one task per model)
# and time the whole run.
start_time = time.time()
results = Parallel(n_jobs=num_cores)(
    delayed(fit_and_evaluate_model)(model, transformer, X_train, y_train, X_test, y_test)
    for model in regression_models
)
end_time = time.time()
print(f'Time taken: {end_time - start_time} seconds')

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of iterations to improve the convergence (max_iter=100).
You might also want to scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT

Increase the number of itera

Time taken: 189.61100482940674 seconds


In [44]:
# Local-time timestamp used to tag the report file name.
report_time_str = time.strftime("%Y-%m-%d_%H-%M-%S")

In [45]:
# Save the comparison table as CSV, sorted by ascending mean RMSE.
(
    pd.DataFrame(results)
    .sort_values(by='Mean RMSE')
    .to_csv(f'model_comparison_{report_time_str}.csv', header=True, index=False)
)

In [52]:
# Hyper-parameter search spaces, one entry per model class. Parameter names
# are prefixed with the lower-cased class name, which is the step name
# `make_pipeline` gives the model inside the tuning pipeline.
param_grid_list = [
    # {
    #     'model': RandomForestRegressor,
    #     'param_grid': {
    #         'randomforestregressor__n_estimators': [100, 200, 300],
    #         'randomforestregressor__max_depth': [None, 10, 20],
    #         'randomforestregressor__min_samples_split': [2, 5, 10],
    #         'randomforestregressor__min_samples_leaf': [1, 2, 4]
    #     }
    # },
    {
        'model': GradientBoostingRegressor,
        'param_grid': {
            'gradientboostingregressor__n_estimators': [100, 200],
            'gradientboostingregressor__learning_rate': [0.01, 0.1, 0.2],
            'gradientboostingregressor__max_depth': [3, 5, 7],
        },
    },
    {
        'model': SVR,
        'param_grid': {
            'svr__kernel': ['linear', 'rbf'],
            'svr__C': [0.1, 1, 10],
            'svr__gamma': ['scale', 'auto'],
        },
    },
    {
        'model': LinearRegression,
        'param_grid': {
            # `normalize` is deliberately not searched: LinearRegression no
            # longer accepts it in the scikit-learn versions this notebook
            # targets (it already relies on OneHotEncoder(sparse_output=...)),
            # so including it makes the search fail. The pipeline standardises
            # the features before the model anyway.
            'linearregression__fit_intercept': [True, False],
        },
    },
]

In [54]:
# Tune each model listed in `param_grid_list` with RandomizedSearchCV and
# record its best cross-validated RMSE.
# TODO: Run this on Kaggle session ( too hard to run locally )
tuned_results = []

for item in param_grid_list:
    model = item['model']
    param_grid = item['param_grid']

    print(f'Tuning {model.__name__}...')

    # Create a pipeline with the transformer and the model
    pipeline = make_pipeline(transformer, model())

    # Sample up to 5 parameter combinations, scored with 3-fold CV.
    search = RandomizedSearchCV(
        pipeline,
        param_grid,
        n_iter=5,
        cv=StratifiedKFold(n_splits=3),
        scoring='neg_mean_squared_error',
        n_jobs=-1,
        verbose=1,
    )

    search.fit(X_train, y_train)

    best_model = search.best_estimator_
    # `best_score_` is the mean *negated MSE* over the folds: flip the sign
    # and take the square root so the reported value really is an RMSE.
    best_score = np.sqrt(-search.best_score_)
    tuned_results.append({'Model': model.__name__, 'Best RMSE': round(best_score, 3)})

    print(f'Best parameters for {model.__name__}: {search.best_params_}')
    print(f'Best RMSE: {round(best_score, 3)}\n')

Tuning GradientBoostingRegressor...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


KeyboardInterrupt: 

In [None]:
# Display the collected tuning results.
tuned_results

In [40]:
# # Tune model using RandomizedSearchCV
# param_grid = {
#     'var_smoothing': np.logspace(0,-9, num=100),
# }
#
# cv_gnb = RandomizedSearchCV(GaussianNB(),
#                             param_grid,
#                             cv=StratifiedKFold(n_splits=5),
#                             scoring='accuracy',
#                             n_iter=10,
#                             n_jobs=-1,
#                             verbose=1)
#
# pipeline = make_pipeline(transformer, cv_gnb)
#
# pipeline.fit(X_train, y_train)
#
# best_model = pipeline.named_steps['randomizedsearchcv'].best_estimator_
# model_score = pipeline.score(X_test, y_test)
# cross_val = cross_val_score(pipeline, X_train, y_train, cv=3, scoring='accuracy')
#
# print('Model score:', model_score)
# print('Cross validation score:', cross_val) # Train Accuracy: 0.89, Test Accuracy: 0.9
# print('Mean:', cross_val.mean())            # Mean: 0.8862837694373447

In [41]:
# # let's make full fit on the whole training set
# full_pipeline = make_pipeline(transformer, best_model)
# full_pipeline.fit(train.drop(columns=['loan_status']), train['loan_status'])

In [42]:
# test_pred = full_pipeline.predict(test)

In [43]:
# # Save results (prediction + id )
# submission = pd.DataFrame(test_pred, index=test.index, columns=['loan_status'])
# submission.to_csv(submission_path, index=True)
# print('Prediction saved to submission.csv')