# Loan Approval Prediction

Welcome to the 2024 Kaggle Playground Series! We plan to continue in the spirit of previous playgrounds, providing interesting an approachable datasets for our community to practice their machine learning skills, and anticipate a competition each month.

Your Goal: The goal for this competition is to predict whether an applicant is approved for a loan.
Current Score: 0.81795
Current rank: 3514/3859 (top 91%)

### Columns investigation

- id - identify number of row
- person_age - age 
- person_income - amount money that person have ( the higher income person have than lover chance to loan ? )
- person_home_ownership - type of ownership (OWN, MORTGAGE, RENT, OTHER)
- person_emp_length - how long person work in years (new feature percent_person_emp_length = person_emp_length/person_age)
- loan_intent - purpose of loan (PERSONAL, EDUCATION, MEDICAL, VENTURE, HOME, AUTO)
- loan_grade - grade of loan (A, B, C, D, E, F, G) A - the best, G - the worst
- loan_amnt - amount of loan
- loan_int_rate - interest rate of loan (the higher interest rate the higher chance to loan ?)
- loan_percent_income - percent of income that person want to loan (new feature loan_percent_income = loan_amnt/person_income)
- cb_person_default_on_file - if person have default on file (Y/N)
- cb_person_cred_hist_length - credit history length (new feature percent_cb_person_cred_hist_length = cb_person_cred_hist_length/person_age)
- loan_status - target variable (1 - loan approved, 0 - loan not approved)

In [1]:
import numpy as np
# Import the necessary libraries
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer, MinMaxScaler, StandardScaler
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score


In [2]:
# load the dataset
test = pd.read_csv('data/loan/test.csv', index_col='id')
train = pd.read_csv('data/loan/train.csv', index_col='id')

In [4]:
# training set contains one row with age = 123 (it's an outlier), need to remove it
train = train[train['person_age'] < 100]

In [5]:
# Check correlation for the target variable (loan_status)
train_copy = train.copy()

# # Person Age, cb_person_cred_hist_length and loan_intent have low correlation with loan_status (we can drop them)
# columns_to_drop = ['person_age', 'cb_person_cred_hist_length', 'loan_intent']
# train_copy.drop(columns=columns_to_drop, inplace=True)

# columns loan_grade can be encode to numbers
# columns cb_person_default_on_file can be encode to numbers
# person_home_ownership can be encode to one-hot encoding

train_copy['loan_grade'] = train_copy['loan_grade'].astype('category').cat.codes
train_copy['cb_person_default_on_file'] = train_copy['cb_person_default_on_file'].astype('category').cat.codes
train_copy = pd.get_dummies(train_copy, columns=['person_home_ownership', 'loan_intent'])

train_copy.head(10)

Unnamed: 0_level_0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
0,37,35000,0.0,1,6000,11.49,0.17,0,14,0,False,False,False,True,False,True,False,False,False,False
1,22,56000,6.0,2,4000,13.35,0.07,0,2,0,False,False,True,False,False,False,False,True,False,False
2,29,28800,8.0,0,6000,8.9,0.21,0,10,0,False,False,True,False,False,False,False,False,True,False
3,30,70000,14.0,1,12000,11.11,0.17,0,5,0,False,False,False,True,False,False,False,False,False,True
4,22,60000,2.0,0,6000,6.92,0.1,0,3,0,False,False,False,True,False,False,False,True,False,False
5,27,45000,2.0,0,9000,8.94,0.2,0,5,0,False,False,False,True,False,False,False,False,False,True
6,25,45000,9.0,0,12000,6.54,0.27,0,3,0,True,False,False,False,False,True,False,False,False,False
7,21,20000,0.0,2,2500,13.49,0.13,1,3,0,False,False,False,True,False,False,False,False,True,False
8,37,69600,11.0,3,5000,14.84,0.07,1,11,0,False,False,False,True,False,True,False,False,False,False
9,35,110000,0.0,2,15000,12.98,0.14,1,6,0,True,False,False,False,True,False,False,False,False,False


In [6]:
# Feature Engineering
train_copy['percent_person_emp_length'] = train_copy['person_emp_length'] / train_copy['person_age']
train_copy['percent_cb_person_cred_hist_length'] = train_copy['cb_person_cred_hist_length'] / train_copy['person_age']
train_copy['loan_percent_income'] = train_copy['loan_amnt'] / train_copy['person_income']

In [7]:
train_copy.head(10)

Unnamed: 0_level_0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,...,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE,percent_person_emp_length,percent_cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,37,35000,0.0,1,6000,11.49,0.171429,0,14,0,...,False,True,False,True,False,False,False,False,0.0,0.378378
1,22,56000,6.0,2,4000,13.35,0.071429,0,2,0,...,True,False,False,False,False,True,False,False,0.272727,0.090909
2,29,28800,8.0,0,6000,8.9,0.208333,0,10,0,...,True,False,False,False,False,False,True,False,0.275862,0.344828
3,30,70000,14.0,1,12000,11.11,0.171429,0,5,0,...,False,True,False,False,False,False,False,True,0.466667,0.166667
4,22,60000,2.0,0,6000,6.92,0.1,0,3,0,...,False,True,False,False,False,True,False,False,0.090909,0.136364
5,27,45000,2.0,0,9000,8.94,0.2,0,5,0,...,False,True,False,False,False,False,False,True,0.074074,0.185185
6,25,45000,9.0,0,12000,6.54,0.266667,0,3,0,...,False,False,False,True,False,False,False,False,0.36,0.12
7,21,20000,0.0,2,2500,13.49,0.125,1,3,0,...,False,True,False,False,False,False,True,False,0.0,0.142857
8,37,69600,11.0,3,5000,14.84,0.071839,1,11,0,...,False,True,False,True,False,False,False,False,0.297297,0.297297
9,35,110000,0.0,2,15000,12.98,0.136364,1,6,0,...,False,False,True,False,False,False,False,False,0.0,0.171429


In [8]:
train_copy.corr()['loan_status'].sort_values(ascending=False)

loan_status                           1.000000
loan_grade                            0.385908
loan_percent_income                   0.375611
loan_int_rate                         0.338948
person_home_ownership_RENT            0.239616
cb_person_default_on_file             0.186959
loan_amnt                             0.144980
loan_intent_DEBTCONSOLIDATION         0.057679
loan_intent_MEDICAL                   0.049136
loan_intent_HOMEIMPROVEMENT           0.031060
person_home_ownership_OTHER           0.002918
person_age                           -0.001022
cb_person_cred_hist_length           -0.003033
percent_cb_person_cred_hist_length   -0.008737
loan_intent_PERSONAL                 -0.012446
loan_intent_EDUCATION                -0.051009
loan_intent_VENTURE                  -0.064381
person_home_ownership_OWN            -0.087558
person_emp_length                    -0.100425
percent_person_emp_length            -0.104626
person_income                        -0.169962
person_home_o

In [9]:
# Drop columns with low correlation by step in Pipeline using Custom Transformer by provide the threshold
# Also we can use a custom transformer to create new features
# Also we can check a low threshold value by using SearchGridCV

In [10]:
# Create Custom Transformer

from sklearn.base import BaseEstimator, TransformerMixin

class LowThresholdDrop(BaseEstimator, TransformerMixin):
    
    def __init__(self, threshold=0.1, status_column_index=-1):
        self.threshold = threshold
        self.status_column_index = status_column_index
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        corr = X.corr()[X.columns[self.status_column_index]]
        columns_to_drop = corr[corr.abs() < self.threshold].index
        X.drop(columns=columns_to_drop, inplace=True)
        X.drop(columns=self.status_column_index, inplace=True)
        return X
    
class FeatureEngineering(BaseEstimator, TransformerMixin):
    
    def __init__(self):
        pass
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X['percent_person_emp_length'] = X['person_emp_length'] / X['person_age']
        X['percent_cb_person_cred_hist_length'] = X['cb_person_cred_hist_length'] / X['person_age']
        X['loan_percent_income'] = X['loan_amnt'] / X['person_income']
        return X

In [11]:
feature_engineering = FeatureEngineering()

feature_engineering.fit_transform(train)
feature_engineering.transform(test)

Unnamed: 0_level_0,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,percent_person_emp_length,percent_cb_person_cred_hist_length
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
58645,23,69000,RENT,3.0,HOMEIMPROVEMENT,F,25000,15.76,0.362319,N,2,0.130435,0.086957
58646,26,96000,MORTGAGE,6.0,PERSONAL,C,10000,12.68,0.104167,Y,4,0.230769,0.153846
58647,26,30000,RENT,5.0,VENTURE,E,4000,17.19,0.133333,Y,2,0.192308,0.076923
58648,33,50000,RENT,4.0,DEBTCONSOLIDATION,A,7000,8.90,0.140000,N,7,0.121212,0.212121
58649,26,102000,MORTGAGE,8.0,HOMEIMPROVEMENT,D,15000,16.32,0.147059,Y,4,0.307692,0.153846
...,...,...,...,...,...,...,...,...,...,...,...,...,...
97738,22,31200,MORTGAGE,2.0,DEBTCONSOLIDATION,B,3000,10.37,0.096154,N,4,0.090909,0.181818
97739,22,48000,MORTGAGE,6.0,EDUCATION,A,7000,6.03,0.145833,N,3,0.272727,0.136364
97740,51,60000,MORTGAGE,0.0,PERSONAL,A,15000,7.51,0.250000,N,25,0.000000,0.490196
97741,22,36000,MORTGAGE,4.0,PERSONAL,D,14000,15.62,0.388889,Y,4,0.181818,0.181818


In [12]:
train = pd.get_dummies(train, columns=['person_home_ownership', 'loan_intent'])
test = pd.get_dummies(test, columns=['person_home_ownership', 'loan_intent'])

In [13]:
train['loan_grade'] = train['loan_grade'].astype('category').cat.codes
train['cb_person_default_on_file'] = train['cb_person_default_on_file'].astype('category').cat.codes

In [14]:
test['loan_grade'] = test['loan_grade'].astype('category').cat.codes
test['cb_person_default_on_file'] = test['cb_person_default_on_file'].astype('category').cat.codes

In [15]:
train_copy.corr()['loan_status'].sort_values(ascending=False)

loan_status                           1.000000
loan_grade                            0.385908
loan_percent_income                   0.375611
loan_int_rate                         0.338948
person_home_ownership_RENT            0.239616
cb_person_default_on_file             0.186959
loan_amnt                             0.144980
loan_intent_DEBTCONSOLIDATION         0.057679
loan_intent_MEDICAL                   0.049136
loan_intent_HOMEIMPROVEMENT           0.031060
person_home_ownership_OTHER           0.002918
person_age                           -0.001022
cb_person_cred_hist_length           -0.003033
percent_cb_person_cred_hist_length   -0.008737
loan_intent_PERSONAL                 -0.012446
loan_intent_EDUCATION                -0.051009
loan_intent_VENTURE                  -0.064381
person_home_ownership_OWN            -0.087558
person_emp_length                    -0.100425
percent_person_emp_length            -0.104626
person_income                        -0.169962
person_home_o

In [16]:
train.head(10)

Unnamed: 0_level_0,person_age,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status,...,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,person_home_ownership_RENT,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,37,35000,0.0,1,6000,11.49,0.171429,0,14,0,...,False,False,False,True,False,True,False,False,False,False
1,22,56000,6.0,2,4000,13.35,0.071429,0,2,0,...,False,False,True,False,False,False,False,True,False,False
2,29,28800,8.0,0,6000,8.9,0.208333,0,10,0,...,False,False,True,False,False,False,False,False,True,False
3,30,70000,14.0,1,12000,11.11,0.171429,0,5,0,...,False,False,False,True,False,False,False,False,False,True
4,22,60000,2.0,0,6000,6.92,0.1,0,3,0,...,False,False,False,True,False,False,False,True,False,False
5,27,45000,2.0,0,9000,8.94,0.2,0,5,0,...,False,False,False,True,False,False,False,False,False,True
6,25,45000,9.0,0,12000,6.54,0.266667,0,3,0,...,True,False,False,False,False,True,False,False,False,False
7,21,20000,0.0,2,2500,13.49,0.125,1,3,0,...,False,False,False,True,False,False,False,False,True,False
8,37,69600,11.0,3,5000,14.84,0.071839,1,11,0,...,False,False,False,True,False,True,False,False,False,False
9,35,110000,0.0,2,15000,12.98,0.136364,1,6,0,...,True,False,False,False,True,False,False,False,False,False


In [17]:
corr_matrix = train.corr()
correlated = corr_matrix['loan_status'].sort_values(ascending=False)

columns_to_drop = correlated[correlated.abs() < 0.1].index
print(columns_to_drop)

train.drop(columns=columns_to_drop, inplace=True)

Index(['loan_intent_DEBTCONSOLIDATION', 'loan_intent_MEDICAL',
       'loan_intent_HOMEIMPROVEMENT', 'person_home_ownership_OTHER',
       'person_age', 'cb_person_cred_hist_length',
       'percent_cb_person_cred_hist_length', 'loan_intent_PERSONAL',
       'loan_intent_EDUCATION', 'loan_intent_VENTURE',
       'person_home_ownership_OWN'],
      dtype='object')


In [18]:
train.head(10)

Unnamed: 0_level_0,person_income,person_emp_length,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,loan_status,percent_person_emp_length,person_home_ownership_MORTGAGE,person_home_ownership_RENT
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,35000,0.0,1,6000,11.49,0.171429,0,0,0.0,False,True
1,56000,6.0,2,4000,13.35,0.071429,0,0,0.272727,False,False
2,28800,8.0,0,6000,8.9,0.208333,0,0,0.275862,False,False
3,70000,14.0,1,12000,11.11,0.171429,0,0,0.466667,False,True
4,60000,2.0,0,6000,6.92,0.1,0,0,0.090909,False,True
5,45000,2.0,0,9000,8.94,0.2,0,0,0.074074,False,True
6,45000,9.0,0,12000,6.54,0.266667,0,0,0.36,True,False
7,20000,0.0,2,2500,13.49,0.125,1,0,0.0,False,True
8,69600,11.0,3,5000,14.84,0.071839,1,0,0.297297,False,True
9,110000,0.0,2,15000,12.98,0.136364,1,0,0.0,True,False


In [19]:
# split data 
X = train.drop('loan_status', axis=1)
y = train['loan_status']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
def test_vs_train_score(model, X_train, y_train, X_test, y_test):
    # predict on train test
    index_to_select = np.random.randint(0, X_train.shape[0], 100)
    some_test_data = X_train.iloc[index_to_select]
    some_test_labels = y_train.iloc[index_to_select]
    test_predicted_10 = model.predict(some_test_data)
    threshold = 0.5
    y_pred_binary = (np.abs(test_predicted_10) >= threshold).astype(int)
    accuracy = accuracy_score(y_pred_binary, some_test_labels)
    print('Train Accuracy:', accuracy) 
    
    # predict on test set
    index_to_select = np.random.randint(0, X_test.shape[0], 10)
    some_test_data = X_test.iloc[index_to_select]
    some_test_labels = y_test.iloc[index_to_select]
    test_predicted = model.predict(some_test_data)
    threshold = 0.5
    y_pred_binary = (np.abs(test_predicted) >= threshold).astype(int)
    accuracy = accuracy_score(y_pred_binary, some_test_labels)
    print('Test Accuracy:', accuracy)
        

In [26]:
columns_with_low_corr = ['loan_intent_DEBTCONSOLIDATION', 'loan_intent_MEDICAL',
       'loan_intent_HOMEIMPROVEMENT', 'person_home_ownership_OTHER',
       'person_age', 'cb_person_cred_hist_length',
       'percent_cb_person_cred_hist_length', 'loan_intent_PERSONAL',
       'loan_intent_EDUCATION', 'loan_intent_VENTURE',
       'person_home_ownership_OWN']

test.drop(columns=columns_with_low_corr, inplace=True)

In [31]:
# Use Naive Bayes
from sklearn.naive_bayes import GaussianNB

naive_bayes = GaussianNB()

naive_bayes.fit(X_train, y_train)

test_vs_train_score(naive_bayes, X_train, y_train, X_test, y_test)

cross_score = cross_val_score(naive_bayes, X_train, y_train, cv=3, scoring='accuracy')

print('Cross validation score:', cross_score)
print('Mean:', cross_score.mean())

Train Accuracy: 0.85
Test Accuracy: 1.0
Cross validation score: [0.88215359 0.88879652 0.8852155 ]
Mean: 0.8853885374602383


In [32]:
# Tune model using RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    'var_smoothing': np.logspace(0,-9, num=100),
}

random_search = RandomizedSearchCV(naive_bayes, param_grid, cv=3, scoring='accuracy', n_iter=100)

random_search.fit(X_train, y_train)

print(random_search.best_params_)

test_vs_train_score(random_search, X_train, y_train, X_test, y_test)
cross_val = cross_val_score(random_search, X_train, y_train, cv=3, scoring='accuracy')

print('Cross validation score:', cross_val)
print('Mean:', cross_val.mean())

{'var_smoothing': 1.2328467394420635e-09}
Train Accuracy: 0.89
Test Accuracy: 0.9
Cross validation score: [0.88324062 0.88956388 0.88604681]
Mean: 0.8862837694373447


In [None]:
model = random_search.best_estimator_

In [28]:
test_pred = model.predict(test)

In [29]:
# Save results (prediction + id )
submission = pd.DataFrame(test_pred, index=test.index, columns=['loan_status'])
submission.to_csv('data/loan/submission.csv', index=True)
print('Prediction saved to submission.csv')