# Loan Approval Prediction

In [30]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score, confusion_matrix, classification_report

In [31]:
# Load dataset
data = pd.read_csv('data/train.csv')

## EDA

In [32]:
data.head()

Unnamed: 0,id,person_age,person_income,person_home_ownership,person_emp_length,loan_intent,loan_grade,loan_amnt,loan_int_rate,loan_percent_income,cb_person_default_on_file,cb_person_cred_hist_length,loan_status
0,0,37,35000,RENT,0.0,EDUCATION,B,6000,11.49,0.17,N,14,0
1,1,22,56000,OWN,6.0,MEDICAL,C,4000,13.35,0.07,N,2,0
2,2,29,28800,OWN,8.0,PERSONAL,A,6000,8.9,0.21,N,10,0
3,3,30,70000,RENT,14.0,VENTURE,B,12000,11.11,0.17,N,5,0
4,4,22,60000,RENT,2.0,MEDICAL,A,6000,6.92,0.1,N,3,0


In [33]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58645 entries, 0 to 58644
Data columns (total 13 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   id                          58645 non-null  int64  
 1   person_age                  58645 non-null  int64  
 2   person_income               58645 non-null  int64  
 3   person_home_ownership       58645 non-null  object 
 4   person_emp_length           58645 non-null  float64
 5   loan_intent                 58645 non-null  object 
 6   loan_grade                  58645 non-null  object 
 7   loan_amnt                   58645 non-null  int64  
 8   loan_int_rate               58645 non-null  float64
 9   loan_percent_income         58645 non-null  float64
 10  cb_person_default_on_file   58645 non-null  object 
 11  cb_person_cred_hist_length  58645 non-null  int64  
 12  loan_status                 58645 non-null  int64  
dtypes: float64(3), int64(6), object

In [34]:
data.isnull().sum()
# No null values

id                            0
person_age                    0
person_income                 0
person_home_ownership         0
person_emp_length             0
loan_intent                   0
loan_grade                    0
loan_amnt                     0
loan_int_rate                 0
loan_percent_income           0
cb_person_default_on_file     0
cb_person_cred_hist_length    0
loan_status                   0
dtype: int64

## Data preparation

In [35]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

In [60]:
X = data.drop(columns=['loan_status'])
y = data['loan_status']

### Categorical features

In [46]:
categorical_columns = X.select_dtypes(include=['object']).columns

X_categorical = X[categorical_columns]

hot_encoder = OneHotEncoder()
X_categorical = hot_encoder.fit_transform(X_categorical)

X_categorical = pd.DataFrame(X_categorical.toarray(), columns=hot_encoder.get_feature_names_out(categorical_columns))

### Numerical features

In [54]:
numerical_columns = X.select_dtypes(include=['int64', 'float64']).columns
numerical_columns = numerical_columns.drop('id')
X_numerical = X[numerical_columns]

scaler = StandardScaler()
X_numerical = scaler.fit_transform(X_numerical)

X_numerical = pd.DataFrame(X_numerical, columns=numerical_columns)

### Regroup

In [87]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer(
	transformers=[
		('num', scaler, numerical_columns),
		('cat', hot_encoder, categorical_columns)
	]
)
preprocessor.fit(X)
X_prep_ColTransf = preprocessor.transform(X)

In [55]:
X_prep = pd.concat([X_numerical, X_categorical], axis=1)
X_prep.head()

Unnamed: 0,person_age,person_income,person_emp_length,loan_amnt,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,person_home_ownership_MORTGAGE,person_home_ownership_OTHER,person_home_ownership_OWN,...,loan_intent_VENTURE,loan_grade_A,loan_grade_B,loan_grade_C,loan_grade_D,loan_grade_E,loan_grade_F,loan_grade_G,cb_person_default_on_file_N,cb_person_default_on_file_Y
0,1.5662,-0.765768,-1.1872,-0.578306,0.267616,0.117378,2.031798,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,-0.920057,-0.212128,0.328047,-0.937775,0.880532,-0.973242,-0.946489,0.0,0.0,1.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.240196,-0.929223,0.83313,-0.578306,-0.585854,0.553626,1.039036,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
3,0.405947,0.156966,2.348377,0.500101,0.142396,0.117378,-0.201917,0.0,0.0,0.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
4,-0.920057,-0.106673,-0.682117,-0.578306,-1.238314,-0.646056,-0.698298,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


# Model

In [64]:
X_train, X_val, y_train, y_val = train_test_split(X_prep, y, test_size=0.2, random_state=42)

## Random Forest

In [59]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier()
rf_2 = RandomForestClassifier(n_estimators=200,
							  max_depth=None, 
							  min_samples_leaf=1,
							  min_samples_split=5, 
							  random_state=42)


In [63]:
rf.fit(X_train, y_train)

In [69]:
rf_2.fit(X_train, y_train)

In [65]:
y_pred_val = rf.predict(X_val)

In [70]:
y_pred_val_2 = rf_2.predict(X_val)

In [73]:
from sklearn.metrics import recall_score

print(f'Model 1')
print('Accuracy:', accuracy_score(y_val, y_pred_val))
print('Recall:', recall_score(y_val, y_pred_val))
print('ROC AUC:', roc_auc_score(y_val, y_pred_val))
print('Confusion Matrix:')
print(confusion_matrix(y_val, y_pred_val))

Model 1
Accuracy: 0.9530224230539688
Recall: 0.7253349573690622
ROC AUC: 0.8577106034986482
Confusion Matrix:
[[9987  100]
 [ 451 1191]]


In [75]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score

param_grid = {
	'n_estimators': [100, 200, 300],
	'max_depth': [None, 10, 30],
	'min_samples_split': [2, 5, 10],
	'min_samples_leaf': [1, 2, 4]
}

rf = RandomForestClassifier(random_state=42)

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, n_jobs=-1, verbose=3, scoring='roc_auc', return_train_score=True, pre_dispatch=4)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


In [78]:
best_params = grid_search.best_params_
best_estimator = grid_search.best_estimator_
print(best_params)

{'max_depth': 30, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 300}


In [79]:
grid_search.best_score_

0.9350260541830865

In [80]:
best_estimator.fit(X_train, y_train)

In [83]:
y_val_pred = best_estimator.predict(X_val)
# accuracy_score(y_val, y_val_pred)
roc_auc_score(y_val, y_val_pred)

0.8544035043718915

In [88]:
X_t, X_v, y_t, y_v = train_test_split(X_prep_ColTransf, y, test_size=0.2, random_state=42)

# Predictions

In [92]:
X_test = pd.read_csv('data/test.csv')

In [93]:
X_processed = preprocessor.transform(X_test)

In [94]:
y_preds = best_estimator.predict(X_processed)

In [95]:
from pathlib import Path

submission_folder = Path('submissions')
submission_folder.mkdir(exist_ok=True)

submission = pd.DataFrame({'id': X_test['id'], 'loan_status': y_preds})
submission.to_csv(submission_folder / 'RF_submission_1.csv', index=False)