# Importing Libraries

In [32]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report
import pickle

import warnings
warnings.filterwarnings('ignore')

# Loading Data

In [2]:
bankloans = pd.read_csv("bankloans.csv")
print(bankloans.shape)
print(bankloans.columns)
bankloans.info()
bankloans.head()

(850, 9)
Index(['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt',
       'othdebt', 'default'],
      dtype='object')
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 850 entries, 0 to 849
Data columns (total 9 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       850 non-null    int64  
 1   ed        850 non-null    int64  
 2   employ    850 non-null    int64  
 3   address   850 non-null    int64  
 4   income    850 non-null    int64  
 5   debtinc   850 non-null    float64
 6   creddebt  850 non-null    float64
 7   othdebt   850 non-null    float64
 8   default   700 non-null    float64
dtypes: float64(4), int64(5)
memory usage: 59.9 KB


Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41,3,17,12,176,9.3,11.359392,5.008608,1.0
1,27,1,10,6,31,17.3,1.362202,4.000798,0.0
2,40,1,15,14,55,5.5,0.856075,2.168925,0.0
3,41,1,15,14,120,2.9,2.65872,0.82128,0.0
4,24,2,2,0,28,17.3,1.787436,3.056564,1.0


## Checking NULL values

In [3]:
bankloans.isnull().sum()

age           0
ed            0
employ        0
address       0
income        0
debtinc       0
creddebt      0
othdebt       0
default     150
dtype: int64

In [4]:
people_with_info = bankloans[bankloans.default.isnull() == False]
people_without_info = bankloans[bankloans.default.isnull() == True]
print(f"Dims of people_with_info: {people_with_info.shape}\n\
Dims of people_without_info: {people_without_info.shape}")

Dims of people_with_info: (700, 9)
Dims of people_without_info: (150, 9)


In [5]:
pp.ProfileReport(bankloans)

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



## Outliers Treatement

In [6]:
pd.__version__

'1.4.2'

In [7]:
def outlier_capping(x):
    x = x.clip(upper = x.quantile(0.99), lower = x.quantile(0.01))
    return x

people_with_info = people_with_info.apply(lambda x: outlier_capping(x))
people_with_info

Unnamed: 0,age,ed,employ,address,income,debtinc,creddebt,othdebt,default
0,41.0,3.0,17.0,12.0,176.0,9.3,9.891428,5.008608,1.0
1,27.0,1.0,10.0,6.0,31.0,17.3,1.362202,4.000798,0.0
2,40.0,1.0,15.0,14.0,55.0,5.5,0.856075,2.168925,0.0
3,41.0,1.0,15.0,14.0,120.0,2.9,2.658720,0.821280,0.0
4,24.0,2.0,2.0,0.0,28.0,17.3,1.787436,3.056564,1.0
...,...,...,...,...,...,...,...,...,...
695,36.0,2.0,6.0,15.0,27.0,4.6,0.262062,0.979938,1.0
696,29.0,2.0,6.0,4.0,21.0,11.5,0.369495,2.045505,0.0
697,33.0,1.0,15.0,3.0,32.0,7.6,0.491264,1.940736,0.0
698,45.0,1.0,19.0,22.0,77.0,8.4,2.302608,4.165392,0.0


In [8]:
x_vars = [x for x in people_with_info.columns if x != "default"]
x_vars

['age', 'ed', 'employ', 'address', 'income', 'debtinc', 'creddebt', 'othdebt']

In [9]:
x = people_with_info[x_vars]
y = people_with_info['default'].values

In [10]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, \
                                                   random_state=42)

In [11]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)

In [12]:
rfe_model = rfe.fit(x_train, y_train)

In [13]:
rfe_model.get_support()

array([False, False,  True, False,  True,  True,  True,  True])

In [14]:
x_train.columns[rfe_model.get_support()]

Index(['employ', 'income', 'debtinc', 'creddebt', 'othdebt'], dtype='object')

In [15]:
params = {
    "n_estimators": [10, 20, 40, 80, 100],
    "max_depth": [1, 2, 3]
    
}
rf_estimator = RandomForestClassifier(n_jobs=-1, oob_score=True, \
                                      random_state=1024)

In [16]:
cv_rf = GridSearchCV(rf_estimator, params, cv=5)
cv_rf.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_jobs=-1, oob_score=True,
                                              random_state=1024),
             param_grid={'max_depth': [1, 2, 3],
                         'n_estimators': [10, 20, 40, 80, 100]})

In [17]:
cv_rf.best_estimator_

RandomForestClassifier(max_depth=3, n_estimators=10, n_jobs=-1, oob_score=True,
                       random_state=1024)

In [18]:
print(classification_report(y_train, cv_rf.best_estimator_.predict(x_train)))

              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88       344
         1.0       0.76      0.38      0.50       125

    accuracy                           0.80       469
   macro avg       0.78      0.67      0.69       469
weighted avg       0.79      0.80      0.78       469



In [19]:
print(classification_report(y_test, cv_rf.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.79      0.95      0.86       173
         1.0       0.62      0.22      0.33        58

    accuracy                           0.77       231
   macro avg       0.70      0.59      0.60       231
weighted avg       0.74      0.77      0.73       231



# Improving the Model

In [20]:
not_x = list(x_train.columns[np.logical_not(rfe_model.get_support())])\
        + ["default"]

In [21]:
x_vars2 = list(set(people_with_info.columns) - set(not_x))
x_vars2

['income', 'debtinc', 'othdebt', 'employ', 'creddebt']

In [22]:
x = people_with_info[x_vars2]
y = people_with_info['default'].values

In [23]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.33, \
                                                   random_state=42)

In [24]:
rfe = RFE(estimator=RandomForestClassifier(), n_features_to_select=5)

In [25]:
rfe_model = rfe.fit(x_train, y_train)

In [26]:
rfe_model.get_support()

array([ True,  True,  True,  True,  True])

In [27]:
params = {
    "n_estimators": [10, 20, 40, 80, 100],
    "max_depth": [1, 2, 3]
    
}
rf_estimator = RandomForestClassifier(n_jobs=-1, oob_score=True, \
                                      random_state=1024)

In [28]:
cv_rf = GridSearchCV(rf_estimator, params, cv=5)
cv_rf.fit(x_train, y_train)

GridSearchCV(cv=5,
             estimator=RandomForestClassifier(n_jobs=-1, oob_score=True,
                                              random_state=1024),
             param_grid={'max_depth': [1, 2, 3],
                         'n_estimators': [10, 20, 40, 80, 100]})

In [29]:
cv_rf.best_estimator_

RandomForestClassifier(max_depth=2, n_estimators=10, n_jobs=-1, oob_score=True,
                       random_state=1024)

In [30]:
print(classification_report(y_train, cv_rf.best_estimator_.predict(x_train)))

              precision    recall  f1-score   support

         0.0       0.80      0.96      0.87       344
         1.0       0.75      0.36      0.49       125

    accuracy                           0.80       469
   macro avg       0.78      0.66      0.68       469
weighted avg       0.79      0.80      0.77       469



In [31]:
print(classification_report(y_test, cv_rf.best_estimator_.predict(x_test)))

              precision    recall  f1-score   support

         0.0       0.81      0.96      0.88       173
         1.0       0.73      0.33      0.45        58

    accuracy                           0.80       231
   macro avg       0.77      0.64      0.67       231
weighted avg       0.79      0.80      0.77       231



# Saving the model

In [34]:
filename = "RF_model_Banking_Application.pkl"
pickle.dump(cv_rf.best_estimator_, open(filename, "wb"))

# Deploying model

In [35]:
retrived_model = pickle.load(open(filename, "rb"))

In [38]:
retrived_model.predict(x_train)

array([0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 1., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 1., 1., 0., 0., 0., 0., 0., 1.,
       0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0.,
       0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 1., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0.,
       0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
       0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 1., 0., 0.,
       1., 0., 0., 0., 0.

In [39]:
x_train.columns

Index(['income', 'debtinc', 'othdebt', 'employ', 'creddebt'], dtype='object')