In [1]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

In [16]:
# Read the worksheet at position 1 (sheet positions are zero-based) of the
# loan-modelling workbook, then preview the last five rows.
loan_df = pd.read_excel(io='Bank_Personal_Loan_Modelling.xlsx', sheet_name=1)
loan_df.tail(5)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
4995,4996,29,3,40,92697,1,1.9,3,0,0,0,0,1,0
4996,4997,30,4,15,92037,4,0.4,1,85,0,0,0,1,0
4997,4998,63,39,24,93023,2,0.3,3,0,0,0,0,0,0
4998,4999,65,40,49,90034,3,0.5,2,0,0,0,0,1,0
4999,5000,28,4,83,92612,3,0.8,1,0,0,0,0,1,1


In [3]:
# Per-column count of missing values (isna is the same check as isnull).
loan_df.isna().sum()

ID                    0
Age                   0
Experience            0
Income                0
ZIP Code              0
Family                0
CCAvg                 0
Education             0
Mortgage              0
Personal Loan         0
Securities Account    0
CD Account            0
Online                0
CreditCard            0
dtype: int64

In [24]:
# Random forest used to rank the candidate predictors. oob_score=True asks
# for the out-of-bag score to be computed during fitting.
# random_state pins the bootstrap samples and per-split feature draws so the
# OOB score and importances are reproducible on Restart & Run All.
rf_model = RandomForestClassifier(
    n_estimators = 1000,
    max_features = 2,
    oob_score = True,
    random_state = 0,
)
features = loan_df.columns
# Every column except the target is treated as a candidate predictor.
# NOTE(review): this keeps 'ID', which looks like a row identifier rather
# than a customer attribute - confirm whether it should be ranked at all.
feature_names = features.drop(['Personal Loan'])
feature_names

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account', 'CD Account', 'Online',
       'CreditCard'],
      dtype='object')

In [26]:
# Fit the forest on all candidate predictors, then report the out-of-bag
# score followed by one "<feature> <importance>" line per predictor.
rf_model.fit(X = loan_df[feature_names], y = loan_df['Personal Loan'])
print("oob score: ", rf_model.oob_score_)
# The loop variable is deliberately not named `features`: that name already
# holds the column Index of loan_df and must not be overwritten by a string.
for feature_name, importance in zip(feature_names, rf_model.feature_importances_):
    print(feature_name, importance)

oob score:  0.9854
ID 0.043753737286148904
Age 0.03818852191910204
Experience 0.03781001144985986
Income 0.3190823499181029
ZIP Code 0.04122116823252703
Family 0.08424868285199041
CCAvg 0.1763249066145855
Education 0.1426093955395115
Mortgage 0.04177199098710535
Securities Account 0.005026136857255723
CD Account 0.05298458451563363
Online 0.007785518833648167
CreditCard 0.00919299499452891


From the feature importances above (these are importances, not OOB scores), we can infer that Income, CCAvg and Education are the most important features: each has an importance greater than 0.1, ahead of every other feature.

In [27]:
# Keep only the three predictors used for the decision tree.
# Selecting the columns directly preserves each column's dtype; building a
# frame from a list of Series and transposing it coerces the integer columns
# to float and does needless work.
X = loan_df[["Income", "CCAvg", "Education"]]
y = loan_df['Personal Loan']
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 0)


In [29]:
# Depth-limited decision tree on the selected predictors. random_state makes
# the fitted tree (and therefore the exported .dot file) reproducible.
tree_model = tree.DecisionTreeClassifier(max_depth = 6, random_state = 0)
tree_model.fit(X_train, y_train)
with open("BankLoanTree.dot", "w") as dot_file:
    # export_graphviz writes into the handle; its return value is not bound
    # to anything so the open file handle is never shadowed.
    # Labels are taken from the training columns so they cannot drift out of
    # sync with the data the tree was actually fitted on.
    tree.export_graphviz(tree_model, feature_names = list(X_train.columns), out_file = dot_file)
# Mean accuracy on the TRAINING data - an optimistic figure, not a test score.
tree_model.score(X_train, y_train)

0.97675

In [30]:
# Predict the held-out rows and save the predictions as a one-column CSV
# (rows are numbered from 0; the original row labels are not carried over).
test_preds = tree_model.predict(X_test)
Predict_Output = pd.Series(test_preds, name="Prediction").to_frame()
Predict_Output.to_csv("BankLoanPredicition.csv")

In [39]:
import math

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score

# R^2 and RMSE are regression metrics. They are still computed and printed
# first, unchanged, for backward compatibility, but they must not be read as
# accuracy: with 0/1 labels, rmse ** 2 is simply the misclassification rate.
r2 = r2_score(y_test, test_preds)
print(r2)
rmse = math.sqrt(mean_squared_error(y_test, test_preds))
print(rmse)

# Classification metrics: the fraction of correctly classified test rows and
# the confusion matrix (rows = true class, columns = predicted class).
accuracy = accuracy_score(y_test, test_preds)
print("accuracy: ", accuracy)
print(confusion_matrix(y_test, test_preds))

0.6092796092796093
0.17888543819998318


In [None]:
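# Inference: the R^2 of about 0.61 is a regression metric, not an accuracy of 61%.
# Because the target is binary (0/1), the squared RMSE is the misclassification
# rate: 0.1789 ** 2 = 0.032, so the tree classifies about 96.8% of the test rows correctly.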