In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.externals.six import StringIO  
from IPython.display import Image  
from sklearn.tree import export_graphviz
import pydotplus



### Load the dataset

In [41]:
df = pd.read_csv("bank.csv")

### Outlier removal amd Data Optimization

In [42]:
df.drop(["contact"],axis = 1,inplace=True)

In [43]:
df.drop(df.age[df.age > 74].index, inplace = True)

### Creating features and target variables

In [44]:
X = df[['age', 'job', 'marital', 'education', 'deposit', 'balance', 'housing',
       'loan', 'day', 'month', 'duration', 'campaign', 'pdays', 'previous',
       'poutcome']]

In [45]:
y = df[['default']]

In [46]:
le = LabelEncoder() 
y['default']= le.fit_transform(y['default'])
X['housing']= le.fit_transform(X['housing'])
X['loan']= le.fit_transform(X['loan'])
X['deposit']= le.fit_transform(X['deposit'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_ind

### Dummy Encoding

In [47]:
cat_columns = ["job","marital","education","day","month","poutcome"]

In [48]:
X = pd.get_dummies(X, prefix_sep="__",columns=cat_columns)

In [49]:
cat_dummies = [col for col in X 
               if "__" in col 
               and col.split("__")[0] in cat_columns]

In [50]:
processed_columns = list(X.columns[:])

In [51]:
X.head(6)

Unnamed: 0,age,deposit,balance,housing,loan,duration,campaign,pdays,previous,job__admin.,...,month__jun,month__mar,month__may,month__nov,month__oct,month__sep,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown
0,59,1,2343,1,0,1042,1,-1,0,1,...,0,0,1,0,0,0,0,0,0,1
1,56,1,45,0,0,1467,1,-1,0,1,...,0,0,1,0,0,0,0,0,0,1
2,41,1,1270,1,0,1389,1,-1,0,0,...,0,0,1,0,0,0,0,0,0,1
3,55,1,2476,1,0,579,1,-1,0,0,...,0,0,1,0,0,0,0,0,0,1
4,54,1,184,0,0,673,2,-1,0,1,...,0,0,1,0,0,0,0,0,0,1
5,42,1,0,1,1,562,2,-1,0,0,...,0,0,1,0,0,0,0,0,0,1


### Splitting and creating test and train sets

In [52]:
x_train, x_test, y_train, y_test = train_test_split(X,y,test_size =0.3)

### Storing Dataset for export purposes

In [14]:
x_4_export = x_test

In [15]:
y_4_export = y_test

### Build the Decision Tree Model

In [55]:
# Decision tree with depth = 2
model_dt_2 = DecisionTreeClassifier(random_state=1, max_depth=2)
model_dt_2.fit(x_train, y_train)
model_dt_2_score_train = model_dt_2.score(x_train, y_train)
print("Training score: ",model_dt_2_score_train*100)
model_dt_2_score_test = model_dt_2.score(x_test, y_test)
print("Testing score: ",model_dt_2_score_test*100)
y_pred_dt = model_dt_2.predict_proba(x_test)[:, 1]

Training score:  98.40114389704927
Testing score:  98.72650090964221


In [56]:
# Decision tree with depth = 4
model_dt_4 = DecisionTreeClassifier(random_state=1, max_depth=4)
model_dt_4.fit(x_train, y_train)
model_dt_4_score_train = model_dt_4.score(x_train, y_train)
print("Training score: ",model_dt_4_score_train)
model_dt_4_score_test = model_dt_4.score(x_test, y_test)
print("Testing score: ",model_dt_4_score_test)

Training score:  0.9847913687768101
Testing score:  0.9860521528198909


In [57]:
# Decision tree with depth = 8
model_dt_8 = DecisionTreeClassifier(random_state=1, max_depth=8)
model_dt_8.fit(x_train, y_train)
model_dt_8_score_train = model_dt_8.score(x_train, y_train)
print("Training score: ",model_dt_8_score_train)
model_dt_8_score_test = model_dt_8.score(x_test, y_test)
print("Testing score: ",model_dt_8_score_test)

Training score:  0.9892109710126089
Testing score:  0.9784718010915706


In [58]:
fit_rf = DecisionTreeClassifier(random_state=69)

# Implemented Grid Search CV

In [59]:
import time
from sklearn.model_selection import train_test_split, GridSearchCV
np.random.seed(42)
start = time.time()

param_dist = {'max_depth': [2, 4, 8,1],
              'max_features': ['auto', 'sqrt', 'log2', None],
              'criterion': ['gini', 'entropy']}

cv_rf = GridSearchCV(fit_rf, cv = 10,
                     param_grid=param_dist, 
                     n_jobs = 3)

cv_rf.fit(x_train, y_train)
print('Best Parameters using grid search: \n', cv_rf.best_params_)
end = time.time()
print('Time taken in grid search: {0: .2f}'.format(end - start))

Best Parameters using grid search: 
 {'criterion': 'gini', 'max_depth': 2, 'max_features': 'auto'}
Time taken in grid search:  6.73


In [60]:
# Set best parameters given by grid search 
fit_rf.set_params(criterion = 'gini',
                  max_features = None, 
                  max_depth = 2)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=69, splitter='best')

In [61]:
#Decision tree

model_dt = DecisionTreeClassifier(max_depth = 8, criterion ="entropy")
model_dt.fit(x_train, y_train)
y_pred_dt = model_dt.predict_proba(x_test)[:, 1]

In [74]:
df_X_test = pd.DataFrame(x_4_export)

In [75]:
df_y_test = pd.DataFrame(data=y_4_export)

In [79]:
df_pred = pd.DataFrame(data=y_pred_dt,index=df_X_test.index,columns=["Loan_Defaulter_Prediction"])

In [76]:
df_X_test = df_X_test.join(df_y_test,how = 'right')

In [77]:
df_X_test

Unnamed: 0,age,deposit,balance,housing,loan,duration,campaign,pdays,previous,job__admin.,...,month__mar,month__may,month__nov,month__oct,month__sep,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown,default
9504,45,0,4696,0,0,99,1,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
9992,48,0,741,1,0,227,1,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
121,32,1,4665,1,0,860,2,-1,0,0,...,0,1,0,0,0,0,0,0,1,0
5012,47,1,3696,0,0,250,2,181,4,1,...,0,0,0,0,0,0,0,1,0,0
9239,33,0,3868,1,0,445,1,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,40,0,1937,1,0,86,1,-1,0,0,...,0,0,0,0,0,0,0,0,1,0
3346,35,1,295,0,0,473,1,-1,0,0,...,1,0,0,0,0,0,0,0,1,0
5795,52,0,484,1,0,128,1,-1,0,1,...,0,1,0,0,0,0,0,0,1,0
11128,40,0,2171,1,0,97,1,-1,0,0,...,0,1,0,0,0,0,0,0,1,0


In [80]:
df_X_test = df_X_test.join(df_pred, how = 'right')

In [82]:
df_X_test

Unnamed: 0,age,deposit,balance,housing,loan,duration,campaign,pdays,previous,job__admin.,...,month__may,month__nov,month__oct,month__sep,poutcome__failure,poutcome__other,poutcome__success,poutcome__unknown,default,Loan_Defaulter_Prediction
9504,45,0,4696,0,0,99,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0.000000
9992,48,0,741,1,0,227,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0.135135
121,32,1,4665,1,0,860,2,-1,0,0,...,1,0,0,0,0,0,0,1,0,0.036928
5012,47,1,3696,0,0,250,2,181,4,1,...,0,0,0,0,0,0,1,0,0,0.000000
9239,33,0,3868,1,0,445,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9800,40,0,1937,1,0,86,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0.036928
3346,35,1,295,0,0,473,1,-1,0,0,...,0,0,0,0,0,0,0,1,0,0.000000
5795,52,0,484,1,0,128,1,-1,0,1,...,1,0,0,0,0,0,0,1,0,0.036928
11128,40,0,2171,1,0,97,1,-1,0,0,...,1,0,0,0,0,0,0,1,0,0.000000


In [83]:
df_export = df_X_test

In [85]:
df_export.to_html("export_bank.html")

In [86]:
df_export.to_csv("export_bank.csv")