In [1]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [2]:
trainfile = r'/gdrive/My Drive/CIS508A2/Bank/BankTrain.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/CIS508A2/Bank/BankTest.csv'
testData = pd.read_csv(testfile) #creates a dataframe

#print sizes (shape) of datasets
print(trainData.shape)
print(testData.shape)

trainData.head()
testData.head()

(4521, 17)
(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [3]:
#Copy Train data excluding target
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

trainData_Copy.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [4]:
#List of Categorical Features
categoricalFeatures = ["job", "marital", "education", "default", "housing", "loan", "contact", "month","poutcome"]

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

X_test.head()


Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,default_no,default_yes,housing_no,housing_yes,loan_no,loan_yes,contact_cellular,contact_telephone,contact_unknown,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,1,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,1,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1


In [5]:
y_train = trainData["y"]
#X_train = trainData.drop(["y"], axis=1) #extracting training data without the target column
y_test = testData["y"]
#X_test = testData.drop(["y"], axis=1) #extracting training data without the target column

y_train.head()

0    no
1    no
2    no
3    no
4    no
Name: y, dtype: object

In [9]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_train,y_train)))
print("accuracy Score (testing) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))



accuracy Score (training) for Decision Tree:1.000000
accuracy Score (testing) for Decision Tree:0.884011
Confusion Matrix for Decision Tree
[[37218  2704]
 [ 2540  2749]]


In [25]:
#Hyperparameter tuning done for decision tree classifier
#RANDOM SEARCH--------------------------------------------
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,200,10),'max_depth': range(1,10,5),'criterion':['gini','entropy']}

clf_random = RandomizedSearchCV(clf,parameters,n_iter=15,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#GRID SEARCH----------------------------------------
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 20, 'max_depth': 6, 'criterion': 'gini'}
GridSearchCV-Decision tree
{'criterion': 'gini', 'max_depth': 6, 'min_samples_leaf': 70}


In [26]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (training) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_train,y_train)))
print("accuracy Score (testing) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("accuracy Score (training) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_train,y_train)))
print("accuracy Score (testing) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="roc_auc")
print(clf_cv_score)
print("Random Search Mean roc_auc:")
print(np.mean(clf_cv_score))
clfr_cv_score = cross_val_score(clfr, X_train, y_train, cv=10, scoring="roc_auc")
print(clfr_cv_score)
print("Grid Search Mean_roc_auc:")
print(np.mean(clfr_cv_score))

accuracy Score (training) after hypertuning randomized search for Decision Tree:0.913515
accuracy Score (testing) after hypertuning randomized search for Decision Tree:0.899914
accuracy Score (training) after hypertuning grid search for Decision Tree:0.901128
accuracy Score (testing) after hypertuning grid search for Decision Tree:0.896596
Confusion Matrix after hypertuning for Decision Tree
[[38952   970]
 [ 3555  1734]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.92      0.98      0.95     39922
         yes       0.64      0.33      0.43      5289

    accuracy                           0.90     45211
   macro avg       0.78      0.65      0.69     45211
weighted avg       0.88      0.90      0.89     45211

[0.83988208 0.87891827 0.82221154 0.83189904 0.87252404 0.86887019
 0.87987981 0.86855769 0.83841346 0.869375  ]
Random Search Mean roc_auc:
0.8570531113933237
[0.89591981 0.8465625  0.81704327 0.80848558 0.88694712 0

In [21]:
#Normal randomforest
rand_parameters={'min_samples_leaf' : range(1,70,10),'max_depth': range(1,25,2),'max_features':[3,4],'n_estimators':[5,10,15]}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (training) for RandomForest:{0:6f}".format(rfc.score(X_train,y_train)))
print("accuracy Score (testing) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))

#RANDOMIZED SEARCH----------------------------------------
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=15,cv=5)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
rfc= RandomForestClassifier(**grid_parm_rfc)
rfc.fit(X_train,y_train)
rfc_predict = rfc.predict(X_test)
print("accuracy Score (training) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_train,y_train)))
print("accuracy Score (testing) after hypertuning for Random Forest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))

#Cross Validation---------------------------------------
rfc_cv_score = cross_val_score(rfc, X_train, y_train,cv=10)
print(rfc_cv_score)
print('\n')
print("Mean:")
print(np.mean(rfc_cv_score))





accuracy Score (training) for RandomForest:0.993585
accuracy Score (testing) for RandomForest:0.907014
Confusion Matrix for Random Forest:
[[39130   792]
 [ 3412  1877]]
{'n_estimators': 15, 'min_samples_leaf': 11, 'max_features': 4, 'max_depth': 5}
accuracy Score (training) after hypertuning for Random Forest:0.884539
accuracy Score (testing) after hypertuning for Random Forest:0.883878
Confusion Matrix after hypertuning for Random Forest:
[[39913     9]
 [ 5241    48]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.84      0.01      0.02      5289

    accuracy                           0.88     45211
   macro avg       0.86      0.50      0.48     45211
weighted avg       0.88      0.88      0.83     45211

[0.88300221 0.88495575 0.88495575 0.88274336 0.88495575 0.88495575
 0.88495575 0.88495575 0.88716814 0.88274336]


Mean:
0.8845391588036492
