In [41]:
import pandas as pd
from sklearn.preprocessing import label_binarize
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import GradientBoostingClassifier

import warnings
warnings.filterwarnings("ignore")
#Please ignore the warnings with version change

from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [42]:
trainfile = r'/gdrive/My Drive/508-A2/Marketing/TRAIN.csv'
trainData = pd.read_csv(trainfile) #creates a dataframe
testfile = r'/gdrive/My Drive/508-A2/Marketing/TEST.csv'
testData = pd.read_csv(testfile) #creates a dataframe

#print sizes (shape) of datasets
print(trainData.shape)
print(testData.shape)

trainData.head()


(4521, 17)
(45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


In [43]:
testData.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [44]:
#Copy Train data excluding target
trainData_Copy = trainData.iloc[:, :-1].copy()
testData_Copy = testData.iloc[:, :-1].copy()

trainData_Copy.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown


In [45]:
#List of Categorical Features
categoricalFeatures = ["job", 'marital', "education", 'contact', 'housing', 'loan', "month", 'default', 'poutcome']

#Combine Train and test for one Hot Encoding
combined_Data = pd.concat([trainData_Copy,testData_Copy], keys=[0,1])

#Do one Hot encoding for categorical features
combined_Data = pd.get_dummies(combined_Data,columns=categoricalFeatures)

#Separate Train data and test data
X_train = combined_Data.xs(0)
X_test = combined_Data.xs(1)

X_test.head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,job_unemployed,job_unknown,marital_divorced,marital_married,marital_single,education_primary,education_secondary,education_tertiary,education_unknown,contact_cellular,contact_telephone,contact_unknown,housing_no,housing_yes,loan_no,loan_yes,month_apr,month_aug,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,default_no,default_yes,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,58,2143,5,261,1,-1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
1,44,29,5,151,1,-1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
2,33,2,5,76,1,-1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
3,47,1506,5,92,1,-1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1
4,33,1,5,198,1,-1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,1,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1


In [0]:
y_train = trainData["y"]
#X_train = trainData.drop(["y"], axis=1) #extracting training data without the target column
y_test = testData["y"]
#X_test = testData.drop(["y"], axis=1) #extracting training data without the target column



In [47]:
#CONSTRUCT DEFAULT DECISION TREE AND OBTAIN RESPECTIVE ACCURACY 
clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)
clf_predict=clf.predict(X_test)
print("accuracy Score (training) for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))
print("Confusion Matrix for Decision Tree")
print(confusion_matrix(y_test,clf_predict))

accuracy Score (training) for Decision Tree:0.882307
Confusion Matrix for Decision Tree
[[37150  2772]
 [ 2549  2740]]


In [48]:
#Hyperparameter tuning done for decision tree classifier
#RANDOM SEARCH--------------------------------------------
print("RandomizedSearchCV-Decision tree")
parameters={'min_samples_leaf' : range(10,100,10),'max_depth': range(5,30,5),'criterion':['gini','entropy']}
clf_random = RandomizedSearchCV(clf,parameters,n_iter=15,cv=5)
clf_random.fit(X_train, y_train)
grid_parm=clf_random.best_params_
print(grid_parm)

#GRID SEARCH----------------------------------------`
print("GridSearchCV-Decision tree")
clf_grid = GridSearchCV(clf,parameters, cv = 5)
clf_grid.fit(X_train, y_train)
grid_parm1=clf_grid.best_params_
print(grid_parm1)



RandomizedSearchCV-Decision tree
{'min_samples_leaf': 40, 'max_depth': 5, 'criterion': 'entropy'}
GridSearchCV-Decision tree
{'criterion': 'entropy', 'max_depth': 5, 'min_samples_leaf': 30}


In [49]:
#Using the parameters obtained from HyperParameterTuning in the DecisionTreeClassifier 
clf = DecisionTreeClassifier(**grid_parm)
clfr = DecisionTreeClassifier(**grid_parm1)

clf.fit(X_train,y_train)
clf_predict = clf.predict(X_test)
clfr.fit(X_train,y_train)
clfr_predict = clfr.predict(X_test)


#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning randomized search for Decision Tree:{0:6f}".format(clf.score(X_test,y_test)))

print("accuracy Score (testing) after hypertuning grid search for Decision Tree:{0:6f}".format(clfr.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Decision Tree")
print(confusion_matrix(y_test,clf_predict))
print("=== Classification Report ===")
print(classification_report(y_test,clf_predict))

#clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
#print(clf_cv_score)
#print('\n')



accuracy Score (testing) after hypertuning randomized search for Decision Tree:0.896684
accuracy Score (testing) after hypertuning grid search for Decision Tree:0.896220
Confusion Matrix after hypertuning for Decision Tree
[[39102   820]
 [ 3851  1438]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.91      0.98      0.94     39922
         yes       0.64      0.27      0.38      5289

    accuracy                           0.90     45211
   macro avg       0.77      0.63      0.66     45211
weighted avg       0.88      0.90      0.88     45211



In [50]:
#Normal randomforest
rand_parameters={'min_samples_leaf' : range(10,100,10),'max_depth': range(1,10,2),'max_features':[2,3,4],'n_estimators':[20,30,40], 'criterion':['gini','entropy']}
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
rfc_predict=rfc.predict(X_test)
print("accuracy Score (testing) for RandomForest:{0:6f}".format(rfc.score(X_test,y_test)))
print("Confusion Matrix for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))





accuracy Score (testing) for RandomForest:0.906770
Confusion Matrix for Random Forest:
[[39127   795]
 [ 3420  1869]]


In [51]:
#RANDOMIZED SEARCH----------------------------------------
rfc_random = RandomizedSearchCV(rfc,rand_parameters,n_iter=10,cv=5)
rfc_random.fit(X_train, y_train)
grid_parm_rfc=rfc_random.best_params_
print(grid_parm_rfc)
rfc1= RandomForestClassifier(**grid_parm_rfc)
rfc1.fit(X_train,y_train)
rfc_predict = rfc1.predict(X_test)
print("accuracy Score (testing) after hypertuning for Random Forest:{0:6f}".format(rfc1.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for Random Forest:")
print(confusion_matrix(y_test,rfc_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc_predict))
rfc_cv_score = cross_val_score(rfc, X_train, y_train)
print(rfc_cv_score)
print('\n')


{'n_estimators': 30, 'min_samples_leaf': 60, 'max_features': 3, 'max_depth': 9, 'criterion': 'gini'}
accuracy Score (testing) after hypertuning for Random Forest:0.883015
Confusion Matrix after hypertuning for Random Forest:
[[39922     0]
 [ 5289     0]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.88      1.00      0.94     39922
         yes       0.00      0.00      0.00      5289

    accuracy                           0.88     45211
   macro avg       0.44      0.50      0.47     45211
weighted avg       0.78      0.88      0.83     45211

[0.89257294 0.89515594 0.89840637]




In [52]:
print("GridSearchCV-Random Forest")
rfc_grid = GridSearchCV(rfc,rand_parameters,cv=5)
rfc_grid.fit(X_train, y_train)
grid_parm2=rfc_grid.best_params_
print(grid_parm2)

GridSearchCV-Random Forest
{'criterion': 'entropy', 'max_depth': 9, 'max_features': 4, 'min_samples_leaf': 10, 'n_estimators': 30}


In [53]:
#Using the parameters obtained from HyperParameterTuning in the random forest 
rfc2 = RandomForestClassifier(**grid_parm2)

rfc2.fit(X_train,y_train)
rfc2_predict = rfc2.predict(X_test)

#Obtain accuracy ,confusion matrix,classification report and AUC values for the result above.
print("accuracy Score (testing) after hypertuning grid search for random forest:{0:6f}".format(rfc2.score(X_test,y_test)))
print("Confusion Matrix after hypertuning for random forest")
print(confusion_matrix(y_test,rfc2_predict))
print("=== Classification Report ===")
print(classification_report(y_test,rfc2_predict))

#clf_cv_score = cross_val_score(clf, X_train, y_train, cv=10, scoring="balanced_accuracy")
#print(clf_cv_score)
#print('\n')


accuracy Score (testing) after hypertuning grid search for random forest:0.892040
Confusion Matrix after hypertuning for random forest
[[39799   123]
 [ 4758   531]]
=== Classification Report ===
              precision    recall  f1-score   support

          no       0.89      1.00      0.94     39922
         yes       0.81      0.10      0.18      5289

    accuracy                           0.89     45211
   macro avg       0.85      0.55      0.56     45211
weighted avg       0.88      0.89      0.85     45211

