In [1]:
import pandas as pd
import numpy as np

In [79]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score,recall_score,f1_score
from IPython.display import Image
from subprocess import call
from sklearn.tree import export_graphviz

<h3>Reading Train Test Split</h3>

In [4]:
# Load the pre-split feature and label CSVs from the working directory,
# in the same order as before: train/test features, then train/test labels.
x_train, x_test, y_train, y_test = (
    pd.read_csv(csv_name)
    for csv_name in ('X_train.csv', 'X_test.csv', 'y_train.csv', 'y_test.csv')
)

In [5]:
# Sanity check: each feature frame should have as many rows as its label frame.
for split_frame in (x_train, y_train, x_test, y_test):
    print(split_frame.shape)

(3305, 16)
(3305, 1)
(827, 16)
(827, 1)


In [6]:
# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome
0,1,8,2,2,0,0.006244,0,0,0,8,9,0.092593,1,0.104598,1,0
1,4,4,1,1,0,0.006687,0,0,0,12,6,0.04065,7,0.409195,2,0
2,2,8,2,2,0,0.124195,0,0,0,8,7,0.212285,3,0.088506,2,2
3,4,4,1,2,0,0.019987,0,0,0,12,5,0.072719,3,0.208046,10,2
4,3,10,1,1,0,0.006281,1,1,1,5,2,0.084011,5,0.217241,5,1


In [8]:
# Baseline: an untuned decision tree trained on every column of x_train.
dt_base_model = DecisionTreeClassifier(random_state=7)
# Flatten the single-column label frame to 1-D, the same way the other
# fits in this notebook pass their labels.
dt_base_model.fit(x_train, y_train.values.ravel())
y_pred = dt_base_model.predict(x_test)

In [9]:
# Fraction of test rows the baseline tree labels correctly.
# Keyword arguments keep the (y_true, y_pred) roles explicit; accuracy is
# symmetric in its two arguments, so the value is the same as before.
accuracy_score(y_true=y_test, y_pred=y_pred)

0.7315598548972189

In [10]:
# Baseline confusion matrix: rows are the true classes, columns the predicted classes.
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[425, 101],
       [121, 180]])

In [11]:
# Baseline precision: of the test rows predicted positive, the share that truly are.
precision_score(y_true=y_test, y_pred=y_pred)

0.6405693950177936

In [12]:
# Baseline recall: of the truly positive test rows, the share the model finds.
recall_score(y_true=y_test, y_pred=y_pred)

0.5980066445182725

In [13]:
# Baseline F1: harmonic mean of the precision and recall above.
f1_score(y_true=y_test, y_pred=y_pred)

0.6185567010309279

<h2>Feature Selection</h2>

In [14]:
from sklearn.feature_selection import RFE

In [15]:
def rfe_selector(X, y, num_feats, step=1, verbose=5, random_state=7):
    """Select ``num_feats`` columns of ``X`` by recursive feature elimination.

    A ``DecisionTreeClassifier`` is wrapped in scikit-learn's ``RFE``, which is
    fitted on ``X`` and ``y`` to decide which columns to keep.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; its column labels are used to name the selection.
    y : pandas.DataFrame, pandas.Series or array-like
        Target labels. They are flattened to 1-D before fitting, so a
        single-column frame, a Series, or a NumPy array are all accepted.
    num_feats : int
        Value passed to ``RFE`` as ``n_features_to_select``.
    step : int or float, default 1
        Value passed to ``RFE`` as ``step``.
    verbose : int, default 5
        Value passed to ``RFE`` as ``verbose``.
    random_state : int, default 7
        Seed for the decision tree used as the ranking estimator.

    Returns
    -------
    rfe_support : numpy.ndarray of bool
        Mask over ``X.columns``; ``True`` marks a selected column.
    rfe_feature : list
        Labels of the selected columns, in their original order.
    """
    model = DecisionTreeClassifier(random_state=random_state)
    rfe = RFE(estimator=model, n_features_to_select=num_feats, step=step, verbose=verbose)
    # np.ravel works on DataFrames, Series and ndarrays alike, whereas
    # ``y.values`` only exists on pandas objects.
    rfe.fit(X, np.ravel(y))
    rfe_support = rfe.get_support()
    rfe_feature = list(X.columns[rfe_support])
    return rfe_support, rfe_feature

In [44]:
# Run RFE on the training split only and keep the six surviving columns.
rfe_support, rfe_feature = rfe_selector(X=x_train, y=y_train, num_feats=6)
rfe_feature

Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.


['balance', 'day', 'month', 'duration', 'pdays', 'poutcome']

<h3>Selected Features</h3>

In [45]:
# Echo the column names that RFE kept.
print(rfe_feature)

['balance', 'day', 'month', 'duration', 'pdays', 'poutcome']


In [46]:
# Train a *separate* tree on the RFE-selected columns. Refitting
# dt_base_model here would overwrite the baseline model in place and leave
# feature_dt_model as a mere alias of it. The settings match the baseline
# (random_state=7, defaults otherwise), so only the feature set differs.
feature_dt_model = DecisionTreeClassifier(random_state=7)
feature_dt_model.fit(x_train[rfe_feature], y_train.values.ravel())
y_feature_pred = feature_dt_model.predict(x_test[rfe_feature])

In [47]:
# Accuracy of the tree trained on the RFE-selected columns.
accuracy_score(y_true=y_test, y_pred=y_feature_pred)

0.7726723095525998

In [48]:
# Confusion matrix for the reduced-feature tree (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_feature_pred)

array([[448,  78],
       [110, 191]])

In [49]:
# Precision of the reduced-feature tree on the test split.
precision_score(y_true=y_test, y_pred=y_feature_pred)

0.7100371747211895

In [50]:
# Recall of the reduced-feature tree on the test split.
recall_score(y_true=y_test, y_pred=y_feature_pred)

0.6345514950166113

In [51]:
# F1 score of the reduced-feature tree on the test split.
f1_score(y_true=y_test, y_pred=y_feature_pred)

0.6701754385964912

<h2>Hyper-Parameter Tuning</h2>

In [52]:
from sklearn.model_selection import RandomizedSearchCV

In [53]:
# Candidate values for the tree hyper-parameters explored by the search.
criteria = ['gini', 'entropy']       # split-quality measure
leaves = [5, 10, 20, 50, 100]        # min_samples_leaf candidates
depth = [2, 3, 5, 10, 20]            # max_depth candidates
para_dict = dict(max_depth=depth, min_samples_leaf=leaves, criterion=criteria)

In [60]:
# Wrap a fresh tree in a 5-fold cross-validated random search over para_dict.
parameter_model = DecisionTreeClassifier(random_state=7)
parameter_dt_model = RandomizedSearchCV(
    estimator=parameter_model,
    param_distributions=para_dict,
    n_iter=50,
    cv=5,
    n_jobs=-1,          # use all available cores
    random_state=18,
    verbose=2,
)

In [73]:
# Tune on the RFE-selected columns only, then predict on the held-out split.
# (To tune on every column instead, pass x_train / x_test without the
# rfe_feature subset.)
selected_train = x_train[rfe_feature]
selected_test = x_test[rfe_feature]
parameter_dt_model.fit(selected_train, y_train.values.ravel())
y_parameter_pred = parameter_dt_model.predict(selected_test)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV] END ....criterion=gini, max_depth=2, min_samples_leaf=5; total time=   0.0s
[CV] END ....criterion=gini, max_depth=2, min_samples_leaf=5; total time=   0.0s
[CV] END ....criterion=gini, max_depth=2, min_samples_leaf=5; total time=   0.0s
[CV] END ....criterion=gini, max_depth=2, min_samples_leaf=5; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=10; total time=   0.0s
[CV] END ....criterion=gini, max_depth=2, min_samples_leaf=5; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=10; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=10; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=10; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=10; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_samples_leaf=20; total time=   0.0s
[CV] END ...criterion=gini, max_depth=2, min_sa

In [74]:
# Accuracy of the tuned tree on the test split.
accuracy_score(y_true=y_test, y_pred=y_parameter_pred)

0.8174123337363967

In [75]:
# Confusion matrix for the tuned tree (rows: true class, columns: predicted class).
confusion_matrix(y_true=y_test, y_pred=y_parameter_pred)

array([[462,  64],
       [ 87, 214]])

In [76]:
# Precision of the tuned tree on the test split.
precision_score(y_true=y_test, y_pred=y_parameter_pred)

0.7697841726618705

In [77]:
# Recall of the tuned tree on the test split.
recall_score(y_true=y_test, y_pred=y_parameter_pred)

0.7109634551495017

In [78]:
# F1 score of the tuned tree on the test split.
f1_score(y_true=y_test, y_pred=y_parameter_pred)

0.7392055267702935