# WE04-UniversalBank

# 1.0 Setup

In [3]:
# Import all required libraries
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


np.random.seed(1)

# 2.0 Load the Data 

In [4]:
import os
print(os.getcwd())

/content


In [5]:
#Loding the data file
bank_df = pd.read_csv('/content/UniversalBank.csv')
bank_df.head(20)

Unnamed: 0,ID,Age,Experience,Income,ZIP Code,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,1,25,1,49,91107,4,1.6,1,0,0,1,0,0,0
1,2,45,19,34,90089,3,1.5,1,0,0,1,0,0,0
2,3,39,15,11,94720,1,1.0,1,0,0,0,0,0,0
3,4,35,9,100,94112,1,2.7,2,0,0,0,0,0,0
4,5,35,8,45,91330,4,1.0,2,0,0,0,0,0,1
5,6,37,13,29,92121,4,0.4,2,155,0,0,0,1,0
6,7,53,27,72,91711,2,1.5,2,0,0,0,0,1,0
7,8,50,24,22,93943,1,0.3,3,0,0,0,0,0,1
8,9,35,10,81,90089,3,0.6,2,104,0,0,0,1,0
9,10,34,9,180,93023,1,8.9,3,0,1,0,0,0,0


# 3.0 Explore the given data

In [6]:
#see the column name
bank_df.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [7]:
#Coverting all column name to Upper alphabtes and replacing space with "_"
bank_df.columns = [s.strip().upper().replace(' ', '_') for s in bank_df.columns] 
bank_df.columns

Index(['ID', 'AGE', 'EXPERIENCE', 'INCOME', 'ZIP_CODE', 'FAMILY', 'CCAVG',
       'EDUCATION', 'MORTGAGE', 'PERSONAL_LOAN', 'SECURITIES_ACCOUNT',
       'CD_ACCOUNT', 'ONLINE', 'CREDITCARD'],
      dtype='object')

In [8]:
#Some of the variables aren't predictors; therefore we drop them
# drop ID, and Zip Code as predictors
bank_df = bank_df.drop(columns=['ID', 'ZIP_CODE'])

In [9]:
# check for if there is any missing values in remaining columns
bank_df.isnull().sum()

AGE                   0
EXPERIENCE            0
INCOME                0
FAMILY                0
CCAVG                 0
EDUCATION             0
MORTGAGE              0
PERSONAL_LOAN         0
SECURITIES_ACCOUNT    0
CD_ACCOUNT            0
ONLINE                0
CREDITCARD            0
dtype: int64

In [10]:
#checking data type of each cloumns
bank_df.dtypes

AGE                     int64
EXPERIENCE              int64
INCOME                  int64
FAMILY                  int64
CCAVG                 float64
EDUCATION               int64
MORTGAGE                int64
PERSONAL_LOAN           int64
SECURITIES_ACCOUNT      int64
CD_ACCOUNT              int64
ONLINE                  int64
CREDITCARD              int64
dtype: object

## Only CCAVG column is float and remaining all are integers, we need to change FAMILY,EDUCATION,PERSONAL_LOAN, SECURITIES_ACCOUNT,CD_ACCOUNT,ONLINE and CREDITCARD as category data type beacuse variable has only a limited number of values, and it makes more sense to treat it as a categorical variable rather than a numerical variable. By converting these columns to categorical data type, you can save memory and potentially speed performance.

In [11]:
bank_df.FAMILY = bank_df.FAMILY.astype('category')
bank_df.SECURITIES_ACCOUNT = bank_df.SECURITIES_ACCOUNT.astype('category')
bank_df.EDUCATION = bank_df.EDUCATION.astype('category')
bank_df.ONLINE = bank_df.ONLINE.astype('category')
bank_df.CD_ACCOUNT = bank_df.CD_ACCOUNT.astype('category')
bank_df.CREDITCARD = bank_df.CREDITCARD.astype('category')
bank_df.PERSONAL_LOAN = bank_df.PERSONAL_LOAN.astype('category')

In [12]:
bank_df.dtypes

AGE                      int64
EXPERIENCE               int64
INCOME                   int64
FAMILY                category
CCAVG                  float64
EDUCATION             category
MORTGAGE                 int64
PERSONAL_LOAN         category
SECURITIES_ACCOUNT    category
CD_ACCOUNT            category
ONLINE                category
CREDITCARD            category
dtype: object

In [13]:
bank_df.describe()

Unnamed: 0,AGE,EXPERIENCE,INCOME,CCAVG,MORTGAGE
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,20.1046,73.7742,1.937938,56.4988
std,11.463166,11.467954,46.033729,1.747659,101.713802
min,23.0,-3.0,8.0,0.0,0.0
25%,35.0,10.0,39.0,0.7,0.0
50%,45.0,20.0,64.0,1.5,0.0
75%,55.0,30.0,98.0,2.5,101.0
max,67.0,43.0,224.0,10.0,635.0


In [14]:
bank_df.CD_ACCOUNT.value_counts()

0    4698
1     302
Name: CD_ACCOUNT, dtype: int64

By seeing above we can say that our data is imblanced

To summarize our initial assessment of the data, we have observed that the dataset contains 14 columns and 5000 rows of non-missing values. Furthermore, we have preprocessed the data by dropping soem columns and  converting some of the data types to the category data type, which can result in improved memory usage, enhanced performance, and greater ease of analysis and visualization.

# 4.0 Split data (train/test)

In [15]:
# split the data into validation and training set
df_train, df_test = train_test_split(bank_df, test_size=0.3, random_state=1)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'CD_ACCOUNT'
predictors = list(bank_df.columns)
predictors.remove(target)

In [16]:
print(predictors)

['AGE', 'EXPERIENCE', 'INCOME', 'FAMILY', 'CCAVG', 'EDUCATION', 'MORTGAGE', 'PERSONAL_LOAN', 'SECURITIES_ACCOUNT', 'ONLINE', 'CREDITCARD']


In [17]:
print(target)

CD_ACCOUNT


# 5.0 Standardize numeric values
Now, let's create a common scale between the numberic columns by standardizing each numeric column

In [18]:
# create a standard scaler and fit it to the training set of predictors
scaler = preprocessing.StandardScaler()
scaler.fit(df_train[predictors])

# Transform the predictors of training and test sets
train_predictors = scaler.transform(df_train[predictors]) # train_predictors is not a numpy array
train_target = df_train[target] # train_target is now a series object

validation_predictors = scaler.transform(df_test[predictors]) # validation_target is now a series object
validation_target = df_test[target] # validation_target is now a series object

In [19]:
X_train = df_train[predictors]
y_train = df_train[target]
X_test = df_test[predictors]
y_test = df_test[target]

# 5.1 Address any data imbalances

In [20]:
#We will utilize an oversamplying technique to address any necessary date balancing.
from imblearn.over_sampling import RandomOverSampler
target = 'CD_ACCOUNT'
predictors = list(bank_df.columns)
predictors.remove(target)
RanOverSample=RandomOverSampler(sampling_strategy=1)
pred,tar = RanOverSample.fit_resample(df_train[predictors],df_train[target])
tar.value_counts()

0    3281
1    3281
Name: CD_ACCOUNT, dtype: int64

# 6.0 Fitting Models the data


Using RandomSearchCV combined with GridSearchCV to identify the best parameters for each model tested below.

# 6.1 Fit a LogisticRegression model with Random Search

In [21]:
score_measure = "recall"
kfolds = 10
param_grid = { 'solver': [ 'liblinear', 'saga'],
                      'penalty': ['l1', 'l2'], # NOTE: 'elasticnet' is only supported by 'saga' solver
                      'C': [100, 10, 1.0, 0.1, 0.01],
                      # number of iterations to converge (sometimes the default is not enough - and sometimes, it will never converge)
                     }
logi_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = logi_reg, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_logi_reg_rand = rand_search.best_estimator_

Fitting 10 folds for each of 20 candidates, totalling 200 fits




The best recall score is 0.6662337662337662
... with parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 100}


In [22]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.2 Fit a LogisticRegression model with Grid Search

In [23]:
score_measure = "recall"
kfolds = 10

penalty= rand_search.best_params_['penalty']
solver =rand_search.best_params_['solver']

param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'penalty': [penalty],
    'solver': [solver]
}

logi_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = logi_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 10 folds for each of 5 candidates, totalling 50 fits
The best recall score is 0.6662337662337662
... with parameters: {'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [24]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.3 Analysis of Logistic regression model


1. The true negatives (TN) which are the cases where the actual class is negative and the model also predicts it to be negative. In this case, the value is 1417, which means that there are 1417 cases where the model correctly predicted the negative class
2. The false positives (FP) which are the cases where the actual class is negative but the model predicted it to be positive. In this case, the value is 0, which means that there are no false positives.
3. The false negatives (FN) which are the cases where the actual class is positive but the model predicted it to be negative. In this case, the value is 33, which means that there are 33 cases where the model incorrectly predicted the negative class when the actual class was positive.
4.The true positives (TP) which are the cases where the actual class is positive and the model also predicts it to be positive. In this case, the value is 50, which means that there are 50 cases where the model correctly predicted the positive class.

The accuracy score is 0.978, which means that the model correctly predicted the class in 97.8% of cases. The precision score is 1.0, which indicates that when the model predicts a positive class, it is always correct. However, the recall score is 0.602, which suggests that the model may not be identifying all the positive cases correctly. The F1 score is 0.752, which is the harmonic mean of the precision and recall scores and provides an overall measure of the model's performance. Overall, this confusion matrix suggests that the model has high accuracy and precision but may benefit from improving its ability to identify positive cases.




# 6.4 Fit a SVM classification model using linear kernal with Random Search

In [25]:
score_measure = "recall"
kfolds = 5

param_grid = {
    'C': [100, 10, 1.0, 0.1, 0.01],
    'kernel': ['linear'],
    
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator =svm, param_distributions=param_grid, cv=kfolds, n_iter=20,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_

Fitting 5 folds for each of 5 candidates, totalling 25 fits




The best recall score is 0.6662790697674419
... with parameters: {'kernel': 'linear', 'C': 100}


In [26]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


# 6.5 Fit a SVM classification model using linear kernal with Grid Search

In [27]:
score_measure = "recall"
kfolds = 2

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+10,C,C-10]
}

svm_linear_model = SVC(kernel="linear")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.6665971643035863
... with parameters: {'C': 110}


In [28]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.978
Precision Score:  1.0
F1 Score:         0.7518796992481204
************************************
Confusion Matrix: [[1417    0]
 [  33   50]]


we got same results for SVM classsification linear model with Random search combined with grid search as Logistic regression. By comparing time excuted to fitting and predicted with logistic regression. Linear SVM taking more time for fitting.

Please Note:- For SVM using rbf kernal, i skipped it,beacuse it took me 3 hrs to run and got recall score of 2% and i changed some parameters again i run this models its taking more than 1 hr as of now i skipped this model because of very poor performace.

# # 6.6 Fit a SVM classification model using Polynomial kernal with Random Search

In [32]:
score_measure = "recall"
kfolds = 5

param_rand = {
    'C': np.arange(5,15),
    'degree': [3, 4,5],
    'coef0': np.arange(1,5)
}

svm_poly_model = SVC(kernel="poly")
rand_search = RandomizedSearchCV(estimator = svm_poly_model, param_distributions=param_rand, cv=kfolds, n_iter=20,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train,np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_poly = rand_search.best_estimator_

Fitting 5 folds for each of 20 candidates, totalling 100 fits
The best recall score is 0.6662790697674419
... with parameters: {'degree': 4, 'coef0': 4, 'C': 10}


In [36]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.9773333333333334
Precision Score:  0.9803921568627451
F1 Score:         0.746268656716418
************************************
Confusion Matrix: [[1416    1]
 [  33   50]]


# # 6.7 Fit a SVM classification model using Polynomial kernal with Grid Search

In [38]:
score_measure = "recall"
kfolds = 5

degree = rand_search.best_params_['degree']
coef0 = rand_search.best_params_['coef0']
C = rand_search.best_params_['C']
param_grid = {
    'C': [C-1,C,C+1],
    'degree': np.arange(degree-2,degree+2),
    'coef0': np.arange(coef0-2,coef0+2),
}

svm_poly_model = SVC(kernel="poly")
grid_search = GridSearchCV(estimator = svm_poly_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_poly = grid_search.best_estimator_

Fitting 5 folds for each of 48 candidates, totalling 240 fits
The best recall score is 0.6663847780126849
... with parameters: {'C': 9, 'coef0': 5, 'degree': 5}


In [41]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6024096385542169
************************************
Accuracy Score:   0.9753333333333334
Precision Score:  0.9259259259259259
F1 Score:         0.7299270072992702
************************************
Confusion Matrix: [[1413    4]
 [  33   50]]


# 6.8 Fit a Decision Tree Classifier model with Random Search

Decision trees are a type of machine learning algorithm that do not typically require the standardization of numeric values. This is because decision trees work by splitting the data at each node based on a threshold value, and the scaling of the features does not affect the splitting points.

I also iterated by standarding it gives me a bad results so it better no to standarizing for decision trees.

We also know that small change or error in data gives drastic change results in Decision trees.

In [53]:
bank_df.describe()

Unnamed: 0,AGE,EXPERIENCE,INCOME,CCAVG,MORTGAGE
count,5000.0,5000.0,5000.0,5000.0,5000.0
mean,45.3384,20.1046,73.7742,1.937938,56.4988
std,11.463166,11.467954,46.033729,1.747659,101.713802
min,23.0,-3.0,8.0,0.0,0.0
25%,35.0,10.0,39.0,0.7,0.0
50%,45.0,20.0,64.0,1.5,0.0
75%,55.0,30.0,98.0,2.5,101.0
max,67.0,43.0,224.0,10.0,635.0


In [54]:
# split the data into validation and training set
df_train, df_test = train_test_split(bank_df, test_size=0.3, random_state=1)

# to reduce repetition in later code, create variables to represent the columns
# that are our predictors and target
target = 'CD_ACCOUNT'
predictors = list(bank_df.columns)
predictors.remove(target)

In [55]:
print(predictors)

['AGE', 'EXPERIENCE', 'INCOME', 'FAMILY', 'CCAVG', 'EDUCATION', 'MORTGAGE', 'PERSONAL_LOAN', 'SECURITIES_ACCOUNT', 'ONLINE', 'CREDITCARD']


In [56]:
print(target)

CD_ACCOUNT


In [57]:
X_train = df_train[predictors]
y_train = df_train[target]
X_test = df_test[predictors]
y_test = df_test[target]

Please note:- As discussed in our class, we can use np.arange for for hyperparameters or for loop, as this assignment is learning purpose i used np.arange in last class assignment and now using for loop method.

In [81]:
# Criterion used to guide data splits
criterion = ['gini', 'entropy']

# Maximum number of levels in tree. If None, then nodes are expanded until all leaves are pure or until all 
# leaves contain less than min_samples_split samples.
# default = None
max_depth = [int(x) for x in np.linspace(1, 400, 10)]
max_depth.append(None)

# Minimum number of samples required to split a node
# default is 2
min_samples_split = [int(x) for x in np.linspace(2, 5000, 25)]

# Minimum number of samples required at each leaf node
# default = 1 
min_samples_leaf = [int(x) for x in np.linspace(1, 500, 5)]

# max_leaf_nodes  - Grow trees with max_leaf_nodes in best-first fashion.
# If None then unlimited number of leaf nodes.
# default=None 
max_leaf_nodes = [int(x) for x in np.linspace(2, len(y_test), 50)]
max_leaf_nodes.append(None)

# min_impurity_decrease - A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# default=0.0
min_impurity_decrease = [x for x in np.arange(0.0, 0.01, 0.0001).round(5)]

# Create the random grid
param_grid_random = { 'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                     }

best_random_search_model = RandomizedSearchCV(
        estimator=DecisionTreeClassifier(), 
        scoring='recall', 
        param_distributions=param_grid_random, 
        n_iter = 110, 
        cv=10, 
        verbose=1, 
        n_jobs = -1
    )
_ = best_random_search_model.fit(X_train, np.ravel(y_train))
random_search_best_params = best_random_search_model.best_params_
print('Best parameters found: ', random_search_best_params)

Fitting 10 folds for each of 110 candidates, totalling 1100 fits
Best parameters found:  {'min_samples_split': 210, 'min_samples_leaf': 1, 'min_impurity_decrease': 0.0031, 'max_leaf_nodes': 1438, 'max_depth': 311, 'criterion': 'gini'}


In [82]:
y_pred = best_random_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred,zero_division=1)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6506024096385542
************************************
Accuracy Score:   0.9573333333333334
Precision Score:  0.6067415730337079
F1 Score:         0.627906976744186
************************************
Confusion Matrix: [[1382   35]
 [  29   54]]


In [83]:
#The best parameters found using RandomizedSearchCV were:
random_search_best_params

{'min_samples_split': 210,
 'min_samples_leaf': 1,
 'min_impurity_decrease': 0.0031,
 'max_leaf_nodes': 1438,
 'max_depth': 311,
 'criterion': 'gini'}

Let's now use these current best parameters as a starting point for a more refined grid search. We'll use the same parameters as before, but we'll use a much smaller range of values for each parameter.

# 6.9 Fit a Decision Tree Classifier model with Grid Search

In [91]:
plus_minus = 5
increment = 2

param_grid = { 'min_samples_split': [x for x in range(random_search_best_params['min_samples_split']-plus_minus, random_search_best_params['min_samples_split']+plus_minus,2) if x >= 2],       
              'min_samples_leaf': [x for x in range(random_search_best_params['min_samples_leaf']-plus_minus , random_search_best_params['min_samples_leaf']+plus_minus,2) if x > 0],
              'min_impurity_decrease': [x for x in np.arange(random_search_best_params['min_impurity_decrease']-0.001, random_search_best_params['min_impurity_decrease']+0.001,.0001).round(5) if x >= 0.000],
              'max_leaf_nodes':[x for x in range(random_search_best_params['max_leaf_nodes']-plus_minus , random_search_best_params['max_leaf_nodes']+plus_minus, 2) if x > 1],  
              'max_depth': [x for x in range(random_search_best_params['max_depth']-plus_minus , random_search_best_params['max_depth']+plus_minus, 2) if x > 1],
              'criterion': [random_search_best_params['criterion']]
              }

best_grid_search_model = GridSearchCV(estimator=DecisionTreeClassifier(), 
                                    scoring='recall', param_grid=param_grid, cv=2, verbose=1,  n_jobs = -1)
_ = best_grid_search_model.fit(X_train, y_train)

Fitting 2 folds for each of 5000 candidates, totalling 10000 fits


In [92]:
print('Best parameters found: ', best_grid_search_model.best_params_)

Best parameters found:  {'criterion': 'gini', 'max_depth': 306, 'max_leaf_nodes': 1433, 'min_impurity_decrease': 0.0021, 'min_samples_leaf': 2, 'min_samples_split': 205}


In [93]:
y_pred = best_grid_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred,zero_division=1)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.6506024096385542
************************************
Accuracy Score:   0.9573333333333334
Precision Score:  0.6067415730337079
F1 Score:         0.627906976744186
************************************
Confusion Matrix: [[1382   35]
 [  29   54]]


1382, represents the number of true negatives (TN) - the number of instances correctly predicted to be in the negative class.

35, represents the number of false positives (FP) - the number of instances incorrectly predicted to be in the positive class.

29, represents the number of false negatives (FN) - the number of instances incorrectly predicted to be in the negative class.

54, represents the number of true positives (TP) - the number of instances correctly predicted to be in the positive class.

In this Decision Tree using Random search combined with Grid search model, the model correctly predicted 1382 instances as negative and 54 instances as positive. However, it incorrectly predicted 35 instances as positive when they were actually negative, and 29 instances as negative when they were actually positive.

Overall, the results indicate that the model did not perform well in identifying positive samples, as indicated by the low recall and precision scores and the confusion matrix. The high accuracy score is likely due to the large number of negative samples in the dataset.

In [95]:
print("*******Enter the Recall score of all models*****************************")

*******Enter the Recall score of all models*****************************


# 7.0 Recall Score for our models
Logistic Regression-Recall Score =0.6024096385542169

SVC Linear-Recall Score 0.6024096385542169

SVC Poly- Recall Score =0.6024096385542169

Decision Tree-Recall Score= 0.6506024096385542

# **8.0 Conclusion and Performance Evaluation**

It appears that the Recall Score for all the models is fairly similar, ranging from 0.602 to 0.651. This means that all models have a similar ability to correctly identify the positive class (customers who are likely to purchase the CD account). However, the Decision Tree model has a slightly higher recall score, and fast model fitting and quick execution time indicating that it may be slightly better at identifying the positive class than the other models.