# 1.0 Setup

In [1]:
# Import all required libraries
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.neural_network import MLPClassifier
import seaborn as sns
np.random.seed(1)

# 2.0 Load the Data

Loading the cleaned and preprossed data

In [2]:
#load the cleaned data 
X_train = pd.read_csv("shopping_X_train.csv")
X_test = pd.read_csv("shopping_X_test.csv")
y_train = pd.read_csv("shopping_y_train.csv")
y_test = pd.read_csv("shopping_y_test.csv")

In [3]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.695244,-0.464002,-0.401578,-0.249008,0.002346,-0.234227,-0.459606,-0.757614,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
1,-0.094939,-0.20002,-0.401578,-0.249008,0.160364,0.430788,-0.459606,-0.628451,2.589939,-0.309603,-0.396391,-0.084367,0.408248
2,-0.695244,-0.464002,-0.401578,-0.249008,-0.562004,-0.549309,-0.459606,-0.657154,-0.320331,-0.309603,-0.396391,11.852924,-2.44949
3,0.205214,0.008264,-0.401578,-0.249008,1.401933,0.681911,-0.459606,-0.843723,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
4,0.805519,0.522158,1.975666,0.500311,1.311638,1.896665,-0.373168,-0.73221,1.930372,-0.309603,-0.396391,-0.084367,0.408248


In [4]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.695244,-0.464002,-0.401578,-0.249008,-0.697448,-0.629112,1.614895,3.246437,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
1,0.805519,7.74844,-0.401578,-0.249008,0.047494,-0.319019,-0.459606,-0.261692,0.115159,-0.309603,-0.396391,-0.084367,0.408248
2,-0.695244,-0.464002,-0.401578,-0.249008,-0.674874,-0.590638,-0.459606,0.146526,-0.320331,-0.309603,-0.396391,-0.084367,0.408248
3,-0.094939,0.104574,-0.401578,-0.249008,-0.268542,-0.417506,-0.459606,-0.81789,-0.320331,-0.309603,-0.396391,-0.084367,0.408248
4,0.505366,-0.136201,-0.401578,-0.249008,-0.53943,-0.522562,-0.459606,-0.680116,-0.320331,-0.309603,2.522763,-0.084367,-2.44949


In [5]:
y_train.head()

Unnamed: 0,REVENUE
0,0
1,1
2,0
3,0
4,1


In [6]:
y_test.head()

Unnamed: 0,REVENUE
0,0
1,1
2,0
3,0
4,0


# 3.0 Performance metrics Identification

### As this company is new to the market and wants to spend more on marketing campaigns to reach as many potential customers, building its own brand and focusing on product sales.

True Positives (TP): Users who were correctly identified by the model as potential customers who made a purchase.

False Positives (FP): Users who were incorrectly identified by the model as potential customers who made a purchase, but in reality, they did not.

True Negatives (TN): Users who were correctly identified by the model as non-potential customers who did not make a purchase.

False Negatives (FN): Users who were incorrectly identified by the model as non-potential customers who did not make a purchase, but in reality, they did.

##### Selecting Performace metrics

Clearly, our data set is imblanced and the majority class is 0 (not purchased) and the minority class is 1 (purchased). If we choose accuracy, it will focus on the majority class, which is not good in our case. Then we need to focus on Recall, Precision and F1 score.

Recall and precision score depends on the specific goal of the analysis and the trade-off between identifying all positive instances (high recall) and minimizing the number of false positives (high precision).

To minimize the number of false positives, even if it means potentially missing some positive instances, then precision should be the primary metric to focus on. A high precision means that the model is able to correctly identify positive instances with a high degree of certainty, which is important for minimizing the cost of false positives, such as wasting resources on marketing campaigns to non-interested customers. however the company is more concerned with not missing out on potential customers and wants to minimize False Negatives (FN), they prioritize maximizing Recall, which measures the proportion of true positives among all actual positive cases (TP / (TP + FN)). A higher Recall score indicates that the model is better at identifying all positive cases, even if it means making more false positive predictions that meansidentify all users who are likely to make a purchase, regardless of the number of false positives.


#### Recall Score- Performance metrics

# 3.0 Model the Data

## 3.1 Fit and test a Logistic Regression model

In [7]:
%%time
log_reg_model = LogisticRegression(penalty=None, max_iter=900)
_ = log_reg_model.fit(X_train, np.ravel(y_train))

CPU times: total: 62.5 ms
Wall time: 153 ms


In [8]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = log_reg_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]
CPU times: total: 46.9 ms
Wall time: 93.8 ms


In [9]:
lr_default_recall = recall_score(y_test, y_pred_thresh)

# 3.2 Fit a LogisticRegression model with Random Search

In [10]:
%%time
score_measure = "recall"
kfolds = 3
param_grid = { 'solver': [ 'liblinear', 'saga'],
                      'penalty': ['l1', 'l2'], # NOTE: 'elasticnet' is only supported by 'saga' solver
                      'C': np.arange(5,15),
                      # number of iterations to converge (sometimes the default is not enough - and sometimes, it will never converge)
                     }
logi_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = logi_reg, param_distributions=param_grid, cv=kfolds, n_iter=40,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_logi_reg_rand = rand_search.best_estimator_

Fitting 3 folds for each of 40 candidates, totalling 120 fits
The best recall score is 0.7448074807480749
... with parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 5}
CPU times: total: 1.12 s
Wall time: 8.12 s


In [11]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = rand_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]
CPU times: total: 15.6 ms
Wall time: 89 ms


In [12]:
lr_rand_recall = recall_score(y_test, y_pred_thresh)

# 3.3 Fit a LogisticRegression model with Grid Search

In [13]:
%%time
score_measure = "recall"
kfolds = 5

penalty= rand_search.best_params_['penalty']
solver =rand_search.best_params_['solver']
C =rand_search.best_params_['C']
param_grid = {
    'C': [C+2,C,C-2],
    'penalty': [penalty],
    'solver': [solver]
}

logi_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = logi_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
The best recall score is 0.745087801396314
... with parameters: {'C': 7, 'penalty': 'l2', 'solver': 'liblinear'}
CPU times: total: 172 ms
Wall time: 580 ms


In [14]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = grid_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]
CPU times: total: 62.5 ms
Wall time: 71.8 ms


In [15]:
lr_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.4 Fit a SVM classification model using linear kernal

In [16]:
%%time
svm_lin_model = SVC(kernel="linear")
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

CPU times: total: 14.5 s
Wall time: 19.6 s


In [17]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = svm_lin_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]
CPU times: total: 922 ms
Wall time: 1.17 s


In [18]:
svm_linear_default_recall = recall_score(y_test, y_pred_thresh)

# 3.5 Fit a SVM classification model using linear kernal with Random Search

In [19]:
%%time
score_measure = "recall"
kfolds = 2

param_grid = {
    'C': [5,10,15,20],
    'kernel': ['linear'],
    
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator =svm, param_distributions=param_grid, cv=kfolds, n_iter=20,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_

Fitting 2 folds for each of 4 candidates, totalling 8 fits




The best recall score is 0.7401358743243035
... with parameters: {'kernel': 'linear', 'C': 5}
CPU times: total: 27.6 s
Wall time: 1min 32s


In [20]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = rand_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]
CPU times: total: 1.08 s
Wall time: 1.82 s


In [21]:
svm_linear_rand_recall = recall_score(y_test, y_pred_thresh)

# 3.6 Fit a SVM classification model using linear kernal with Grid Search

In [22]:
%%time
score_measure = "recall"
kfolds = 2

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+2,C,C-2]
}

svm_linear_model = SVC(kernel="linear")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.7401358743243035
... with parameters: {'C': 7}
CPU times: total: 30.9 s
Wall time: 48.3 s


In [23]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = grid_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]
CPU times: total: 1 s
Wall time: 1.08 s


In [24]:
svm_linear_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.7 Fit a SVM classification model using rbf kernal

In [25]:
%%time
svm_rbf_model = SVC(kernel="rbf")
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

CPU times: total: 9.27 s
Wall time: 9.56 s


In [26]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = svm_rbf_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7490909090909091
************************************
Accuracy Score:   0.8675317653419843
Precision Score:  0.5392670157068062
F1 Score:         0.6270928462709285
************************************
Confusion Matrix: [[2797  352]
 [ 138  412]]
CPU times: total: 3.39 s
Wall time: 3.43 s


In [27]:
svm_rbf_default_recall = recall_score(y_test, y_pred_thresh)

# 3.8 Fit a SVM classification model using rbf kernal with Random Search

In [28]:
%%time
score_measure = "recall"
kfolds = 2

param_grid = {
    'C': np.arange(5,15),
    'kernel': ['rbf'],
    
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator =svm, param_distributions=param_grid, cv=kfolds, n_iter=12,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_

Fitting 2 folds for each of 10 candidates, totalling 20 fits




The best recall score is 0.8127322080572094
... with parameters: {'kernel': 'rbf', 'C': 14}
CPU times: total: 9.22 s
Wall time: 53.8 s


In [29]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = rand_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7345454545454545
************************************
Accuracy Score:   0.8675317653419843
Precision Score:  0.5401069518716578
F1 Score:         0.6224961479198767
************************************
Confusion Matrix: [[2805  344]
 [ 146  404]]
CPU times: total: 2.47 s
Wall time: 4.25 s


In [30]:
svm_rbf_rand_recall = recall_score(y_test, y_pred_thresh)

# 3.9 Fit a SVM classification model using rbf kernal with Grid Search


In [31]:
%%time
score_measure = "recall"
kfolds = 3

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+1,C,C-1]
}

svm_linear_model = SVC(kernel="rbf")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
The best recall score is 0.8180924999716467
... with parameters: {'C': 15}
CPU times: total: 12.1 s
Wall time: 37.3 s


In [32]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = grid_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7327272727272728
************************************
Accuracy Score:   0.8678021086780211
Precision Score:  0.5409395973154363
F1 Score:         0.6223938223938225
************************************
Confusion Matrix: [[2807  342]
 [ 147  403]]
CPU times: total: 3.11 s
Wall time: 3.13 s


In [33]:
svm_rbf_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.10 Fit a SVM classification model using polynomial kernal

In [34]:
%%time
svm_poly_model= SVC(kernel="poly", degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

CPU times: total: 47.4 s
Wall time: 48.9 s


In [35]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred =svm_poly_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7345454545454545
************************************
Accuracy Score:   0.8772641254393079
Precision Score:  0.5674157303370787
F1 Score:         0.6402535657686212
************************************
Confusion Matrix: [[2841  308]
 [ 146  404]]
CPU times: total: 969 ms
Wall time: 1.1 s


In [36]:
svm_poly_default_recall = recall_score(y_test, y_pred_thresh)

# 3.11 Fit a SVM classification model using Polynomial kernal with Random Search

In [37]:
%%time
score_measure = "recall"
kfolds = 2

param_rand = {
    'C': [10,15],
    'degree': [3,4],
    'coef0': [4,5]

}

svm_poly_model = SVC(kernel="poly")
rand_search = RandomizedSearchCV(estimator = svm_poly_model, param_distributions=param_rand, cv=kfolds, n_iter=5,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train,np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_poly = rand_search.best_estimator_

Fitting 2 folds for each of 5 candidates, totalling 10 fits
The best recall score is 0.8277185980902186
... with parameters: {'degree': 4, 'coef0': 4, 'C': 15}
CPU times: total: 21min 41s
Wall time: 29min 43s


In [38]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = rand_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7163636363636363
************************************
Accuracy Score:   0.8629359286293593
Precision Score:  0.5288590604026846
F1 Score:         0.6084942084942085
************************************
Confusion Matrix: [[2798  351]
 [ 156  394]]
CPU times: total: 984 ms
Wall time: 977 ms


In [39]:
svm_poly_rand_recall = recall_score(y_test, y_pred_thresh)

# 3.12 Fit a SVM classification model using Polynomial kernal with Grid Search

In [40]:
%%time
score_measure = "recall"
kfolds = 2

degree = rand_search.best_params_['degree']
coef0 = rand_search.best_params_['coef0']
C = rand_search.best_params_['C']
param_grid = {
    'C': [C-1,C,C+1],
    'degree': [degree],
    'coef0': [coef0],
}

svm_poly_model = SVC(kernel="poly")
grid_search = GridSearchCV(estimator = svm_poly_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_poly = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.8292311359263504
... with parameters: {'C': 16, 'coef0': 4, 'degree': 4}
CPU times: total: 19min 26s
Wall time: 27min 13s


In [41]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = grid_search.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7163636363636363
************************************
Accuracy Score:   0.8629359286293593
Precision Score:  0.5288590604026846
F1 Score:         0.6084942084942085
************************************
Confusion Matrix: [[2798  351]
 [ 156  394]]
CPU times: total: 891 ms
Wall time: 1.04 s


In [42]:
svm_poly_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.13 Fit a DTree classification model using defaults (unconstrained tree)

In [43]:
%%time
dtree = DecisionTreeClassifier()
_=dtree.fit(X_train, np.ravel(y_train))

CPU times: total: 93.8 ms
Wall time: 116 ms


In [44]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = dtree.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.5636363636363636
************************************
Accuracy Score:   0.8613138686131386
Precision Score:  0.5317324185248714
F1 Score:         0.5472197705207414
************************************
Confusion Matrix: [[2876  273]
 [ 240  310]]
CPU times: total: 0 ns
Wall time: 47 ms


In [45]:
dtree_default_recall = recall_score(y_test, y_pred_thresh)

# 3.14 Fit a Decision Tree Classifier model with Random Search

In [46]:
%%time
# Criterion used to guide data splits
criterion = ['gini', 'entropy']

# Maximum number of levels in tree. If None, then nodes are expanded until all leaves are pure or until all 
# leaves contain less than min_samples_split samples.
# default = None
max_depth = [int(x) for x in np.linspace(1, 5000, 25)]

# Minimum number of samples required to split a node
# default is 2
min_samples_split = [int(x) for x in np.linspace(2, 5000, 25)]

# Minimum number of samples required at each leaf node
# default = 1 
min_samples_leaf = [int(x) for x in np.linspace(1, 500, 5)]

# max_leaf_nodes  - Grow trees with max_leaf_nodes in best-first fashion.
# If None then unlimited number of leaf nodes.
# default=None 
max_leaf_nodes = [int(x) for x in np.linspace(2, len(y_test), 50)]
max_leaf_nodes.append(None)

# min_impurity_decrease - A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# default=0.0
min_impurity_decrease = [x for x in np.arange(0.0, 0.01, 0.0001).round(5)]

# Create the random grid
param_grid_random = { 'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                     }

best_random_search_model = RandomizedSearchCV(
        estimator=DecisionTreeClassifier(), 
        scoring='recall', 
        param_distributions=param_grid_random, 
        n_iter = 100, random_state=1,
        cv=5, 
        verbose=1, 
        n_jobs = -1
    )
_ = best_random_search_model.fit(X_train, np.ravel(y_train))
random_search_best_params = best_random_search_model.best_params_
print('Best parameters found: ', random_search_best_params)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'min_samples_split': 2, 'min_samples_leaf': 125, 'min_impurity_decrease': 0.0006, 'max_leaf_nodes': 2567, 'max_depth': 1667, 'criterion': 'entropy'}
CPU times: total: 1.2 s
Wall time: 6.9 s


In [47]:

%%time
# evaluate the model
# predict on test data with default threshold
y_pred = best_random_search_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.8490909090909091
************************************
Accuracy Score:   0.8288726682887266
Precision Score:  0.4591937069813176
F1 Score:         0.5960433950223356
************************************
Confusion Matrix: [[2599  550]
 [  83  467]]
CPU times: total: 31.2 ms
Wall time: 85.2 ms


In [48]:
dtree_rand_recall = recall_score(y_test, y_pred_thresh)

# 3.15 Fit a Decision Tree Classifier model with Grid Search

In [49]:
%%time
plus_minus = 4
increment = 2

param_grid = { 'min_samples_split': [x for x in range(random_search_best_params['min_samples_split']-plus_minus, random_search_best_params['min_samples_split']+plus_minus,2) if x >= 2],       
              'min_samples_leaf': [x for x in range(random_search_best_params['min_samples_leaf']-plus_minus , random_search_best_params['min_samples_leaf']+plus_minus,2) if x > 0],
              'min_impurity_decrease': [x for x in np.arange(random_search_best_params['min_impurity_decrease']-0.001, random_search_best_params['min_impurity_decrease']+0.001,.0001).round(5) if x >= 0.000],
              'max_leaf_nodes':[x for x in range(random_search_best_params['max_leaf_nodes']-plus_minus , random_search_best_params['max_leaf_nodes']+plus_minus, 2) if x > 1],  
              'max_depth': [x for x in range(random_search_best_params['max_depth']-plus_minus , random_search_best_params['max_depth']+plus_minus, 2) if x > 1],
              'criterion': [random_search_best_params['criterion']]
              }

best_grid_search_model = GridSearchCV(estimator=DecisionTreeClassifier(), 
                                    scoring='recall', param_grid=param_grid, cv=5, verbose=1,  n_jobs = -1)
_ = best_grid_search_model.fit(X_train, y_train)
print('Best parameters found: ', best_grid_search_model.best_params_)

Fitting 5 folds for each of 2048 candidates, totalling 10240 fits
Best parameters found:  {'criterion': 'entropy', 'max_depth': 1663, 'max_leaf_nodes': 2563, 'min_impurity_decrease': 0.0008, 'min_samples_leaf': 125, 'min_samples_split': 2}
CPU times: total: 28.7 s
Wall time: 2min 36s


In [50]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = best_grid_search_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.8345454545454546
************************************
Accuracy Score:   0.8450932684509327
Precision Score:  0.487778958554729
F1 Score:         0.6156941649899396
************************************
Confusion Matrix: [[2667  482]
 [  91  459]]
CPU times: total: 46.9 ms
Wall time: 63.1 ms


In [51]:
dtree_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.16 Neural Net with default parameters

In [52]:
%%time
ann = MLPClassifier(hidden_layer_sizes=(60,50,40), solver='adam', max_iter=2000)
_ = ann.fit(X_train, np.ravel(y_train))

CPU times: total: 45.1 s
Wall time: 2min 33s


In [53]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = ann.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.6290909090909091
************************************
Accuracy Score:   0.8734793187347932
Precision Score:  0.5672131147540984
F1 Score:         0.5965517241379311
************************************
Confusion Matrix: [[2885  264]
 [ 204  346]]
CPU times: total: 266 ms
Wall time: 104 ms


In [54]:
%%time
ann_default_recall = recall_score(y_test, y_pred_thresh)

CPU times: total: 0 ns
Wall time: 0 ns


# 3.17 NN With RandomizedSearchCV

In [55]:
%%time
score_measure = "recall"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (50,), (70,),(50,30), (40,20), (60,40, 20), (70,50,40)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam', 'sgd'],
    'alpha': [0, .2, .5, .7, 1],
    'learning_rate': ['constant', 'invscaling', 'adaptive'],
    'learning_rate_init': [0.001, 0.01, 0.1, 0.2, 0.5],
    'max_iter': [2000]
}

ann = MLPClassifier()
random_search = RandomizedSearchCV(estimator = ann, param_distributions=param_grid, cv=kfolds, n_iter=100,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = random_search.fit(X_train, np.ravel(y_train))

bestRecallANN = random_search.best_estimator_

print(random_search.best_params_)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'solver': 'adam', 'max_iter': 2000, 'learning_rate_init': 0.01, 'learning_rate': 'constant', 'hidden_layer_sizes': (60, 40, 20), 'alpha': 0, 'activation': 'tanh'}
CPU times: total: 6.33 s
Wall time: 16min 53s


In [56]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = bestRecallANN.predict(X_test)


from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.649090909090909
************************************
Accuracy Score:   0.8353609083536091
Precision Score:  0.46183699870633893
F1 Score:         0.5396825396825397
************************************
Confusion Matrix: [[2733  416]
 [ 193  357]]
CPU times: total: 46.9 ms
Wall time: 65.4 ms


In [57]:
ann_random_recall = recall_score(y_test, y_pred)

# 3.18 NN With GridSearchCV

In [58]:
%%time
score_measure = "recall"
kfolds = 5

param_grid = {
    'hidden_layer_sizes': [ (30,), (50,), (70,), (90,)],
    'activation': ['tanh', 'relu'],
    'solver': ['adam'],
    'alpha': [.5, .7, 1],
    'learning_rate': ['adaptive', 'invscaling'],
    'learning_rate_init': [0.005, 0.01, 0.15],
    'max_iter': [5000]
}

ann = MLPClassifier()
grid_search = GridSearchCV(estimator = ann, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

bestRecallANN = grid_search.best_estimator_

print(grid_search.best_params_)

Fitting 5 folds for each of 144 candidates, totalling 720 fits
{'activation': 'tanh', 'alpha': 1, 'hidden_layer_sizes': (50,), 'learning_rate': 'adaptive', 'learning_rate_init': 0.15, 'max_iter': 5000, 'solver': 'adam'}
CPU times: total: 4.22 s
Wall time: 4min 52s


In [59]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = bestRecallANN.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.8545454545454545
************************************
Accuracy Score:   0.7764260610975939
Precision Score:  0.3861955628594905
F1 Score:         0.5319750990379174
************************************
Confusion Matrix: [[2402  747]
 [  80  470]]
CPU times: total: 46.9 ms
Wall time: 55.4 ms


In [60]:
ann_grid_recall = recall_score(y_test, y_pred_thresh)

# 3.19 Deep Network using Random search

In [61]:
import keras_tuner
import tensorflow as tf
from tensorflow import keras
import numpy as np
import tensorflow as tf
from keras import backend as K
# fix random seed for reproducibility
np.random.seed(1)
tf.random.set_seed(1)

In [62]:
%%time

def build_clf(meta, hidden_layer_sizes, dropout):
    n_features_in_ = meta["n_features_in_"]
    n_classes_ = meta["n_classes_"]
    target_encoder_ = meta["target_encoder_"]
    
    model = tf.keras.models.Sequential()
    model.add(keras.layers.Input(shape=n_features_in_)),
    #for hidden_layer_size in hidden_layer_sizes:
    for hidden_layer_size in hidden_layer_sizes:
        model.add(keras.layers.Dense(hidden_layer_size, 
            kernel_initializer= tf.keras.initializers.GlorotUniform(), 
            bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), 
            activation="relu"))
        model.add(keras.layers.Dropout(dropout))
    model.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    
    #though you could return a compiled model, it's not necessary, and would result in the loss of these
    # parameters in the tune process - as they would be 'hard coded'
    # model.compile(loss = 'sparse_categorical_crossentropy', metrics = ['accuracy']) 

    return model

CPU times: total: 0 ns
Wall time: 0 ns


In [63]:
%%time

# If you don't have the following installed, from command line '!pip install scikeras'
from scikeras.wrappers import KerasClassifier

keras_clf = KerasClassifier(
    model=build_clf,
    hidden_layer_sizes=13,
    dropout=0.5,
    optimizer=keras.optimizers.Adam,
    optimizer__learning_rate=0.0001
)
keras_clf.get_params()

CPU times: total: 0 ns
Wall time: 78.1 ms


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 13,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [64]:
%%time

params = {
    
    # the following are model parameters, and therefore must be defined as parameters in the KarasClassifier, and then in the build_clf function
    'model__hidden_layer_sizes': [(70,),(90, ), (100,), (100, 90)], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    'model__dropout': [0, 0.1], # this will require KarasClassifier and build_clf to have hidden_layer_sizes parameter set
    
    # the following are 'fit' parameters, the scikeras wrapper provides these parameters. These are passed to the 'model.fit' method for each fit of the model
    'batch_size':[20, 60, 100],
    'epochs':[10],
    'optimizer':['adam','sgd'],
    'loss':['binary_crossentropy'],
    
    # this is added to the optimizer 
    'optimizer__learning_rate': [0.0001, 0.001, 0.01]

}
keras_clf.get_params()

CPU times: total: 0 ns
Wall time: 0 ns


{'model': <function __main__.build_clf(meta, hidden_layer_sizes, dropout)>,
 'build_fn': None,
 'warm_start': False,
 'random_state': None,
 'optimizer': keras.optimizers.optimizer_v2.adam.Adam,
 'loss': None,
 'metrics': None,
 'batch_size': None,
 'validation_batch_size': None,
 'verbose': 1,
 'callbacks': None,
 'validation_split': 0.0,
 'shuffle': True,
 'run_eagerly': False,
 'epochs': 1,
 'hidden_layer_sizes': 13,
 'dropout': 0.5,
 'optimizer__learning_rate': 0.0001,
 'class_weight': None}

In [65]:
%%time

from sklearn.model_selection import RandomizedSearchCV
#from tensorflow.keras.callbacks import EarlyStopping

rnd_search_cv = RandomizedSearchCV(
    estimator=keras_clf, 
    param_distributions=params, 
    scoring='recall',  # we could use any appropriate sklearn metric here (i.e. accuracy, f1_micro, f1_macro)
    n_iter=5, 
    cv=3)

# In rare cases, you may find your model training results in exceeding python's default recursion limit.
# If needed, you can increase this excersion limit by using the following code.
#import sys
#sys.setrecursionlimit(10000) # note: the default is 3000 (python 3.9)

_ = rnd_search_cv.fit(X_train, y_train,  verbose=1)

# You can create 'call back' functions. These are functions that will be called at the 
# end of each epoch. There are a number of builtin functions created for this purpose, 
# one of which is EarlyStopping -- that, based on the parameters you give, will stop
# the training process. This is useful when the algorithm is not making any significant
# gains through further training. 
#earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
#callback = [earlystop]
#_ = rnd_search_cv.fit(X_train, y_train, callbacks=callback, verbose=0)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [66]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.001,
 'optimizer': 'adam',
 'model__hidden_layer_sizes': (100, 90),
 'model__dropout': 0.1,
 'loss': 'binary_crossentropy',
 'epochs': 10,
 'batch_size': 60}

In [67]:
best_model = rnd_search_cv.best_estimator_

In [68]:
%%time

print(f"best score {best_model.score(X_test, y_test)}")
print(f"min loss {min(best_model.history_['loss'])}")

best score 0.8556366585563666
min loss 0.3583610951900482
CPU times: total: 141 ms
Wall time: 360 ms


In [69]:
best_model.history_

defaultdict(list,
            {'loss': [0.4506235122680664,
              0.4004330635070801,
              0.38564833998680115,
              0.37650027871131897,
              0.369480699300766,
              0.3683628439903259,
              0.36624133586883545,
              0.3611660301685333,
              0.3603729009628296,
              0.3583610951900482]})

In [70]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = best_model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.8036363636363636
************************************
Accuracy Score:   0.8556366585563666
Precision Score:  0.5092165898617511
F1 Score:         0.6234132581100141
************************************
Confusion Matrix: [[2723  426]
 [ 108  442]]
CPU times: total: 78.1 ms
Wall time: 346 ms


In [71]:
Dnn_random_recall = recall_score(y_test, y_pred)

# 3.20 Deep Network deafult

In [72]:
#Define the model

model = keras.models.Sequential()

model.add(keras.layers.Input(shape=13))
model.add(keras.layers.Dense(100, activation='relu'))
model.add(keras.layers.Dense(90, activation='relu'))
model.add(keras.layers.Dense(30, activation='relu'))
model.add(keras.layers.Dense(10, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [73]:
import keras.backend as K
def recall(y_test, y_pred):
    y_test = K.ones_like(y_test)
    true_positives = K.sum(K.round(K.clip(y_test * y_pred, 0, 1)))
    all_positives = K.sum(K.round(K.clip(y_test, 0, 1)))

    recall = true_positives / (all_positives + K.epsilon())
    return recall

In [74]:
# Compile model

#Optimizer:
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[recall])

In [75]:
# Fit the model

history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test), 
                    epochs=20, batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [76]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7981818181818182
************************************
Accuracy Score:   0.8386050283860503
Precision Score:  0.4745945945945946
F1 Score:         0.5952542372881355
************************************
Confusion Matrix: [[2663  486]
 [ 111  439]]
CPU times: total: 234 ms
Wall time: 795 ms


In [77]:
Dnn_default_recall = recall_score(y_test, y_pred_thresh)

# 3.21 DNN Random Grid Search

In [78]:
%%time

# If you don't have the following installed, from command line '!pip install scikeras'
from scikeras.wrappers import KerasClassifier
from keras.initializers import GlorotNormal
from keras.initializers import lecun_normal

score_measure = "recall"
kfolds = 5

def build_clf(hidden_layer_sizes, dropout):
    ann = tf.keras.models.Sequential()
    ann.add(keras.layers.Input(shape=13)),
    for hidden_layer_size in hidden_layer_sizes:
        model.add(keras.layers.Dense(hidden_layer_size, kernel_initializer= tf.keras.initializers.lecun_normal(), 
                                     bias_initializer=keras.initializers.RandomNormal(mean=0.0, stddev=0.05, seed=None), activation="selu"))
        model.add(keras.layers.Dropout(dropout))
    ann.add(tf.keras.layers.Dense(1, activation='sigmoid'))
    ann.compile(loss = 'binary_crossentropy', metrics = [recall])
    return ann


CPU times: total: 0 ns
Wall time: 0 ns


In [79]:
from scikeras.wrappers import KerasClassifier

keras_clf = KerasClassifier(
    model=build_clf,
    hidden_layer_sizes=13,
    dropout = 0.0
)

In [80]:
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.model_selection import RandomizedSearchCV

params = {
    'optimizer__learning_rate': [0.0005, 0.001, 0.005],
    'model__hidden_layer_sizes': [(70,),(90, ), (100,), (100, 90)],
    'model__dropout': [0, 0.1],
    'batch_size':[50, 100],
    'epochs':[10, 20],
    'optimizer':["adam"]
}
keras_clf.get_params().keys()



dict_keys(['model', 'build_fn', 'warm_start', 'random_state', 'optimizer', 'loss', 'metrics', 'batch_size', 'validation_batch_size', 'verbose', 'callbacks', 'validation_split', 'shuffle', 'run_eagerly', 'epochs', 'hidden_layer_sizes', 'dropout', 'class_weight'])

In [81]:
%%time
rnd_search_cv = RandomizedSearchCV(estimator=keras_clf, param_distributions=params, scoring=score_measure, n_iter=10, cv=5)

import sys
sys.setrecursionlimit(3000) # note: the default is 3000 (python 3.9)

earlystop = EarlyStopping(monitor='val_loss', patience=5, verbose=0, mode='auto')
callback = [earlystop]

_ = rnd_search_cv.fit(X_train, y_train, callbacks=callback, verbose=0)


CPU times: total: 1min 35s
Wall time: 5min 41s


In [82]:
rnd_search_cv.best_params_

{'optimizer__learning_rate': 0.0005,
 'optimizer': 'adam',
 'model__hidden_layer_sizes': (70,),
 'model__dropout': 0.1,
 'epochs': 10,
 'batch_size': 100}

In [83]:
best_net = rnd_search_cv.best_estimator_
print(rnd_search_cv.best_params_)

{'optimizer__learning_rate': 0.0005, 'optimizer': 'adam', 'model__hidden_layer_sizes': (70,), 'model__dropout': 0.1, 'epochs': 10, 'batch_size': 100}


In [84]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.7981818181818182
************************************
Accuracy Score:   0.8386050283860503
Precision Score:  0.4745945945945946
F1 Score:         0.5952542372881355
************************************
Confusion Matrix: [[2663  486]
 [ 111  439]]
CPU times: total: 172 ms
Wall time: 680 ms


In [85]:
DNN_randgrid_recall = recall_score(y_test, y_pred_thresh)

# 3.22 Wide and Deep Network

In [86]:
#Define the model: for multi-class

model = keras.models.Sequential()

model.add(keras.layers.Input(shape=13))
model.add(keras.layers.Dense(200, activation='relu'))
model.add(keras.layers.Dense(200, activation='relu'))
model.add(keras.layers.Dense(200, activation='relu'))
model.add(keras.layers.Dense(1, activation='sigmoid'))

In [87]:
# Compile model

#Optimizer:
adam = keras.optimizers.Adam(learning_rate=0.01)
model.compile(loss='binary_crossentropy', optimizer=adam, metrics=[recall])

In [88]:
# Fit the model

history = model.fit(X_train, y_train, 
                    validation_data=(X_test, y_test), 
                    epochs=20, batch_size=100)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [89]:
%%time
# evaluate the model
# predict on test data with default threshold
y_pred = model.predict(X_test)

# adjust the threshold and predict again
new_threshold = 0.3
y_pred_thresh = (y_pred >= new_threshold).astype(int)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# calculate evaluation metrics with adjusted threshold
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred_thresh)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred_thresh)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred_thresh)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred_thresh)}")

************************************
Recall Score:     0.82
************************************
Accuracy Score:   0.8188699648553663
Precision Score:  0.44129158512720157
F1 Score:         0.5737913486005088
************************************
Confusion Matrix: [[2578  571]
 [  99  451]]
CPU times: total: 188 ms
Wall time: 540 ms


In [90]:
WDNN_recall = recall_score(y_test, y_pred_thresh)

# 4.0 Summarize results

In [91]:
print("Recall scores...")
print("**************************************")
print(f"{'Logistic Regression deafult:':18}{lr_default_recall}")
print(f"{'Logistic Regression Random Search:':18}{lr_rand_recall}")
print(f"{'Logistic Regression Grid Search:':18}{lr_grid_recall}")
print("**************************************")
print(f"{'SVM Linear deafult:':18}{svm_linear_default_recall}")
print(f"{'SVM Linear Random Search:':18}{svm_linear_rand_recall}")
print(f"{'SVM Linear Grid Search:':18}{svm_linear_grid_recall}")
print("**************************************")
print(f"{'SVM rbf deafult:':18}{svm_rbf_default_recall}")
print(f"{'SVM rbf Random Search:':18}{svm_rbf_rand_recall}")
print(f"{'SVM rbf Grid Search:':18}{svm_rbf_grid_recall}")
print("**************************************")
print(f"{'SVM Poly deafult:':18}{svm_poly_default_recall}")
print(f"{'SVM Poly Random Search:':18}{svm_poly_rand_recall}")
print(f"{'SVM Poly Grid Search:':18}{svm_poly_grid_recall}")
print("**************************************")
print(f"{'Decision Tree deafult:':18}{dtree_default_recall}")
print(f"{'Decision Tree Random Search:':18}{dtree_rand_recall}")
print(f"{'Decision Tree Grid Search:':18}{dtree_grid_recall}")
print("**************************************")
print(f"{'ANN deafult:':18}{ann_default_recall}")
print(f"{'ANN Random Search:':18}{ann_random_recall}")
print(f"{'ANN Grid Search:':18}{ann_grid_recall}")
print("**************************************")
print("**************************************")
print(f"{'DNN with Random Search:':18}{Dnn_random_recall}")
print(f"{'DNN deafult :':18}{Dnn_default_recall}")
print(f"{'DNN with Random Grid Search:':18}{DNN_randgrid_recall}")
print(f"{'WDNN:':18}{WDNN_recall}")
print("**************************************")
print("**************************************")

Recall scores...
**************************************
Logistic Regression deafult:0.7454545454545455
Logistic Regression Random Search:0.7454545454545455
Logistic Regression Grid Search:0.7454545454545455
**************************************
SVM Linear deafult:0.730909090909091
SVM Linear Random Search:0.730909090909091
SVM Linear Grid Search:0.730909090909091
**************************************
SVM rbf deafult:  0.7490909090909091
SVM rbf Random Search:0.7345454545454545
SVM rbf Grid Search:0.7327272727272728
**************************************
SVM Poly deafult: 0.7345454545454545
SVM Poly Random Search:0.7163636363636363
SVM Poly Grid Search:0.7163636363636363
**************************************
Decision Tree deafult:0.5636363636363636
Decision Tree Random Search:0.8490909090909091
Decision Tree Grid Search:0.8345454545454546
**************************************
ANN deafult:      0.6290909090909091
ANN Random Search:0.649090909090909
ANN Grid Search:  0.854545454545454

# Analysis

##### As the company is new to market and they want to improve potential customer in coming three year regardless of their marketing budget spending  so we used recall score.

# MLP and Keras models perform versus other predictive models.

Based on the recall scores, the MLP-ANN (Multi-layer Perceptron Artificial Neural Network) models perform worse than some of the other models, such as the decision tree models, SVM models (with radial basis function kernel), and the DNN (Deep Neural Network) models. However, the MLP-ANN model with Grid Search hyperparameter tuning achieved a recall score of 0.8545, which is higher than any other model except the decision tree with Random Search tuning. This indicates that the MLP-ANN model has the potential to perform well, but it may require more extensive hyperparameter tuning to achieve optimal performance.

On the other hand, the DNN models with Random Search, default settings, Random Grid Search, and Wide DNN perform relatively well, achieving recall scores ranging from 0.7982 to 0.82. These scores are competitive with the best-performing models, such as the decision tree with Random Search and the MLP-ANN with Grid Search. This suggests that the DNN models have potential for good performance and may be worth further investigation and optimization for better results.