# 1.0 Setup

In [1]:
# Import all required libraries
import pandas as pd
from sklearn.svm import SVC
from matplotlib import pyplot as plt
import numpy as np
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from imblearn.over_sampling import RandomOverSampler

import seaborn as sns
np.random.seed(1)

# 2.0 Load the Data

Loading the cleaned and preprossed data

In [3]:
#load the cleaned data 
X_train = pd.read_csv("shopping_X_train.csv")
X_test = pd.read_csv("shopping_X_test.csv")
y_train = pd.read_csv("shopping_y_train.csv")
y_test = pd.read_csv("shopping_y_test.csv")

In [4]:
X_train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.695244,-0.464002,-0.401578,-0.249008,0.002346,-0.234227,-0.459606,-0.757614,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
1,-0.094939,-0.20002,-0.401578,-0.249008,0.160364,0.430788,-0.459606,-0.628451,2.589939,-0.309603,-0.396391,-0.084367,0.408248
2,-0.695244,-0.464002,-0.401578,-0.249008,-0.562004,-0.549309,-0.459606,-0.657154,-0.320331,-0.309603,-0.396391,11.852924,-2.44949
3,0.205214,0.008264,-0.401578,-0.249008,1.401933,0.681911,-0.459606,-0.843723,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
4,0.805519,0.522158,1.975666,0.500311,1.311638,1.896665,-0.373168,-0.73221,1.930372,-0.309603,-0.396391,-0.084367,0.408248


In [5]:
X_test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,-0.695244,-0.464002,-0.401578,-0.249008,-0.697448,-0.629112,1.614895,3.246437,-0.320331,-0.309603,2.522763,-0.084367,-2.44949
1,0.805519,7.74844,-0.401578,-0.249008,0.047494,-0.319019,-0.459606,-0.261692,0.115159,-0.309603,-0.396391,-0.084367,0.408248
2,-0.695244,-0.464002,-0.401578,-0.249008,-0.674874,-0.590638,-0.459606,0.146526,-0.320331,-0.309603,-0.396391,-0.084367,0.408248
3,-0.094939,0.104574,-0.401578,-0.249008,-0.268542,-0.417506,-0.459606,-0.81789,-0.320331,-0.309603,-0.396391,-0.084367,0.408248
4,0.505366,-0.136201,-0.401578,-0.249008,-0.53943,-0.522562,-0.459606,-0.680116,-0.320331,-0.309603,2.522763,-0.084367,-2.44949


In [6]:
y_train.head()

Unnamed: 0,REVENUE
0,0
1,1
2,0
3,0
4,1


In [7]:
y_test.head()

Unnamed: 0,REVENUE
0,0
1,1
2,0
3,0
4,0


# 3.0 Performance metrics Identification

### As this company is new to the market and wants to spend more on marketing campaigns to reach as many potential customers, building its own brand and focusing on product sales.

True Positives (TP): Users who were correctly identified by the model as potential customers who made a purchase.

False Positives (FP): Users who were incorrectly identified by the model as potential customers who made a purchase, but in reality, they did not.

True Negatives (TN): Users who were correctly identified by the model as non-potential customers who did not make a purchase.

False Negatives (FN): Users who were incorrectly identified by the model as non-potential customers who did not make a purchase, but in reality, they did.

##### Selecting Performace metrics

Clearly, our data set is imblanced and the majority class is 0 (not purchased) and the minority class is 1 (purchased). If we choose accuracy, it will focus on the majority class, which is not good in our case. Then we need to focus on Recall, Precision and F1 score.

Recall and precision score depends on the specific goal of the analysis and the trade-off between identifying all positive instances (high recall) and minimizing the number of false positives (high precision).

To minimize the number of false positives, even if it means potentially missing some positive instances, then precision should be the primary metric to focus on. A high precision means that the model is able to correctly identify positive instances with a high degree of certainty, which is important for minimizing the cost of false positives, such as wasting resources on marketing campaigns to non-interested customers. however the company is more concerned with not missing out on potential customers and wants to minimize False Negatives (FN), they prioritize maximizing Recall, which measures the proportion of true positives among all actual positive cases (TP / (TP + FN)). A higher Recall score indicates that the model is better at identifying all positive cases, even if it means making more false positive predictions that meansidentify all users who are likely to make a purchase, regardless of the number of false positives.


#### Recall Score- Performance metrics

# 3.0 Model the Data

## 3.1 Fit and test a Logistic Regression model

In [8]:
log_reg_model = LogisticRegression(penalty=None, max_iter=900)
_ = log_reg_model.fit(X_train, np.ravel(y_train))

In [9]:
y_pred = log_reg_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]


In [10]:
lr_default_recall = recall_score(y_test, y_pred)

# 3.2 Fit a LogisticRegression model with Random Search

In [12]:
score_measure = "recall"
kfolds = 3
param_grid = { 'solver': [ 'liblinear', 'saga'],
                      'penalty': ['l1', 'l2'], # NOTE: 'elasticnet' is only supported by 'saga' solver
                      'C': np.arange(5,15),
                      # number of iterations to converge (sometimes the default is not enough - and sometimes, it will never converge)
                     }
logi_reg = LogisticRegression()
rand_search = RandomizedSearchCV(estimator = logi_reg, param_distributions=param_grid, cv=kfolds, n_iter=40,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_logi_reg_rand = rand_search.best_estimator_

Fitting 3 folds for each of 40 candidates, totalling 120 fits
The best recall score is 0.7448074807480749
... with parameters: {'solver': 'liblinear', 'penalty': 'l1', 'C': 5}


In [13]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]


In [14]:
lr_rand_recall = recall_score(y_test, y_pred)

# 3.3 Fit a LogisticRegression model with Grid Search

In [15]:
score_measure = "recall"
kfolds = 5

penalty= rand_search.best_params_['penalty']
solver =rand_search.best_params_['solver']
C =rand_search.best_params_['C']
param_grid = {
    'C': [C+2,C,C-2],
    'penalty': [penalty],
    'solver': [solver]
}

logi_reg = LogisticRegression()
grid_search = GridSearchCV(estimator = logi_reg, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 5 folds for each of 3 candidates, totalling 15 fits
The best recall score is 0.745087801396314
... with parameters: {'C': 7, 'penalty': 'l1', 'solver': 'liblinear'}


In [16]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7454545454545455
************************************
Accuracy Score:   0.874831035414977
Precision Score:  0.5593451568894953
F1 Score:         0.6391270459859704
************************************
Confusion Matrix: [[2826  323]
 [ 140  410]]


In [17]:
lr_grid_recall = recall_score(y_test, y_pred)

# 3.4 Fit a SVM classification model using linear kernal

In [18]:
svm_lin_model = SVC(kernel="linear")
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [19]:
y_pred = svm_lin_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]


In [20]:
svm_linear_default_recall = recall_score(y_test, y_pred)

# 3.5 Fit a SVM classification model using linear kernal with Random Search

In [41]:
score_measure = "recall"
kfolds = 2

param_grid = {
    'C': [5,10,15,20],
    'kernel': ['linear'],
    
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator =svm, param_distributions=param_grid, cv=kfolds, n_iter=20,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_



Fitting 2 folds for each of 4 candidates, totalling 8 fits
The best recall score is 0.7401358743243035
... with parameters: {'kernel': 'linear', 'C': 5}


In [42]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]


In [43]:
svm_linear_rand_recall = recall_score(y_test, y_pred)

# 3.6 Fit a SVM classification model using linear kernal with Grid Search

In [44]:
score_measure = "recall"
kfolds = 2

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+2,C,C-2]
}

svm_linear_model = SVC(kernel="linear")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.7401358743243035
... with parameters: {'C': 7}


In [45]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.730909090909091
************************************
Accuracy Score:   0.8799675587996756
Precision Score:  0.5759312320916905
F1 Score:         0.6442307692307692
************************************
Confusion Matrix: [[2853  296]
 [ 148  402]]


In [46]:
svm_linear_grid_recall = recall_score(y_test, y_pred)

# 3.7 Fit a SVM classification model using rbf kernal

In [47]:
svm_rbf_model = SVC(kernel="rbf")
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [48]:
y_pred = svm_rbf_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7490909090909091
************************************
Accuracy Score:   0.8675317653419843
Precision Score:  0.5392670157068062
F1 Score:         0.6270928462709285
************************************
Confusion Matrix: [[2797  352]
 [ 138  412]]


In [49]:
svm_rbf_default_recall = recall_score(y_test, y_pred)

# 3.8 Fit a SVM classification model using rbf kernal with Random Search

In [50]:
score_measure = "recall"
kfolds = 2

param_grid = {
    'C': np.arange(5,15),
    'kernel': ['rbf'],
    
}

svm = SVC()
rand_search = RandomizedSearchCV(estimator =svm, param_distributions=param_grid, cv=kfolds, n_iter=12,
                           scoring=score_measure, verbose=1, n_jobs=-1,
                           return_train_score=True)

_ = rand_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_linear = rand_search.best_estimator_



Fitting 2 folds for each of 10 candidates, totalling 20 fits
The best recall score is 0.8127322080572094
... with parameters: {'kernel': 'rbf', 'C': 14}


In [51]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7345454545454545
************************************
Accuracy Score:   0.8675317653419843
Precision Score:  0.5401069518716578
F1 Score:         0.6224961479198767
************************************
Confusion Matrix: [[2805  344]
 [ 146  404]]


In [52]:
svm_rbf_rand_recall = recall_score(y_test, y_pred)

# 3.9 Fit a SVM classification model using rbf kernal with Grid Search


In [53]:
score_measure = "recall"
kfolds = 3

C = rand_search.best_params_['C']

param_grid = {
    'C': [C+1,C,C-1]
}

svm_linear_model = SVC(kernel="rbf")
grid_search = GridSearchCV(estimator = svm_linear_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_linear = grid_search.best_estimator_

Fitting 3 folds for each of 3 candidates, totalling 9 fits
The best recall score is 0.8180924999716467
... with parameters: {'C': 15}


In [54]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7327272727272728
************************************
Accuracy Score:   0.8678021086780211
Precision Score:  0.5409395973154363
F1 Score:         0.6223938223938225
************************************
Confusion Matrix: [[2807  342]
 [ 147  403]]


In [55]:
svm_rbf_grid_recall = recall_score(y_test, y_pred)

# 3.10 Fit a SVM classification model using polynomial kernal

In [56]:
svm_poly_model = SVC(kernel="poly", degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [57]:
y_pred = svm_poly_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7345454545454545
************************************
Accuracy Score:   0.8772641254393079
Precision Score:  0.5674157303370787
F1 Score:         0.6402535657686212
************************************
Confusion Matrix: [[2841  308]
 [ 146  404]]


In [58]:
svm_poly_default_recall = recall_score(y_test, y_pred)

# 3.11 Fit a SVM classification model using Polynomial kernal with Random Search

In [61]:
score_measure = "recall"
kfolds = 2

param_rand = {
    'C': [10,15],
    'degree': [3,4],
    'coef0': [4,5]

}

svm_poly_model = SVC(kernel="poly")
rand_search = RandomizedSearchCV(estimator = svm_poly_model, param_distributions=param_rand, cv=kfolds, n_iter=5,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train,np.ravel(y_train))

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

best_SVM_poly = rand_search.best_estimator_

Fitting 2 folds for each of 5 candidates, totalling 10 fits
The best recall score is 0.8285436805984695
... with parameters: {'degree': 4, 'coef0': 5, 'C': 15}


In [62]:
y_pred = rand_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7109090909090909
************************************
Accuracy Score:   0.8610435252771019
Precision Score:  0.5241286863270778
F1 Score:         0.6033950617283951
************************************
Confusion Matrix: [[2794  355]
 [ 159  391]]


In [63]:
svm_poly_rand_recall = recall_score(y_test, y_pred)

# 3.12 Fit a SVM classification model using Polynomial kernal with Grid Search

In [65]:
score_measure = "recall"
kfolds = 2

degree = rand_search.best_params_['degree']
coef0 = rand_search.best_params_['coef0']
C = rand_search.best_params_['C']
param_grid = {
    'C': [C-1,C,C+1],
    'degree': [degree],
    'coef0': [coef0],
}

svm_poly_model = SVC(kernel="poly")
grid_search = GridSearchCV(estimator = svm_poly_model, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(X_train, np.ravel(y_train))

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

best_SVM_poly = grid_search.best_estimator_

Fitting 2 folds for each of 3 candidates, totalling 6 fits
The best recall score is 0.829781190931851
... with parameters: {'C': 16, 'coef0': 5, 'degree': 4}


In [66]:
y_pred = grid_search.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.7109090909090909
************************************
Accuracy Score:   0.8623952419572858
Precision Score:  0.5276653171390013
F1 Score:         0.6057319907048799
************************************
Confusion Matrix: [[2799  350]
 [ 159  391]]


In [67]:
svm_poly_grid_recall = recall_score(y_test, y_pred)

# 3.13 Fit a DTree classification model using defaults (unconstrained tree)

In [22]:
dtree = DecisionTreeClassifier()
_=dtree.fit(X_train, np.ravel(y_train))

In [23]:
y_pred = dtree.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.5709090909090909
************************************
Accuracy Score:   0.862124898621249
Precision Score:  0.5340136054421769
F1 Score:         0.5518453427065027
************************************
Confusion Matrix: [[2875  274]
 [ 236  314]]


In [24]:
dtree_default_recall = recall_score(y_test, y_pred)

# 3.14 Fit a Decision Tree Classifier model with Random Search

In [25]:
# Criterion used to guide data splits
criterion = ['gini', 'entropy']

# Maximum number of levels in tree. If None, then nodes are expanded until all leaves are pure or until all 
# leaves contain less than min_samples_split samples.
# default = None
max_depth = [int(x) for x in np.linspace(1, 5000, 25)]

# Minimum number of samples required to split a node
# default is 2
min_samples_split = [int(x) for x in np.linspace(2, 5000, 25)]

# Minimum number of samples required at each leaf node
# default = 1 
min_samples_leaf = [int(x) for x in np.linspace(1, 500, 5)]

# max_leaf_nodes  - Grow trees with max_leaf_nodes in best-first fashion.
# If None then unlimited number of leaf nodes.
# default=None 
max_leaf_nodes = [int(x) for x in np.linspace(2, len(y_test), 50)]
max_leaf_nodes.append(None)

# min_impurity_decrease - A node will be split if this split induces a decrease of the impurity greater than or equal to this value.
# default=0.0
min_impurity_decrease = [x for x in np.arange(0.0, 0.01, 0.0001).round(5)]

# Create the random grid
param_grid_random = { 'criterion': criterion,
                      'max_depth': max_depth,
                      'min_samples_split': min_samples_split,
                      'min_samples_leaf' : min_samples_leaf,
                      'max_leaf_nodes' : max_leaf_nodes,
                      'min_impurity_decrease' : min_impurity_decrease,
                     }

best_random_search_model = RandomizedSearchCV(
        estimator=DecisionTreeClassifier(), 
        scoring='recall', 
        param_distributions=param_grid_random, 
        n_iter = 100, random_state=1,
        cv=5, 
        verbose=1, 
        n_jobs = -1
    )
_ = best_random_search_model.fit(X_train, np.ravel(y_train))
random_search_best_params = best_random_search_model.best_params_
print('Best parameters found: ', random_search_best_params)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters found:  {'min_samples_split': 2, 'min_samples_leaf': 125, 'min_impurity_decrease': 0.0006, 'max_leaf_nodes': 2567, 'max_depth': 1667, 'criterion': 'entropy'}


In [26]:
y_pred = best_random_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred,zero_division=1)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.8490909090909091
************************************
Accuracy Score:   0.8288726682887266
Precision Score:  0.4591937069813176
F1 Score:         0.5960433950223356
************************************
Confusion Matrix: [[2599  550]
 [  83  467]]


In [27]:
dtree_rand_recall = recall_score(y_test, y_pred)

# 3.15 Fit a Decision Tree Classifier model with Grid Search

In [37]:
plus_minus = 4
increment = 2

param_grid = { 'min_samples_split': [x for x in range(random_search_best_params['min_samples_split']-plus_minus, random_search_best_params['min_samples_split']+plus_minus,2) if x >= 2],       
              'min_samples_leaf': [x for x in range(random_search_best_params['min_samples_leaf']-plus_minus , random_search_best_params['min_samples_leaf']+plus_minus,2) if x > 0],
              'min_impurity_decrease': [x for x in np.arange(random_search_best_params['min_impurity_decrease']-0.001, random_search_best_params['min_impurity_decrease']+0.001,.0001).round(5) if x >= 0.000],
              'max_leaf_nodes':[x for x in range(random_search_best_params['max_leaf_nodes']-plus_minus , random_search_best_params['max_leaf_nodes']+plus_minus, 2) if x > 1],  
              'max_depth': [x for x in range(random_search_best_params['max_depth']-plus_minus , random_search_best_params['max_depth']+plus_minus, 2) if x > 1],
              'criterion': [random_search_best_params['criterion']]
              }

best_grid_search_model = GridSearchCV(estimator=DecisionTreeClassifier(), 
                                    scoring='recall', param_grid=param_grid, cv=5, verbose=1,  n_jobs = -1)
_ = best_grid_search_model.fit(X_train, y_train)

Fitting 5 folds for each of 2048 candidates, totalling 10240 fits


In [38]:
print('Best parameters found: ', best_grid_search_model.best_params_)

Best parameters found:  {'criterion': 'entropy', 'max_depth': 1663, 'max_leaf_nodes': 2563, 'min_impurity_decrease': 0.0008, 'min_samples_leaf': 125, 'min_samples_split': 2}


In [39]:
y_pred = best_grid_search_model.predict(X_test)
print("************************************")
print(f"{'Recall Score:':18}{recall_score(y_test, y_pred)}")
print("************************************")
print(f"{'Accuracy Score: ':18}{accuracy_score(y_test, y_pred)}")
print(f"{'Precision Score: ':18}{precision_score(y_test, y_pred,zero_division=1)}")
print(f"{'F1 Score: ':18}{f1_score(y_test, y_pred)}")
print("************************************")
print(f"{'Confusion Matrix: ':18}{confusion_matrix(y_test, y_pred)}")

************************************
Recall Score:     0.8345454545454546
************************************
Accuracy Score:   0.8450932684509327
Precision Score:  0.487778958554729
F1 Score:         0.6156941649899396
************************************
Confusion Matrix: [[2667  482]
 [  91  459]]


In [40]:
dtree_grid_recall = recall_score(y_test, y_pred)

# 4.0 Summarize results

In [73]:
print("Recall scores...")
print("**************************************")
print(f"{'Decision Tree Random Search:':18}{dtree_rand_recall}")
print("**************************************")
print(f"{'Decision Tree Grid Search:':18}{dtree_grid_recall}")
print(f"{'Decision Tree deafult:':18}{dtree_default_recall}")
print(f"{'SVM Poly Grid Search:':18}{svm_poly_grid_recall}")
print(f"{'SVM Poly Random Search:':18}{svm_poly_rand_recall}")
print(f"{'SVM Poly deafult:':18}{svm_poly_default_recall}")
print(f"{'SVM rbf Grid Search:':18}{svm_rbf_grid_recall}")
print(f"{'SVM rbf Random Search:':18}{svm_rbf_rand_recall}")
print(f"{'SVM rbf deafult:':18}{svm_rbf_default_recall}")
print(f"{'SVM Linear Grid Search:':18}{svm_linear_grid_recall}")
print(f"{'SVM Linear Random Search:':18}{svm_linear_rand_recall}")
print(f"{'SVM Linear deafult:':18}{svm_linear_default_recall}")
print(f"{'Logistic Regression Grid Search:':18}{lr_grid_recall}")
print(f"{'Logistic Regressionr Random Search:':18}{lr_rand_recall}")
print(f"{'Logistic Regression deafult:':18}{lr_default_recall}")


Recall scores...
**************************************
Decision Tree Random Search:0.8490909090909091
**************************************
Decision Tree Grid Search:0.8345454545454546
Decision Tree deafult:0.5709090909090909
SVM Poly Grid Search:0.7109090909090909
SVM Poly Random Search:0.7109090909090909
SVM Poly deafult: 0.7345454545454545
SVM rbf Grid Search:0.7327272727272728
SVM rbf Random Search:0.7345454545454545
SVM rbf deafult:  0.7490909090909091
SVM Linear Grid Search:0.730909090909091
SVM Linear Random Search:0.730909090909091
SVM Linear deafult:0.730909090909091
Logistic Regression Grid Search:0.7454545454545455
Logistic Regressionr Random Search:0.7454545454545455
Logistic Regression deafult:0.7454545454545455


##### As the company is new to market and they want to improve potential customer in coming three year regardless of their marketing budget spending  so we used recall score. from the above models we got good Recall score of 84.9% for Decision tree Random search followed by 83.4% for Decision Tree Grid Search.