data: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [73]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [74]:
df_ori = pd.read_csv("bank-additional/bank-additional-full.csv",  sep=';')
df_ori.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [75]:
df=pd.read_csv('bank_additional_full_clean_normalized.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.642226,0,2,1,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
1,-0.196449,1,2,4,0,1,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
2,0.093868,2,2,2,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
3,1.642226,1,2,4,0,0,1,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
4,1.932543,2,2,5,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0


In [76]:
X = df.iloc[:, 0:-1]
y = df.y

In [77]:
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.642226,0,2,1,0,0,0,0,5,1,0.005792,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
1,-0.196449,1,2,4,0,1,0,0,5,1,-0.127941,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
2,0.093868,2,2,2,0,0,0,0,5,1,-0.414513,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
3,1.642226,1,2,4,0,0,1,0,5,1,0.181556,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
4,1.932543,2,2,5,0,0,0,0,5,1,-0.460365,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641


In [78]:
X.pdays.max(), X.pdays.min()

(0.2118836362185938, -4.749052297919588)

In [79]:
y.head(2)

0    0
1    0
Name: y, dtype: int64

In [80]:
from sklearn.model_selection import train_test_split

In [81]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=101)

In [82]:
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=101)

In [83]:
from sklearn.metrics import classification_report

In [84]:
df_ori.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [85]:
categorical_columns_subset = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome'
]

numerical_columns_subset = [
    'duration', 'campaign', 'pdays',
    'previous', 'emp.var.rate', 'cons.price.idx', 
    'cons.conf.idx', 'euribor3m', 'nr.employed'
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")

n_categorical_features = X.select_dtypes(include="category").shape[1]
n_numerical_features = X.select_dtypes(include="number").shape[1]

print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

Number of samples: 30488
Number of features: 19
Number of categorical features: 10
Number of numerical features: 9


In [86]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline

one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(sparse=False, handle_unknown="ignore"),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)

In [87]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [88]:
from sklearn.neural_network import MLPClassifier

In [89]:
clf_nn1 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.001, max_iter=600)
)
clf_nn1.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.001, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [90]:
clf_nn2 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.003, max_iter=600)
)
clf_nn2.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.003, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [91]:
clf_nn3 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.01, max_iter=600)
)
clf_nn3.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.01, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [92]:
clf_nn4 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.03, max_iter=600)
)
clf_nn4.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.03, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [93]:
clf_nn5 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.1, max_iter=600)
)
clf_nn5.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.1, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [94]:
clf_nn6 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=0.3, max_iter=600)
)
clf_nn6.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.3, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [95]:
clf_nn7 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=1, max_iter=600)
)
clf_nn7.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=1, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [96]:
clf_nn8 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(20,4), alpha=3, max_iter=600)
)
clf_nn8.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=3, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [97]:
clf_nn9 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.001, max_iter=600)
)
clf_nn9.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.001, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [98]:
clf_nn10 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.003, max_iter=600)
)
clf_nn10.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.003, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [99]:
clf_nn11 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.01, max_iter=600)
)
clf_nn11.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.01, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [100]:
clf_nn12 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.03, max_iter=600)
)
clf_nn12.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.03, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [101]:
clf_nn13 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.1, max_iter=600)
)
clf_nn13.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.1, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [102]:
clf_nn14 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=0.3, max_iter=600)
)
clf_nn14.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.3, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [103]:
clf_nn15 = make_pipeline(
    one_hot_encoder, MLPClassifier(hidden_layer_sizes=(16,8,4), alpha=1, max_iter=600)
)
clf_nn15.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=1, hidden_layer_sizes=(16, 8, 4),
                               max_iter=600))])

In [104]:
nn_lst = [clf_nn1, clf_nn2, clf_nn3, clf_nn4, clf_nn5, clf_nn6, clf_nn7, clf_nn8, clf_nn9, clf_nn10, clf_nn11, 
          clf_nn12, clf_nn13, clf_nn14, clf_nn15]

In [105]:
len(nn_lst)

15

In [107]:
best_f1 = 0
best_idx = 999

for i in range(len(nn_lst)):
    print("Neural Network model ", i+1, 
          "\nClassification report on trainng data\n",) 
          #classification_report(y_train, nn_lst[i].predict(X_train)))
    cl_rep = classification_report(y_cv, nn_lst[i].predict(X_cv), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on cross-validation data:')
    print(f"f1-score of macro average: {f1_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best neural network classifier is classifier {best_idx} with Micro-Average F1_score {best_f1}") 

Neural Network model  1 
Classification report on trainng data

Classification report on cross-validation data:
f1-score of macro average: 0.7760

----------------------------------------------------------------------------------
Neural Network model  2 
Classification report on trainng data

Classification report on cross-validation data:
f1-score of macro average: 0.7752

----------------------------------------------------------------------------------
Neural Network model  3 
Classification report on trainng data

Classification report on cross-validation data:
f1-score of macro average: 0.7617

----------------------------------------------------------------------------------
Neural Network model  4 
Classification report on trainng data

Classification report on cross-validation data:
f1-score of macro average: 0.7778

----------------------------------------------------------------------------------
Neural Network model  5 
Classification report on trainng data

Classification r

In [106]:
best_f1 = 0
best_idx = 999

for i in range(len(nn_lst)):
    print("Neural Network model ", i+1, 
          "\nClassification report on trainng data\n",) 
          #classification_report(y_train, nn_lst[i].predict(X_train)))
    cl_rep = classification_report(y_test, nn_lst[i].predict(X_test), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on Test data:')
    print(f"f1-score of macro average: {f1_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best neural network classifier is classifier {best_idx} with Micro-Average F1_score {best_f1} on test data")

Neural Network model  1 
Classification report on trainng data

Classification report on Test data:
f1-score of macro average: 0.7501

----------------------------------------------------------------------------------
Neural Network model  2 
Classification report on trainng data

Classification report on Test data:
f1-score of macro average: 0.7611

----------------------------------------------------------------------------------
Neural Network model  3 
Classification report on trainng data

Classification report on Test data:
f1-score of macro average: 0.7469

----------------------------------------------------------------------------------
Neural Network model  4 
Classification report on trainng data

Classification report on Test data:
f1-score of macro average: 0.7670

----------------------------------------------------------------------------------
Neural Network model  5 
Classification report on trainng data

Classification report on Test data:
f1-score of macro average: 0

In [108]:
clf_nn5

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd5405abd00>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.1, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [109]:
best_auc = 0
best_idx = 999

for i in range(len(nn_lst)):
    print("\nNeural Network model", i+1)
    auc_sc = roc_auc_score(y_cv, nn_lst[i].predict_proba(X_cv)[:, 1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on cross-validation data {auc_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Neural Network classifier is classifier {best_idx} with roc_auc_score {best_auc:.4f}")


Neural Network model 1
roc_auc_score on cross-validation data 0.9369

----------------------------------------------------------------------------------

Neural Network model 2
roc_auc_score on cross-validation data 0.9360

----------------------------------------------------------------------------------

Neural Network model 3
roc_auc_score on cross-validation data 0.9379

----------------------------------------------------------------------------------

Neural Network model 4
roc_auc_score on cross-validation data 0.9367

----------------------------------------------------------------------------------

Neural Network model 5
roc_auc_score on cross-validation data 0.9387

----------------------------------------------------------------------------------

Neural Network model 6
roc_auc_score on cross-validation data 0.9372

----------------------------------------------------------------------------------

Neural Network model 7
roc_auc_score on cross-validation data 0.9316

-----

In [110]:
best_auc = 0
best_idx = 999

for i in range(len(nn_lst)):
    print("\nNeural Network model", i+1)
    auc_sc = roc_auc_score(y_test, nn_lst[i].predict_proba(X_test)[:, 1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on test data {auc_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Neural Network classifier is classifier {best_idx} with roc_auc_score {best_auc:.4f} on test data")


Neural Network model 1
roc_auc_score on test data 0.9297

----------------------------------------------------------------------------------

Neural Network model 2
roc_auc_score on test data 0.9339

----------------------------------------------------------------------------------

Neural Network model 3
roc_auc_score on test data 0.9350

----------------------------------------------------------------------------------

Neural Network model 4
roc_auc_score on test data 0.9338

----------------------------------------------------------------------------------

Neural Network model 5
roc_auc_score on test data 0.9335

----------------------------------------------------------------------------------

Neural Network model 6
roc_auc_score on test data 0.9338

----------------------------------------------------------------------------------

Neural Network model 7
roc_auc_score on test data 0.9285

----------------------------------------------------------------------------------

Neura

In [111]:
import joblib
joblib.dump(clf_nn5, "best_neural_network_model_one_hot_classifier5.sav")

['best_neural_network_model_one_hot_classifier5.sav']

In [112]:
clf_load = joblib.load("best_neural_network_model_one_hot_classifier5.sav")

In [113]:
clf_load

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(handle_unknown='ignore',
                                                                sparse=False),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fd540619910>)])),
                ('mlpclassifier',
                 MLPClassifier(alpha=0.1, hidden_layer_sizes=(20, 4),
                               max_iter=600))])

In [114]:
classification_report(y_cv, clf_load.predict(X_cv), output_dict=True)

{'0': {'precision': 0.9533826037521319,
  'recall': 0.9382693024990675,
  'f1-score': 0.9457655794717549,
  'support': 5362},
 '1': {'precision': 0.5968331303288672,
  'recall': 0.6657608695652174,
  'f1-score': 0.6294155427103404,
  'support': 736},
 'accuracy': 0.9053788127254838,
 'macro avg': {'precision': 0.7751078670404996,
  'recall': 0.8020150860321424,
  'f1-score': 0.7875905610910476,
  'support': 6098},
 'weighted avg': {'precision': 0.9103487545491927,
  'recall': 0.9053788127254838,
  'f1-score': 0.907583613736038,
  'support': 6098}}

### The best neural network classifier is classifier 5 with Micro-Average F1_score 0.7875905610910476 on cv data

### The best neural network classifier is classifier 5 with Micro-Average F1_score 0.7767775891179305 on test data