data: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_ori = pd.read_csv("bank-additional/bank-additional-full.csv",  sep=';')
df_ori.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df=pd.read_csv('bank_additional_full_clean_normalized.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.642226,0,2,1,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
1,-0.196449,1,2,4,0,1,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
2,0.093868,2,2,2,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
3,1.642226,1,2,4,0,0,1,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
4,1.932543,2,2,5,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0


In [4]:
X = df.iloc[:, 0:-1]
y = df.y

In [5]:
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.642226,0,2,1,0,0,0,0,5,1,0.005792,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
1,-0.196449,1,2,4,0,1,0,0,5,1,-0.127941,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
2,0.093868,2,2,2,0,0,0,0,5,1,-0.414513,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
3,1.642226,1,2,4,0,0,1,0,5,1,0.181556,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
4,1.932543,2,2,5,0,0,0,0,5,1,-0.460365,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641


In [6]:
X.pdays.max(), X.pdays.min()

(0.2118836362185938, -4.749052297919588)

In [7]:
y.head(2)

0    0
1    0
Name: y, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=101)

In [10]:
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=101)

In [11]:
from sklearn.svm import SVC

In [12]:
from sklearn.metrics import classification_report

In [13]:
df_ori.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [14]:
categorical_columns_subset = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome'
]

numerical_columns_subset = [
    'duration', 'campaign', 'pdays',
    'previous', 'emp.var.rate', 'cons.price.idx', 
    'cons.conf.idx', 'euribor3m', 'nr.employed'
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")

n_categorical_features = X.select_dtypes(include="category").shape[1]
n_numerical_features = X.select_dtypes(include="number").shape[1]

print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

Number of samples: 30488
Number of features: 19
Number of categorical features: 10
Number of numerical features: 9


In [15]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline

one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(sparse=False, handle_unknown="ignore"),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)



In [16]:
svc1 = make_pipeline(
    one_hot_encoder, SVC(C=1.0, probability=True)
)
clf_svc1 = svc1.fit(X_train, y_train)

In [17]:
svc2 = make_pipeline(
    one_hot_encoder, SVC(C=3.0, probability=True)
)
clf_svc2 = svc2.fit(X_train, y_train)

In [18]:
svc3 = make_pipeline(
    one_hot_encoder, SVC(C=10.0, probability=True)
)
clf_svc3 = svc3.fit(X_train, y_train)

In [19]:
svc4 = make_pipeline(
    one_hot_encoder, SVC(C=30.0, probability=True)
)
clf_svc4 = svc4.fit(X_train, y_train)

In [20]:
svc5 = make_pipeline(
    one_hot_encoder, SVC(C=100.0, probability=True)
)
clf_svc5 = svc5.fit(X_train, y_train)

In [21]:
svc6 = make_pipeline(
    one_hot_encoder, SVC(C=300.0, probability=True)
)
clf_svc6 = svc6.fit(X_train, y_train)

In [22]:
svc7 = make_pipeline(
    one_hot_encoder, SVC(C=1000.0, probability=True)
)
clf_svc7 = svc7.fit(X_train, y_train)

In [23]:
svc_lst = [clf_svc1, clf_svc2, clf_svc3, clf_svc4, clf_svc5, clf_svc6, clf_svc7]
svc_c = [1, 3, 10, 30, 100, 300, 1000, 3000]

In [24]:
best_f1 = 0
best_idx = 999
for i in range(len(svc_lst)):
#     print("svc model, C=", svc_c[i], "\nClassification report on trainng data\n", 
#           classification_report(y_train, svc_lst[i].predict(X_train)))
    print(f"SVM classifier {i+1}, C={svc_c[i]}")
    cl_rep = classification_report(y_cv, svc_lst[i].predict(X_cv), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on cross-validation data:')
    print(f"f1-score of macro average: \n {f1_sc:.4f}")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best SVM classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.4f}")
    

SVM classifier 1, C=1
Classification report on cross-validation data:
f1-score of macro average: 
 0.6878
----------------------------------------------------------------------------------
SVM classifier 2, C=3
Classification report on cross-validation data:
f1-score of macro average: 
 0.7115
----------------------------------------------------------------------------------
SVM classifier 3, C=10
Classification report on cross-validation data:
f1-score of macro average: 
 0.7269
----------------------------------------------------------------------------------
SVM classifier 4, C=30
Classification report on cross-validation data:
f1-score of macro average: 
 0.7411
----------------------------------------------------------------------------------
SVM classifier 5, C=100
Classification report on cross-validation data:
f1-score of macro average: 
 0.7411
----------------------------------------------------------------------------------
SVM classifier 6, C=300
Classification report on cr

In [35]:
best_f1 = 0
best_idx = 999
for i in range(len(svc_lst)):
#     print("svc model, C=", svc_c[i], "\nClassification report on trainng data\n", 
#           classification_report(y_train, svc_lst[i].predict(X_train)))
    print(f"SVM classifier {i+1}, C={svc_c[i]}")
    cl_rep = classification_report(y_test, svc_lst[i].predict(X_test), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on Test data:')
    print(f"f1-score of macro average: \n {f1_sc:.4f}")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best SVM classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.4f}")

SVM classifier 1, C=1
Classification report on Test data:
f1-score of macro average: 
 0.6785
----------------------------------------------------------------------------------
SVM classifier 2, C=3
Classification report on Test data:
f1-score of macro average: 
 0.6958
----------------------------------------------------------------------------------
SVM classifier 3, C=10
Classification report on Test data:
f1-score of macro average: 
 0.7227
----------------------------------------------------------------------------------
SVM classifier 4, C=30
Classification report on Test data:
f1-score of macro average: 
 0.7323
----------------------------------------------------------------------------------
SVM classifier 5, C=100
Classification report on Test data:
f1-score of macro average: 
 0.7339
----------------------------------------------------------------------------------
SVM classifier 6, C=300
Classification report on Test data:
f1-score of macro average: 
 0.7389
---------------

In [25]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

In [26]:
best_auc = 0
best_idx = 999

for i in range(len(svc_lst)):
    print(f"SVM classifier {i+1}, C={svc_c[i]}")
    auc_sc = roc_auc_score(y_cv, svc_lst[i].predict_proba(X_cv)[:,1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print("roc_auc_score on cross-validation data\n", auc_sc)
    print("----------------------------------------------------------------------------------")
    
print(f"The SVM classifier is classifier {best_idx} with roc_auc_score {best_auc}")
#     print("\nsvc model",i+1, ", C =", svc_c[i], "\n roc_auc_score on trainng data\n", 
#           roc_auc_score(y_train, svc_lst[i].predict(X_train)))

SVM classifier 1, C=1
roc_auc_score on cross-validation data
 0.9309317885117493
----------------------------------------------------------------------------------
SVM classifier 2, C=3
roc_auc_score on cross-validation data
 0.931753543453935
----------------------------------------------------------------------------------
SVM classifier 3, C=10
roc_auc_score on cross-validation data
 0.9287058537940095
----------------------------------------------------------------------------------
SVM classifier 4, C=30
roc_auc_score on cross-validation data
 0.9194505315180903
----------------------------------------------------------------------------------
SVM classifier 5, C=100
roc_auc_score on cross-validation data
 0.9133858128050856
----------------------------------------------------------------------------------
SVM classifier 6, C=300
roc_auc_score on cross-validation data
 0.9020716687884145
----------------------------------------------------------------------------------
SVM classif

In [34]:
best_auc = 0
best_idx = 999

for i in range(len(svc_lst)):
    print(f"SVM classifier {i+1}, C={svc_c[i]}")
    auc_sc = roc_auc_score(y_test, svc_lst[i].predict_proba(X_test)[:,1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print("roc_auc_score on Test data\n", auc_sc)
    print("----------------------------------------------------------------------------------")
    
print(f"The SVM classifier is classifier {best_idx} with roc_auc_score {best_auc}")
#     print("\nsvc model",i+1, ", C =", svc_c[i], "\n roc_auc_score on trainng data\n", 
#           roc_auc_score(y_train, svc_lst[i].predict(X_train)))

SVM classifier 1, C=1
roc_auc_score on Test data
 0.9298477602299016
----------------------------------------------------------------------------------
SVM classifier 2, C=3
roc_auc_score on Test data
 0.9300938888121427
----------------------------------------------------------------------------------
SVM classifier 3, C=10
roc_auc_score on Test data
 0.9257249848723341
----------------------------------------------------------------------------------
SVM classifier 4, C=30
roc_auc_score on Test data
 0.916693744053514
----------------------------------------------------------------------------------
SVM classifier 5, C=100
roc_auc_score on Test data
 0.910534692456013
----------------------------------------------------------------------------------
SVM classifier 6, C=300
roc_auc_score on Test data
 0.9033682647837669
----------------------------------------------------------------------------------
SVM classifier 7, C=1000
roc_auc_score on Test data
 0.8976099017625918
------------

### clf_svc4 (C=30) had the bestmacro average F1 on the cross-validation data. 
### clf_svc6 (C=300) had the bestmacro average F1 on test data.  
### clf_svc2 (C=3) had the best roc_auc_score on the cross-validation data. 
### clf_svc2 (C=3) had the best roc_auc_score on the Test data.

In [38]:
from joblib import dump, load
dump(clf_svc4, "best_SVC_model_one_hot_C30.sav")

['best_SVC_model_one_hot_C30.sav']

In [39]:
clf_load = load("best_SVC_model_one_hot_C30.sav")

In [40]:
classification_report(y_cv, clf_load.predict(X_cv), output_dict=True)

{'0': {'precision': 0.9266903914590747,
  'recall': 0.9712793733681462,
  'f1-score': 0.9484611181934073,
  'support': 5362},
 '1': {'precision': 0.6778242677824268,
  'recall': 0.44021739130434784,
  'f1-score': 0.5337726523887973,
  'support': 736},
 'accuracy': 0.907182682846835,
 'macro avg': {'precision': 0.8022573296207507,
  'recall': 0.705748382336247,
  'f1-score': 0.7411168852911023,
  'support': 6098},
 'weighted avg': {'precision': 0.8966534175289316,
  'recall': 0.907182682846835,
  'f1-score': 0.898410165285537,
  'support': 6098}}

In [41]:
roc_auc_score(y_test, clf_load.predict_proba(X_test)[:, 1])

0.916693744053514

In [42]:
roc_auc_score(y_cv, clf_load.predict_proba(X_cv)[:, 1])

0.9194505315180903