data: https://archive.ics.uci.edu/ml/datasets/Bank+Marketing

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df_ori = pd.read_csv("bank-additional/bank-additional-full.csv",  sep=';')
df_ori.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [3]:
df=pd.read_csv('bank_additional_full_clean_normalized.csv')
df.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,1.642226,0,2,1,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
1,-0.196449,1,2,4,0,1,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
2,0.093868,2,2,2,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
3,1.642226,1,2,4,0,0,1,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0
4,1.932543,2,2,5,0,0,0,0,5,1,...,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641,0


In [4]:
X = df.iloc[:, 0:-1]
y = df.y

In [5]:
X.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,1.642226,0,2,1,0,0,0,0,5,1,0.005792,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
1,-0.196449,1,2,4,0,1,0,0,5,1,-0.127941,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
2,0.093868,2,2,2,0,0,0,0,5,1,-0.414513,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
3,1.642226,1,2,4,0,0,1,0,5,1,0.181556,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641
4,1.932543,2,2,5,0,0,0,0,5,1,-0.460365,-0.559326,0.211884,-0.37161,1,0.727466,0.804082,0.877437,0.786089,0.401641


In [6]:
X.pdays.max(), X.pdays.min()

(0.2118836362185938, -4.749052297919588)

In [7]:
y.head(2)

0    0
1    0
Name: y, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

In [9]:
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.4, random_state=101)

In [10]:
X_cv, X_test, y_cv, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=101)

In [11]:
from sklearn.metrics import classification_report

In [12]:
df_ori.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [13]:
categorical_columns_subset = [
    'job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome'
]

numerical_columns_subset = [
    'duration', 'campaign', 'pdays',
    'previous', 'emp.var.rate', 'cons.price.idx', 
    'cons.conf.idx', 'euribor3m', 'nr.employed'
]

X = X[categorical_columns_subset + numerical_columns_subset]
X[categorical_columns_subset] = X[categorical_columns_subset].astype("category")

n_categorical_features = X.select_dtypes(include="category").shape[1]
n_numerical_features = X.select_dtypes(include="number").shape[1]

print(f"Number of samples: {X.shape[0]}")
print(f"Number of features: {X.shape[1]}")
print(f"Number of categorical features: {n_categorical_features}")
print(f"Number of numerical features: {n_numerical_features}")

Number of samples: 30488
Number of features: 19
Number of categorical features: 10
Number of numerical features: 9


In [14]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector
from sklearn.pipeline import make_pipeline

one_hot_encoder = make_column_transformer(
    (
        OneHotEncoder(sparse=False, handle_unknown="ignore"),
        make_column_selector(dtype_include="category"),
    ),
    remainder="passthrough",
)

In [15]:
from sklearn.metrics import roc_curve, auc, roc_auc_score
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier

In [34]:
clf_lr1 = make_pipeline(one_hot_encoder,  LogisticRegression(C=1.0, class_weight='balanced', max_iter=600))
clf_lr1 = clf_lr1.fit(X_train, y_train)

In [35]:
clf_lr2 = make_pipeline(one_hot_encoder,  LogisticRegression(C=3.0, class_weight='balanced', max_iter=600))
clf_lr2=clf_lr2.fit(X_train, y_train)

In [36]:
clf_lr3 = make_pipeline(one_hot_encoder,  LogisticRegression(C=10.0, class_weight='balanced', max_iter=600))
clf_lr3 = clf_lr3.fit(X_train, y_train)

In [38]:
clf_lr4 = make_pipeline(one_hot_encoder,  LogisticRegression(C=30.0, class_weight='balanced', max_iter=600))
clf_lr4 = clf_lr4.fit(X_train, y_train)

In [39]:
clf_lr5 = make_pipeline(one_hot_encoder,  LogisticRegression(C=5.0, class_weight='balanced', max_iter=600))
clf_lr5 = clf_lr5.fit(X_train, y_train)

In [40]:
clf_lr6 = make_pipeline(one_hot_encoder,  LogisticRegression(C=300.0, class_weight='balanced', max_iter=600))
clf_lr6 = clf_lr6.fit(X_train, y_train)

In [41]:
clf_lr7 = make_pipeline(one_hot_encoder,  LogisticRegression(C=1000.0, class_weight='balanced', max_iter=600))
clf_lr7 = clf_lr7.fit(X_train, y_train)

In [42]:
clf_lr8 = make_pipeline(one_hot_encoder,  LogisticRegression(C=3000.0, class_weight='balanced', max_iter=600))
clf_lr8 = clf_lr8.fit(X_train, y_train)

In [43]:
clf_lr9 = make_pipeline(one_hot_encoder,  LogisticRegression(C=0.5, class_weight='balanced', max_iter=600))
clf_lr9 = clf_lr9.fit(X_train, y_train)

In [44]:
lr_lst = [clf_lr1, clf_lr2, clf_lr3,clf_lr4,clf_lr5, clf_lr6, clf_lr7, clf_lr8, clf_lr9]
cc_lst = [1, 3, 10, 30, 100, 300, 1000, 3000, 0.5, 1]

In [45]:
best_f1 = 0
best_idx = 999

for i in range(len(lr_lst)):
    print("Logistic Regression model", i+1, ", C=", cc_lst[i])
#           "\nClassification report on trainng data\n", 
#           classification_report(y_sc_train, lr_lst[i].predict(X_sc_train)))

    cl_rep = classification_report(y_cv, lr_lst[i].predict(X_cv), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on cross-validation data:')
    print(f"f1-score of macro average: {f1_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best logistic Regression classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.4f}")


Logistic Regression model 1 , C= 1
Classification report on cross-validation data:
f1-score of macro average: 0.7453

----------------------------------------------------------------------------------
Logistic Regression model 2 , C= 3
Classification report on cross-validation data:
f1-score of macro average: 0.7464

----------------------------------------------------------------------------------
Logistic Regression model 3 , C= 10
Classification report on cross-validation data:
f1-score of macro average: 0.7463

----------------------------------------------------------------------------------
Logistic Regression model 4 , C= 30
Classification report on cross-validation data:
f1-score of macro average: 0.7459

----------------------------------------------------------------------------------
Logistic Regression model 5 , C= 100
Classification report on cross-validation data:
f1-score of macro average: 0.7463

--------------------------------------------------------------------------

In [46]:
best_f1 = 0
best_idx = 999

for i in range(len(lr_lst)):
    print("Logistic Regression model", i+1, ", C=", cc_lst[i])
#           "\nClassification report on trainng data\n", 
#           classification_report(y_sc_train, lr_lst[i].predict(X_sc_train)))

    cl_rep = classification_report(y_test, lr_lst[i].predict(X_test), output_dict=True)
    f1_sc = cl_rep['macro avg']['f1-score']
    
    if f1_sc > best_f1:
        best_f1 = f1_sc
        best_idx = i+1
    print('Classification report on test data:')
    print(f"f1-score of macro average: {f1_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"\nThe best logistic Regression classifier is classifier {best_idx} with Micro-Average F1_score {best_f1:.4f} on test data")


Logistic Regression model 1 , C= 1
Classification report on test data:
f1-score of macro average: 0.7450

----------------------------------------------------------------------------------
Logistic Regression model 2 , C= 3
Classification report on test data:
f1-score of macro average: 0.7446

----------------------------------------------------------------------------------
Logistic Regression model 3 , C= 10
Classification report on test data:
f1-score of macro average: 0.7445

----------------------------------------------------------------------------------
Logistic Regression model 4 , C= 30
Classification report on test data:
f1-score of macro average: 0.7445

----------------------------------------------------------------------------------
Logistic Regression model 5 , C= 100
Classification report on test data:
f1-score of macro average: 0.7446

----------------------------------------------------------------------------------
Logistic Regression model 6 , C= 300
Classification

####  Logistic Regression model 10 performed much better with balanced weight

In [47]:
best_auc = 0
best_idx = 999

for i in range(len(lr_lst)):
    print("\nLogistic Regression model", i+1)
    auc_sc = roc_auc_score(y_cv, lr_lst[i].predict_proba(X_cv)[:, 1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on cross-validation data {auc_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Logistic Regression classifier is classifier {best_idx} with roc_auc_score {best_auc:.4f}")


Logistic Regression model 1
roc_auc_score on cross-validation data 0.9247

----------------------------------------------------------------------------------

Logistic Regression model 2
roc_auc_score on cross-validation data 0.9246

----------------------------------------------------------------------------------

Logistic Regression model 3
roc_auc_score on cross-validation data 0.9246

----------------------------------------------------------------------------------

Logistic Regression model 4
roc_auc_score on cross-validation data 0.9246

----------------------------------------------------------------------------------

Logistic Regression model 5
roc_auc_score on cross-validation data 0.9246

----------------------------------------------------------------------------------

Logistic Regression model 6
roc_auc_score on cross-validation data 0.9246

----------------------------------------------------------------------------------

Logistic Regression model 7
roc_auc_score on 

In [48]:
best_auc = 0
best_idx = 999

for i in range(len(lr_lst)):
    print("\nLogistic Regression model", i+1)
    auc_sc = roc_auc_score(y_test, lr_lst[i].predict_proba(X_test)[:, 1])
    
    if auc_sc > best_auc:
        best_auc = auc_sc
        best_idx = i+1
    
    print(f"roc_auc_score on test data {auc_sc:.4f}\n")
    print("----------------------------------------------------------------------------------")
    
print(f"The best Logistic Regression classifier is classifier {best_idx} with roc_auc_score {best_auc:.4f}")


Logistic Regression model 1
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 2
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 3
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 4
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 5
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 6
roc_auc_score on test data 0.9217

----------------------------------------------------------------------------------

Logistic Regression model 7
roc_auc_score on test data 0.9217

------------------------------------------------------

In [53]:
import joblib
joblib.dump(clf_lr2, "best_logistic_regression_model_C3_balanced_wt.sav")
clf_load = joblib.load("best_logistic_regression_model_C3_balanced_wt.sav")

In [54]:
classification_report(y_cv, clf_load.predict(X_cv), output_dict=True)

{'0': {'precision': 0.9775112443778111,
  'recall': 0.8511749347258486,
  'f1-score': 0.9099790648988137,
  'support': 5362},
 '1': {'precision': 0.44156752974107766,
  'recall': 0.8573369565217391,
  'f1-score': 0.5829099307159353,
  'support': 736},
 'accuracy': 0.8519186618563463,
 'macro avg': {'precision': 0.7095393870594444,
  'recall': 0.8542559456237939,
  'f1-score': 0.7464444978073745,
  'support': 6098},
 'weighted avg': {'precision': 0.9128253516305767,
  'recall': 0.8519186618563463,
  'f1-score': 0.8705033543775611,
  'support': 6098}}

In [55]:
roc_auc_score(y_test, clf_load.predict_proba(X_test)[:, 1])

0.9216846577256163