In [1]:
# Fine-Grained Approach to Scaling, Encoding, and Evaluating Logistic Regression on Bank Marketing Data

In [2]:
# Step 1: Import Libraries and Load the Dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
url = "https://raw.githubusercontent.com/fenago/datasets/main/bank-full.csv"

bankData = pd.read_csv(url, sep=";")
bankData.sample(15)

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
10106,50,management,married,unknown,no,8345,no,no,unknown,11,jun,561,1,-1,0,unknown,no
35664,40,self-employed,married,secondary,no,167,yes,yes,cellular,8,may,380,4,295,1,failure,no
19022,59,retired,married,primary,no,1634,yes,yes,cellular,5,aug,836,1,-1,0,unknown,yes
23503,35,technician,married,secondary,no,0,no,no,cellular,28,aug,19,14,-1,0,unknown,no
3698,33,services,single,secondary,no,0,yes,no,unknown,16,may,212,1,-1,0,unknown,no
15897,51,admin.,married,secondary,no,907,yes,no,telephone,21,jul,722,3,-1,0,unknown,no
40404,69,retired,divorced,primary,no,482,no,no,cellular,2,jul,260,2,-1,0,unknown,yes
8952,38,technician,married,secondary,no,625,yes,no,unknown,4,jun,140,2,-1,0,unknown,no
22720,39,management,single,tertiary,no,0,no,no,cellular,25,aug,342,5,-1,0,unknown,no
43223,68,retired,married,primary,no,451,no,no,cellular,5,mar,373,1,-1,0,unknown,yes


In [4]:
#Step 2: Preprocessing the Data
bankData.fillna(0, inplace=True)
print(bankData.nunique())

age            77
job            12
marital         3
education       4
default         2
balance      7168
housing         2
loan            2
contact         3
day            31
month          12
duration     1573
campaign       48
pdays         559
previous       41
poutcome        4
y               2
dtype: int64


In [5]:
# 2. Scaling the Numeric Features
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()

# Scaling numeric features
bankData['ageScaled'] = rob_scaler.fit_transform(bankData['age'].values.reshape(-1,1))
bankData['balScaled'] = rob_scaler.fit_transform(bankData['balance'].values.reshape(-1,1))
bankData['durScaled'] = rob_scaler.fit_transform(bankData['duration'].values.reshape(-1,1))

# Dropping the original columns
bankData.drop(['age', 'balance', 'duration'], axis=1, inplace=True)
bankData.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,management,married,tertiary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,1.266667,1.25,0.375
1,technician,single,secondary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,blue-collar,married,unknown,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,unknown,single,unknown,no,no,no,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [6]:
# Step 3: One-Hot Encoding for Categorical Variables
bankCat = pd.get_dummies(bankData[['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']])
bankCat.head()

Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,month_jun,month_mar,month_may,month_nov,month_oct,month_sep,poutcome_failure,poutcome_other,poutcome_success,poutcome_unknown
0,False,False,False,False,True,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
1,False,False,False,False,False,False,False,False,False,True,...,False,False,True,False,False,False,False,False,False,True
2,False,False,True,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
3,False,True,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True
4,False,False,False,False,False,False,False,False,False,False,...,False,False,True,False,False,False,False,False,False,True


In [7]:
# Step 4: 4. Preparing the Feature Set (X) and Target (Y)
# Numeric features
bankNum = bankData[['ageScaled', 'balScaled', 'day', 'durScaled', 'campaign', 'pdays', 'previous']]

# Merging numeric and categorical features
X = pd.concat([bankCat, bankNum], axis=1)
print(X.shape)

# Preparing the target variable
Y = bankData['y']
print(Y.shape)

X.head()

(45211, 51)
(45211,)


Unnamed: 0,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,job_management,job_retired,job_self-employed,job_services,job_student,job_technician,...,poutcome_other,poutcome_success,poutcome_unknown,ageScaled,balScaled,day,durScaled,campaign,pdays,previous
0,False,False,False,False,True,False,False,False,False,False,...,False,False,True,1.266667,1.25,5,0.375,1,-1,0
1,False,False,False,False,False,False,False,False,False,True,...,False,False,True,0.333333,-0.308997,5,-0.134259,1,-1,0
2,False,False,True,False,False,False,False,False,False,False,...,False,False,True,-0.4,-0.328909,5,-0.481481,1,-1,0
3,False,True,False,False,False,False,False,False,False,False,...,False,False,True,0.533333,0.780236,5,-0.407407,1,-1,0
4,False,False,False,False,False,False,False,False,False,False,...,False,False,True,-0.4,-0.329646,5,0.083333,1,-1,0


In [8]:
# Step 5: Train-Test Split

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [9]:
# 6. Logistic Regression Model Training and Prediction

from sklearn.linear_model import LogisticRegression
bankModel = LogisticRegression(max_iter=100)
bankModel.fit(X_train, y_train)

# Predictions on the test set
pred = bankModel.predict(X_test)

# Model accuracy
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(bankModel.score(X_test, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.90


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [10]:
# 7. Confusion Matrix and Classification Report
from sklearn.metrics import confusion_matrix, classification_report

# Confusion matrix
confusionMatrix = confusion_matrix(y_test, pred)
print("Confusion Matrix:\n", confusionMatrix)

# Classification report
print(classification_report(y_test, pred))

Confusion Matrix:
 [[11700   298]
 [ 1086   480]]
              precision    recall  f1-score   support

          no       0.92      0.98      0.94     11998
         yes       0.62      0.31      0.41      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.64      0.68     13564
weighted avg       0.88      0.90      0.88     13564



In [11]:
# 8. Class Distribution

print('Percentage of positive class:', (y_train[y_train == 'yes'].value_counts() / len(y_train)) * 100)
print('Percentage of negative class:', (y_train[y_train == 'no'].value_counts() / len(y_train)) * 100)

Percentage of positive class: y
yes    11.764148
Name: count, dtype: float64
Percentage of negative class: y
no    88.235852
Name: count, dtype: float64
