# Benchmarking the Logistic Regression Model on the Dataset

In [1]:
# import packages
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# url path
url_path = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter13/Dataset/bank-full.csv'

In [3]:
# load the data
df = pd.read_csv(url_path, sep=';')
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Normalize the numerical features
from sklearn.preprocessing import RobustScaler
rob_scaler = RobustScaler()

In [5]:
# convert each of the columns to a scaled version
df['ageScaled'] = rob_scaler.fit_transform(df['age'].values.reshape(-1, 1))
df['balScaled'] = rob_scaler.fit_transform(df['balance'].values.reshape(-1, 1))
df['durScaled'] = rob_scaler.fit_transform(df['duration'].values.reshape(-1, 1))

In [6]:
# drop the original features after we introduce the scaled features 
df.drop(['age', 'balance', 'duration'], axis=1, inplace=True)

In [7]:
df.head()

Unnamed: 0,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,ageScaled,balScaled,durScaled
0,management,married,tertiary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,1.266667,1.25,0.375
1,technician,single,secondary,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.333333,-0.308997,-0.134259
2,entrepreneur,married,secondary,no,yes,yes,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.328909,-0.481481
3,blue-collar,married,unknown,no,yes,no,unknown,5,may,1,-1,0,unknown,no,0.533333,0.780236,-0.407407
4,unknown,single,unknown,no,no,no,unknown,5,may,1,-1,0,unknown,no,-0.4,-0.329646,0.083333


In [8]:
# Convert all the categorical variables to dummy variables
dfCat = pd.get_dummies(df[['job','marital','education','default','housing','loan','contact','month','poutcome']])

In [9]:
# Separate the numerical data and observe the shape
dfNum = df[['ageScaled','balScaled','day','durScaled','campaign','pdays','previous']]
dfNum.shape

(45211, 7)

In [10]:
# Create the independent variables, X, and dependent variables, Y, from the combined dataset for modeling
X = pd.concat([dfCat, dfNum], axis=1)
X.shape

(45211, 51)

In [11]:
Y = df['y']
Y.shape

(45211,)

In [12]:
# import packages
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=123)

In [14]:
# fit the model
bankModel = LogisticRegression()
bankModel.fit(X_train, y_train)

LogisticRegression()

In [15]:
# make predictions
pred = bankModel.predict(X_test)

In [16]:
# calculate accuracy
print(f'Accuracy of Logistic regression model prediction on test set: {bankModel.score(X_test, y_test)}')

Accuracy of Logistic regression model prediction on test set: 0.8989236213506341


In [17]:
# calculate other metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [18]:
print(confusion_matrix(y_test, pred))

[[11697   301]
 [ 1070   496]]


In [19]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.92      0.97      0.94     11998
         yes       0.62      0.32      0.42      1566

    accuracy                           0.90     13564
   macro avg       0.77      0.65      0.68     13564
weighted avg       0.88      0.90      0.88     13564



# Implementing Random Undersampling and Classification on Our Banking Dataset to Find the Optimal Result

In [20]:
# join the X and y variables for the training set before resampling
trainData = pd.concat([X_train, y_train], axis=1)

What we will do next is separate the minority class and the majority class. This is required because we have to sample separately from the majority class to make a balanced dataset. To separate the minority class, we have to identify the indexes of the dataset where the dataset has 'yes.'

In [21]:
# find the indexes of the sample dataset where the propensity is yes
ind = trainData[trainData['y'] == 'yes'].index
print(len(ind))

3723


In [22]:
# Separate by the minority class
minData = trainData.loc[ind]
minData.shape

(3723, 52)

In [23]:
# find the indexes of the majority class
ind1 = trainData[trainData['y'] == 'no'].index
print(len(ind1))

27924


In [24]:
# Separate by the majority class
majData = trainData.loc[ind1]
majData.shape

(27924, 52)

Once the majority class is separated, we can proceed with sampling from the majority class. Once the sampling is done, the shape of the majority class dataset and its head are printed.

In [25]:
# Extract the samples
majSample = majData.sample(n=len(ind), random_state=123)

In [26]:
# preparing the individual dataset, we can now concatenate them together
balData = pd.concat([minData, majSample], axis=0)

In [27]:
# shuffle the dataset so that both the minority and majority classes
from sklearn.utils import shuffle
balData = shuffle(balData)

In [28]:
# separate the shuffled dataset into the independent variables and dependent variables
X_trainNew = balData.iloc[:, 0:51]
y_trainNew = balData['y']

In [29]:
# create the model
bankModel1 = LogisticRegression()
bankModel1.fit(X_trainNew, y_trainNew)

LogisticRegression()

In [30]:
# make prediction
pred = bankModel1.predict(X_test)

In [31]:
# calculate accuracy
print(f'Accuracy of Logistic Regression model prediction on test set for balanced data set: {bankModel1.score(X_test, y_test)}')

Accuracy of Logistic Regression model prediction on test set for balanced data set: 0.8299174284871719


In [32]:
# generate the confusion matrix and classification report
print(confusion_matrix(y_test, pred))

[[9969 2029]
 [ 278 1288]]


In [33]:
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.97      0.83      0.90     11998
         yes       0.39      0.82      0.53      1566

    accuracy                           0.83     13564
   macro avg       0.68      0.83      0.71     13564
weighted avg       0.91      0.83      0.85     13564



# Implementing SMOTE on Our Banking Dataset to Find the Optimal Result

In [34]:
# print the count of both the classes before we oversample
print(f"Before OverSampling count of yes: {sum(y_train=='yes')}")
print(f"Before OverSampling count of no: {sum(y_train=='no')}")

Before OverSampling count of yes: 3723
Before OverSampling count of no: 27924


In [35]:
# import packages
import smote_variants as sv
import numpy as np

In [36]:
# instantiate the SMOTE library
oversampler = sv.SMOTE()

In [37]:
# sample the process 
X_train_os, y_train_os = oversampler.sample(np.array(X_train), np.array(y_train))

2021-03-06 00:34:15,555:INFO:SMOTE: Running sampling via ('SMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


In [38]:
# print the shapes of the new X and y variables
print(f'After OverSampling, the shape of train_X: {X_train_os.shape}')
print(f'After OverSampling, the shape of train_y: {y_train_os.shape} \n')
print(f"After OverSampling, counts of label 'Yes': {sum(y_train_os == 'yes')}")
print(f"After OverSampling, counts of label 'no': {sum(y_train_os == 'no')}")

After OverSampling, the shape of train_X: (55848, 51)
After OverSampling, the shape of train_y: (55848,) 

After OverSampling, counts of label 'Yes': 27924
After OverSampling, counts of label 'no': 27924


In [39]:
# create a model
bankModel2 = LogisticRegression()
bankModel2.fit(X_train_os, y_train_os)

LogisticRegression()

In [40]:
# make predicitons
pred = bankModel2.predict(X_test)

In [41]:
# print the accuracy values
print(f'Accuracy of Logistic Regression model prediction on test set for SMOTE balanced data set: {bankModel2.score(X_test, y_test)}')

Accuracy of Logistic Regression model prediction on test set for SMOTE balanced data set: 0.8437039221468593


In [42]:
# print confusion matrix
print(confusion_matrix(y_test, pred))

[[10210  1788]
 [  332  1234]]


In [43]:
# print classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.97      0.85      0.91     11998
         yes       0.41      0.79      0.54      1566

    accuracy                           0.84     13564
   macro avg       0.69      0.82      0.72     13564
weighted avg       0.90      0.84      0.86     13564



# Implementing MSMOTE on Our Banking Dataset to Find the Optimal Result

In [44]:
# instantiate MSMOTE
oversampler = sv.MSMOTE()

In [45]:
# creating new training set
X_train_os, y_train_os = oversampler.sample(np.array(X_train), np.array(y_train))

2021-03-06 00:34:21,020:INFO:MSMOTE: Running sampling via ('MSMOTE', "{'proportion': 1.0, 'n_neighbors': 5, 'n_jobs': 1, 'random_state': None}")


In [46]:
# print the shapes of the new X and y variables
print(f'After OverSampling, the shape of train_X: {X_train_os.shape}')
print(f'After OverSampling, the shape of train_y: {y_train_os.shape} \n')
print(f"After OverSampling, counts of label 'Yes': {sum(y_train_os=='yes')}")
print(f"After OverSampling, counts of label 'no': {sum(y_train_os=='no')}")

After OverSampling, the shape of train_X: (55848, 51)
After OverSampling, the shape of train_y: (55848,) 

After OverSampling, counts of label 'Yes': 27924
After OverSampling, counts of label 'no': 27924


In [47]:
# create the model
bankModel3 = LogisticRegression()
bankModel3.fit(X_train_os, y_train_os)

LogisticRegression()

In [48]:
# make predictions
pred = bankModel3.predict(X_test)

In [49]:
# print the accuracy of the model
print(f'Accuracy of Logistic Regression model prediction on test set for MSMOTE balanced data set: {bankModel3.score(X_test, y_test)}')

Accuracy of Logistic Regression model prediction on test set for MSMOTE balanced data set: 0.8482748451784135


In [50]:
# print confusion matrix
print(confusion_matrix(y_test, pred))

[[10277  1721]
 [  337  1229]]


In [51]:
# print classification report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

          no       0.97      0.86      0.91     11998
         yes       0.42      0.78      0.54      1566

    accuracy                           0.85     13564
   macro avg       0.69      0.82      0.73     13564
weighted avg       0.90      0.85      0.87     13564

