## 1) Importing important libraries

In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import  confusion_matrix
from sklearn.metrics import classification_report

## 2) Importing data

In [5]:
# Load the bank marketing data; the file uses ';' as the field separator.
bank_data = pd.read_csv("bank-full.csv",sep= ";")
# Preview the first rows to check that the columns were parsed correctly.
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


## 3) Data Understanding

In [6]:
# (number of rows, number of columns) of the loaded frame.
bank_data.shape

(45211, 17)

In [7]:
# Column dtypes: 'object' columns hold text and must be encoded before modelling.
bank_data.dtypes

age           int64
job          object
marital      object
education    object
default      object
balance       int64
housing      object
loan         object
contact      object
day           int64
month        object
duration      int64
campaign      int64
pdays         int64
previous      int64
poutcome     object
y            object
dtype: object

In [8]:
# Count missing values per column.
bank_data.isnull().sum()

age          0
job          0
marital      0
education    0
default      0
balance      0
housing      0
loan         0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [9]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
bank_data.describe()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
count,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0,45211.0
mean,40.93621,1362.272058,15.806419,258.16308,2.763841,40.197828,0.580323
std,10.618762,3044.765829,8.322476,257.527812,3.098021,100.128746,2.303441
min,18.0,-8019.0,1.0,0.0,1.0,-1.0,0.0
25%,33.0,72.0,8.0,103.0,1.0,-1.0,0.0
50%,39.0,448.0,16.0,180.0,2.0,-1.0,0.0
75%,48.0,1428.0,21.0,319.0,3.0,-1.0,0.0
max,95.0,102127.0,31.0,4918.0,63.0,871.0,275.0


## 4) Data Preparation

In [13]:
# Encode the target column 'y' as integers; this overwrites the text labels in place.
# LabelEncoder numbers the classes in sorted order, so 'no' becomes 0 and 'yes' becomes 1.
le = LabelEncoder()
bank_data['y'] = le.fit_transform(bank_data['y'])

In [20]:
# Label-encode every categorical feature column in place.
# Each column gets its own fitted LabelEncoder, kept in `feature_encoders`, so the
# code -> category mapping of any column stays available afterwards through
# `feature_encoders[col].classes_` or `feature_encoders[col].inverse_transform(...)`.
# Refitting a single shared encoder would keep only the last column's mapping.
# NOTE(review): integer codes impose an arbitrary order on nominal features such as
# 'job' or 'month'; one-hot encoding may suit a linear model better - confirm.
categorical_features = ['job', 'marital', 'education', 'default', 'housing',
                        'contact', 'month', 'poutcome', 'loan']
feature_encoders = {}
for col in categorical_features:
    feature_encoders[col] = LabelEncoder()
    bank_data[col] = feature_encoders[col].fit_transform(bank_data[col])

In [21]:
# Preview the frame again: the former text columns should now hold integer codes.
bank_data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,4,1,2,0,2143,1,0,2,5,8,261,1,-1,0,3,0
1,44,9,2,1,0,29,1,0,2,5,8,151,1,-1,0,3,0
2,33,2,1,1,0,2,1,1,2,5,8,76,1,-1,0,3,0
3,47,1,1,3,0,1506,1,0,2,5,8,92,1,-1,0,3,0
4,33,11,2,3,0,1,0,0,2,5,8,198,1,-1,0,3,0


In [22]:
# Confirm that no 'object' columns remain, i.e. every column is numeric.
bank_data.dtypes

age          int64
job          int64
marital      int64
education    int64
default      int64
balance      int64
housing      int64
loan         int32
contact      int64
day          int64
month        int64
duration     int64
campaign     int64
pdays        int64
previous     int64
poutcome     int32
y            int64
dtype: object

## 5) Model Building

In [23]:
# Separate predictors from the target: X keeps every column except 'y',
# and y is the 'y' column on its own.
target_column = 'y'
X = bank_data.drop(columns=[target_column])
y = bank_data[target_column]

In [25]:
# Hold out 20% of the rows for testing; the fixed seed makes the split reproducible.
TEST_FRACTION = 0.20
SPLIT_SEED = 12
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)

In [27]:
# Shapes of the training features and the training labels, one per line.
for train_part in (X_train, y_train):
    print(train_part.shape)

(36168, 16)
(36168,)


In [28]:
# Shapes of the test features and the test labels, one per line.
for test_part in (X_test, y_test):
    print(test_part.shape)

(9043, 16)
(9043,)


## 6) Model Training

In [30]:
import warnings

# Logistic-regression classifier with scikit-learn's default settings.
logistic_model = LogisticRegression()

# Silence warnings only while fitting. catch_warnings() restores the previous
# warning filters on exit, so the following cells still report their own warnings
# instead of being muted for the whole kernel session.
# NOTE(review): this presumably hides a solver convergence warning caused by the
# unscaled features - confirm, and consider scaling the inputs or raising max_iter.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    logistic_model.fit(X_train, y_train)

# Show the fitted estimator as the cell output.
logistic_model

LogisticRegression()

## 7) Model Testing

In [31]:
# Predict class labels for the training rows (the data the model was fitted on).
y_pred_train = logistic_model.predict(X_train)
# Display the predicted label array.
y_pred_train

array([0, 0, 1, ..., 0, 0, 1], dtype=int64)

In [32]:
# Predict class labels for the held-out test rows.
y_pred_test =  logistic_model.predict(X_test)
# Display the predicted label array.
y_pred_test

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

## 8) Model Evaluation

In [33]:
# Training accuracy: share of training rows whose predicted label equals the true label.
# The score is computed once and reused in the printed message.
train_accuracy = accuracy_score(y_train, y_pred_train)
print("The accuracy score of training data of the given model = {}".format(train_accuracy))

The accuracy score of training data of the given model = 0.8869995576199956


In [34]:
# Testing accuracy: share of held-out rows whose predicted label equals the true label.
# The score is computed once and reused in the printed message.
test_accuracy = accuracy_score(y_test, y_pred_test)
print("The accuracy score of testing data of the given model = {}".format(test_accuracy))

The accuracy score of testing data of the given model = 0.8896383943381622


In [36]:
# Confusion matrix for the training split: rows are the true classes, columns the
# predicted classes (class 0 first), so the diagonal holds the correct predictions.
confusion_matrix(y_train,y_pred_train)

array([[31348,   581],
       [ 3506,   733]], dtype=int64)

Here, the model made 31348 + 733 = 32081 correct predictions and 3506 + 581 = 4087 incorrect predictions on the training data.

In [39]:
# Confusion matrix for the test split: rows are the true classes, columns the
# predicted classes (class 0 first), so the diagonal holds the correct predictions.
confusion_matrix(y_test,y_pred_test)

array([[7847,  146],
       [ 852,  198]], dtype=int64)

Here, the model made 7847 + 198 = 8045 correct predictions and 852 + 146 = 998 incorrect predictions on the testing data.

In [40]:
# Per-class precision, recall, F1 and support on the training split.
print(classification_report(y_train,y_pred_train))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94     31929
           1       0.56      0.17      0.26      4239

    accuracy                           0.89     36168
   macro avg       0.73      0.58      0.60     36168
weighted avg       0.86      0.89      0.86     36168



In [41]:
# Per-class precision, recall, F1 and support on the test split.
print(classification_report(y_test,y_pred_test))

              precision    recall  f1-score   support

           0       0.90      0.98      0.94      7993
           1       0.58      0.19      0.28      1050

    accuracy                           0.89      9043
   macro avg       0.74      0.59      0.61      9043
weighted avg       0.86      0.89      0.86      9043



#### Insight:
* The Logistic Regression Model of bank-full data is built.
* The training accuracy of the model is 0.8870 = 88.70%.
* The testing accuracy of the model is 0.8896 = 88.96%.
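* Precision, recall and F1 score are high for class 0 (0.90, 0.98 and 0.94 on both training and testing data), but weak for class 1: recall is only 0.17 (training) and 0.19 (testing) and F1 is 0.26 and 0.28, so the model misses most positive cases despite its high overall accuracy.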