In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the bank marketing CSV; the path is relative to the notebook's working directory.
dataset = pd.read_csv("./dataset/bank-additional-full.csv")
# Bare last expression: shows a rich preview of the frame (pandas truncates the middle rows/columns).
dataset

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,retired,married,professional.course,no,yes,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes
41184,46,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41185,56,retired,married,university.degree,no,yes,no,cellular,nov,fri,...,2,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,no
41186,44,technician,married,professional.course,no,no,no,cellular,nov,fri,...,1,999,0,nonexistent,-1.1,94.767,-50.8,1.028,4963.6,yes


In [3]:
dataset.shape

(41188, 21)

In [4]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   age             41188 non-null  int64  
 1   job             41188 non-null  object 
 2   marital         41188 non-null  object 
 3   education       41188 non-null  object 
 4   default         41188 non-null  object 
 5   housing         41188 non-null  object 
 6   loan            41188 non-null  object 
 7   contact         41188 non-null  object 
 8   month           41188 non-null  object 
 9   day_of_week     41188 non-null  object 
 10  duration        41188 non-null  int64  
 11  campaign        41188 non-null  int64  
 12  pdays           41188 non-null  int64  
 13  previous        41188 non-null  int64  
 14  poutcome        41188 non-null  object 
 15  emp.var.rate    41188 non-null  float64
 16  cons.price.idx  41188 non-null  float64
 17  cons.conf.idx   41188 non-null 

In [5]:
dataset.isna().any()

age               False
job               False
marital           False
education         False
default           False
housing           False
loan              False
contact           False
month             False
day_of_week       False
duration          False
campaign          False
pdays             False
previous          False
poutcome          False
emp.var.rate      False
cons.price.idx    False
cons.conf.idx     False
euribor3m         False
nr.employed       False
y                 False
dtype: bool

In [6]:
# Every object-dtype column of `dataset`, including the target `y`.
# Note: despite the name this is a DataFrame (a sub-frame), not a list of column names.
categorical_columns = dataset.select_dtypes(include="object")
categorical_columns

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome,y
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent,no
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent,no
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent,no
3,admin.,married,basic.6y,no,no,no,telephone,may,mon,nonexistent,no
4,services,married,high.school,no,no,yes,telephone,may,mon,nonexistent,no
...,...,...,...,...,...,...,...,...,...,...,...
41183,retired,married,professional.course,no,yes,no,cellular,nov,fri,nonexistent,yes
41184,blue-collar,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,no
41185,retired,married,university.degree,no,yes,no,cellular,nov,fri,nonexistent,no
41186,technician,married,professional.course,no,no,no,cellular,nov,fri,nonexistent,yes


<h2 style="color: orangered;">Descriptive Analysis of Categorical Data</h2>

In [7]:
def check_categories_in_categorical_data(category, data=None):
    """Print the distinct values and their frequencies for one categorical column.

    Parameters
    ----------
    category : str
        Name of the column to summarise.
    data : pandas.DataFrame, optional
        Frame to inspect. A column counts as categorical when it is selected by
        ``select_dtypes(include="object")``. When omitted, the notebook-level
        ``dataset`` is read and membership is checked against the
        notebook-level ``categorical_columns``, exactly as before.

    Returns
    -------
    None
        The report is written to stdout. If ``category`` is not a categorical
        column, a single "No such categorical column in dataset." line is
        printed instead.
    """
    if data is None:
        # Backward-compatible path: rely on the frames defined in earlier cells.
        frame, categorical = dataset, categorical_columns
    else:
        # Explicit frame: derive the categorical view from that same frame so
        # the membership check and the values read can never diverge.
        frame, categorical = data, data.select_dtypes(include="object")

    # `in` on a DataFrame tests column labels, not cell values.
    if category not in categorical:
        print("No such categorical column in dataset.")
        return

    separator = "--------------------------------------------------------"
    print(separator)
    print(f"Unique values in column: {category} \n {frame[category].unique()}")
    print(separator)
    print(f"Value Counts in column: {category} \n {frame[category].value_counts()}")
    print(separator)

## For Bank Client data 

In [8]:
check_categories_in_categorical_data("job")

--------------------------------------------------------
Unique values in column: job 
 ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
--------------------------------------------------------
Value Counts in column: job 
 admin.           10422
blue-collar       9254
technician        6743
services          3969
management        2924
retired           1720
entrepreneur      1456
self-employed     1421
housemaid         1060
unemployed        1014
student            875
unknown            330
Name: job, dtype: int64
--------------------------------------------------------


In [9]:
check_categories_in_categorical_data("marital")

--------------------------------------------------------
Unique values in column: marital 
 ['married' 'single' 'divorced' 'unknown']
--------------------------------------------------------
Value Counts in column: marital 
 married     24928
single      11568
divorced     4612
unknown        80
Name: marital, dtype: int64
--------------------------------------------------------


In [10]:
check_categories_in_categorical_data("education")

--------------------------------------------------------
Unique values in column: education 
 ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
--------------------------------------------------------
Value Counts in column: education 
 university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
unknown                 1731
illiterate                18
Name: education, dtype: int64
--------------------------------------------------------


In [11]:
check_categories_in_categorical_data("default")

--------------------------------------------------------
Unique values in column: default 
 ['no' 'unknown' 'yes']
--------------------------------------------------------
Value Counts in column: default 
 no         32588
unknown     8597
yes            3
Name: default, dtype: int64
--------------------------------------------------------


In [12]:
check_categories_in_categorical_data("housing")

--------------------------------------------------------
Unique values in column: housing 
 ['no' 'yes' 'unknown']
--------------------------------------------------------
Value Counts in column: housing 
 yes        21576
no         18622
unknown      990
Name: housing, dtype: int64
--------------------------------------------------------


In [13]:
check_categories_in_categorical_data("loan")

--------------------------------------------------------
Unique values in column: loan 
 ['no' 'yes' 'unknown']
--------------------------------------------------------
Value Counts in column: loan 
 no         33950
yes         6248
unknown      990
Name: loan, dtype: int64
--------------------------------------------------------


## For information related to the last contact of the current campaign

In [14]:
check_categories_in_categorical_data("contact")

--------------------------------------------------------
Unique values in column: contact 
 ['telephone' 'cellular']
--------------------------------------------------------
Value Counts in column: contact 
 cellular     26144
telephone    15044
Name: contact, dtype: int64
--------------------------------------------------------


In [15]:
check_categories_in_categorical_data("month")

--------------------------------------------------------
Unique values in column: month 
 ['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']
--------------------------------------------------------
Value Counts in column: month 
 may    13769
jul     7174
aug     6178
jun     5318
nov     4101
apr     2632
oct      718
sep      570
mar      546
dec      182
Name: month, dtype: int64
--------------------------------------------------------


In [16]:
check_categories_in_categorical_data("day_of_week")

--------------------------------------------------------
Unique values in column: day_of_week 
 ['mon' 'tue' 'wed' 'thu' 'fri']
--------------------------------------------------------
Value Counts in column: day_of_week 
 thu    8623
mon    8514
wed    8134
tue    8090
fri    7827
Name: day_of_week, dtype: int64
--------------------------------------------------------


## For information regarding other attributes

In [17]:
check_categories_in_categorical_data("poutcome")

--------------------------------------------------------
Unique values in column: poutcome 
 ['nonexistent' 'failure' 'success']
--------------------------------------------------------
Value Counts in column: poutcome 
 nonexistent    35563
failure         4252
success         1373
Name: poutcome, dtype: int64
--------------------------------------------------------


## For response variable

In [18]:
check_categories_in_categorical_data("y")

--------------------------------------------------------
Unique values in column: y 
 ['no' 'yes']
--------------------------------------------------------
Value Counts in column: y 
 no     36548
yes     4640
Name: y, dtype: int64
--------------------------------------------------------


<h1 style="color: orangered">Descriptive Analysis of Numeric Data</h1>

In [19]:
# Sub-frame holding only the numeric (int/float) columns of `dataset`.
# This is a snapshot: later edits to `dataset` are not reflected until it is re-derived.
numeric_cols=dataset.select_dtypes(include=np.number)
numeric_cols

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,999,0,1.1,93.994,-36.4,4.857,5191.0
1,57,149,1,999,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,999,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,999,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,999,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,383,1,999,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,189,2,999,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,442,1,999,0,-1.1,94.767,-50.8,1.028,4963.6


In [20]:
numeric_cols.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,962.475454,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,186.910907,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,999.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,999.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,999.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,999.0,7.0,1.4,94.767,-26.9,5.045,5228.1


In [21]:
# Inspect the distinct pdays values: describe() above shows 999 at every quartile,
# which suggests a sentinel rather than a real day count.
numeric_cols['pdays'].unique()

array([999,   6,   4,   3,   5,   1,   0,  10,   7,   8,   9,  11,   2,
        12,  13,  14,  15,  16,  21,  17,  18,  22,  25,  26,  19,  27,
        20], dtype=int64)

In [22]:
# Per the UCI Bank Marketing data description, pdays == 999 is a sentinel for
# "client was not previously contacted", whereas pdays == 0 is a genuine value
# that also occurs in this data. Record the sentinel in a separate flag before
# overwriting it; otherwise the two cases become indistinguishable once 999 is
# replaced by 0.
PDAYS_NOT_CONTACTED = 999
if "previously_contacted" not in dataset.columns:  # guard keeps the cell idempotent on re-run
    dataset["previously_contacted"] = dataset["pdays"].ne(PDAYS_NOT_CONTACTED)
dataset["pdays"] = dataset["pdays"].replace(PDAYS_NOT_CONTACTED, 0)
# Boolean columns are not selected by np.number, so the numeric view keeps the same columns.
numeric_cols = dataset.select_dtypes(include=np.number)

In [23]:
numeric_cols

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,0,0,1.1,93.994,-36.4,4.857,5191.0
1,57,149,1,0,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,0,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,0,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,0,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...,...
41183,73,334,1,0,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,383,1,0,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,189,2,0,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,442,1,0,0,-1.1,94.767,-50.8,1.028,4963.6


In [24]:
numeric_cols.describe()

Unnamed: 0,age,duration,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
count,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0,41188.0
mean,40.02406,258.28501,2.567593,0.221229,0.172963,0.081886,93.575664,-40.5026,3.621291,5167.035911
std,10.42125,259.279249,2.770014,1.348874,0.494901,1.57096,0.57884,4.628198,1.734447,72.251528
min,17.0,0.0,1.0,0.0,0.0,-3.4,92.201,-50.8,0.634,4963.6
25%,32.0,102.0,1.0,0.0,0.0,-1.8,93.075,-42.7,1.344,5099.1
50%,38.0,180.0,2.0,0.0,0.0,1.1,93.749,-41.8,4.857,5191.0
75%,47.0,319.0,3.0,0.0,0.0,1.4,93.994,-36.4,4.961,5228.1
max,98.0,4918.0,56.0,27.0,7.0,1.4,94.767,-26.9,5.045,5228.1


<h1 style="color: orangered">Multivariate Analysis of Variables</h1>

In [25]:
# Restores `dataset` from IPython's %store database, replacing the frame that was
# built from the CSV earlier in this notebook.
# NOTE(review): the restored frame appears to come from a different session — the
# outputs below no longer contain `pdays` and the row index skips values — so a
# fresh Restart & Run All depends on an earlier `%store dataset`; confirm where
# that is written, or rebuild the frame in this notebook instead.
%store -r dataset

In [26]:
# Work on a copy so the column drops performed further down do not modify `dataset`.
dataset_copy = dataset.copy()

## For numerical data

In [27]:
# Numeric (int/float) columns of the working copy; input to the correlation check.
numerical_columns = dataset_copy.select_dtypes(include=np.number)
numerical_columns

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
0,56,261,1,0,1.1,93.994,-36.4,4.857,5191.0
2,37,226,1,0,1.1,93.994,-36.4,4.857,5191.0
3,40,151,1,0,1.1,93.994,-36.4,4.857,5191.0
4,56,307,1,0,1.1,93.994,-36.4,4.857,5191.0
6,59,139,1,0,1.1,93.994,-36.4,4.857,5191.0
...,...,...,...,...,...,...,...,...,...
41183,73,334,1,0,-1.1,94.767,-50.8,1.028,4963.6
41184,46,383,1,0,-1.1,94.767,-50.8,1.028,4963.6
41185,56,189,2,0,-1.1,94.767,-50.8,1.028,4963.6
41186,44,442,1,0,-1.1,94.767,-50.8,1.028,4963.6


### Correlation check



In [28]:
# Pairwise correlation matrix of the numeric columns (DataFrame.corr uses Pearson by default).
display(numerical_columns.corr())

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed
age,1.0,0.007246,-0.001714,0.048681,-0.049864,-0.035059,0.12504,-0.036006,-0.064202
duration,0.007246,1.0,-0.068146,0.018941,-0.022834,0.013335,-0.009972,-0.028489,-0.039813
campaign,-0.001714,-0.068146,1.0,-0.080916,0.157984,0.127693,-0.012209,0.14101,0.148248
previous,0.048681,0.018941,-0.080916,1.0,-0.403942,-0.176838,-0.027428,-0.439157,-0.488675
emp.var.rate,-0.049864,-0.022834,0.157984,-0.403942,1.0,0.765979,0.156177,0.969405,0.900454
cons.price.idx,-0.035059,0.013335,0.127693,-0.176838,0.765979,1.0,0.026074,0.667203,0.48888
cons.conf.idx,0.12504,-0.009972,-0.012209,-0.027428,0.156177,0.026074,1.0,0.242456,0.074105
euribor3m,-0.036006,-0.028489,0.14101,-0.439157,0.969405,0.667203,0.242456,1.0,0.944904
nr.employed,-0.064202,-0.039813,0.148248,-0.488675,0.900454,0.48888,0.074105,0.944904,1.0


<p>The correlation matrix shows that euribor3m and nr.employed are highly correlated with several other variables (for example, 0.97 and 0.90 with emp.var.rate), so these two columns are dropped.</p>

In [29]:
# Drop the two indicators that are almost collinear with emp.var.rate
# (correlations of about 0.97 and 0.90 in the matrix above).
# A single non-in-place drop with errors="ignore" keeps this cell safe to re-run;
# the original pair of in-place drops raised KeyError on a second execution
# because the columns were already gone.
dataset_copy = dataset_copy.drop(columns=["euribor3m", "nr.employed"], errors="ignore")

In [30]:
# Re-derive the numeric view: the earlier `numerical_columns` was a snapshot and
# still contains the two columns just dropped from `dataset_copy`.
numerical_columns = dataset_copy.select_dtypes(include=np.number)
numerical_columns

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx
0,56,261,1,0,1.1,93.994,-36.4
2,37,226,1,0,1.1,93.994,-36.4
3,40,151,1,0,1.1,93.994,-36.4
4,56,307,1,0,1.1,93.994,-36.4
6,59,139,1,0,1.1,93.994,-36.4
...,...,...,...,...,...,...,...
41183,73,334,1,0,-1.1,94.767,-50.8
41184,46,383,1,0,-1.1,94.767,-50.8
41185,56,189,2,0,-1.1,94.767,-50.8
41186,44,442,1,0,-1.1,94.767,-50.8


In [31]:
# Correlation matrix again, now without euribor3m and nr.employed.
# emp.var.rate vs cons.price.idx (about 0.77) remains the strongest pair.
display(numerical_columns.corr())

Unnamed: 0,age,duration,campaign,previous,emp.var.rate,cons.price.idx,cons.conf.idx
age,1.0,0.007246,-0.001714,0.048681,-0.049864,-0.035059,0.12504
duration,0.007246,1.0,-0.068146,0.018941,-0.022834,0.013335,-0.009972
campaign,-0.001714,-0.068146,1.0,-0.080916,0.157984,0.127693,-0.012209
previous,0.048681,0.018941,-0.080916,1.0,-0.403942,-0.176838,-0.027428
emp.var.rate,-0.049864,-0.022834,0.157984,-0.403942,1.0,0.765979,0.156177
cons.price.idx,-0.035059,0.013335,0.127693,-0.176838,0.765979,1.0,0.026074
cons.conf.idx,0.12504,-0.009972,-0.012209,-0.027428,0.156177,0.026074,1.0
