# Dataset Source
https://archive.ics.uci.edu/ml/datasets/Bank+Marketing# <br>
(select "Data Folder", download "bank-additional.zip", and extract "bank-additional-full.csv").

# Data Pre-processing

In [None]:
import pandas as pd

In [None]:
# Load the semicolon-separated bank marketing CSV into a pandas DataFrame
data = pd.read_csv("bank-additional-full.csv", sep=";")

In [None]:
# Preview the first few rows of the loaded dataset
data.head()

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# Show the dataset dimensions as a (rows, columns) tuple
data.shape

(41188, 21)

In [None]:
# List the names of all columns in the dataset
data.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'contact', 'month', 'day_of_week', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'emp.var.rate', 'cons.price.idx',
       'cons.conf.idx', 'euribor3m', 'nr.employed', 'y'],
      dtype='object')

In [None]:
# Keep only the bank-client attributes, plus the outcome column "y", for
# predicting whether a client will subscribe to a term deposit.
# This narrows the dataset from 21 columns to 8.
df_temp = data[
    [
        "age",
        "job",
        "marital",
        "education",
        "default",
        "housing",
        "loan",
        "y",
    ]
]

In [None]:
# Verify that only the intended columns were kept
df_temp.columns

Index(['age', 'job', 'marital', 'education', 'default', 'housing', 'loan',
       'y'],
      dtype='object')

In [None]:
# Inspect the data type of each selected column
df_temp.dtypes

age           int64
job          object
marital      object
education    object
default      object
housing      object
loan         object
y            object
dtype: object

In [None]:
# Row count before any cleaning (baseline for comparison after rows are dropped)
df_temp.shape[0]

41188

In [None]:
# Count missing (NaN) values in each column
df_temp.isna().sum()

age          0
job          0
marital      0
education    0
default      0
housing      0
loan         0
y            0
dtype: int64

No values are reported as missing (NaN). However, the dataset description states that unavailable information is recorded as the string 'unknown' rather than as NaN.

In [None]:
# Replace every cell equal to the string "unknown" with NaN; all other
# values are kept unchanged. The result is a new DataFrame.
df = df_temp.mask(df_temp == "unknown")
df.head(2)

Unnamed: 0,age,job,marital,education,default,housing,loan,y
0,56,housemaid,married,basic.4y,no,no,no,no
1,57,services,married,high.school,,no,no,no


In [None]:
# Count the 'unknown' values that were converted to NaN in each column.
df.isna().sum()

age             0
job           330
marital        80
education    1731
default      8597
housing       990
loan          990
y               0
dtype: int64

In [None]:
# Drop every row that contains at least one NaN (formerly 'unknown') value,
# because they have no utility for predicting classification.
# The result is rebound rather than using inplace=True, so a frame derived
# from another DataFrame is never mutated in place.
df = df.dropna()

In [None]:
# Confirm that no NaN (formerly 'unknown') values remain in any column.
df.isna().sum()

age          0
job          0
marital      0
education    0
default      0
housing      0
loan         0
y            0
dtype: int64

All rows containing unknown/NaN values have been dropped.

In [None]:
# Row count remaining after dropping rows that contained 'unknown' values
df.shape[0]

30488