In [3]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

import sklearn.linear_model as skl_lm
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import roc_curve, auc
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.metrics import confusion_matrix, classification_report, precision_score
from sklearn import preprocessing
from sklearn import neighbors
from sklearn.metrics import f1_score


import statsmodels.api as sm
import statsmodels.formula.api as smf

In [4]:
# import libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [5]:
# Read the campaign data and discard the 'duration' column straight away,
# then report the resulting (rows, columns).
df = pd.read_csv('European_bank_marketing.csv').drop(columns='duration')
df.shape

(41188, 21)

In [6]:
# Separate numeric from categorical variables

In [7]:
# Partition the frame by dtype: numeric columns on one side, everything else on the other.
numeric_kinds = [np.number]
df_numeric = df.select_dtypes(include=numeric_kinds)
df_non_numeric = df.select_dtypes(exclude=numeric_kinds)

# Column labels of each half, taken from `.columns.values`.
numeric_cols = df_numeric.columns.values
non_numeric_cols = df_non_numeric.columns.values

In [8]:
# Preview the first three rows of the numeric-only subset.
df_numeric.head(3)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,term_deposit,Ethnicity_African
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,0


In [9]:
# Preview the first three rows of the non-numeric subset.
df_non_numeric.head(3)

Unnamed: 0,job,marital,education,default,housing,loan,contact,month,day_of_week,poutcome
0,housemaid,married,basic.4y,no,no,no,telephone,may,mon,nonexistent
1,services,married,high.school,unknown,no,no,telephone,may,mon,nonexistent
2,services,married,high.school,no,yes,no,telephone,may,mon,nonexistent


In [10]:
# Missing values
# Calculate the percentage of missing values in each column and store the result in a DataFrame

In [11]:
# Percentage of null entries per column, collected into a two-column table
# ('col' = column label, 'pct_missing' = share of nulls in percent).
cols_list = list(df.columns)
values_list = [np.mean(df[name].isnull()) * 100 for name in cols_list]
pct_missing_df = pd.DataFrame({'col': cols_list, 'pct_missing': values_list})

In [12]:
# Display the per-column missing-value percentages.
# Every row in the rendered table reads 0.0, i.e. there is no missing data.

pct_missing_df

Unnamed: 0,col,pct_missing
0,age,0.0
1,job,0.0
2,marital,0.0
3,education,0.0
4,default,0.0
5,housing,0.0
6,loan,0.0
7,contact,0.0
8,month,0.0
9,day_of_week,0.0


In [13]:
# Cross-check with absolute counts: number of null entries in each column.
df.isnull().sum()

age                  0
job                  0
marital              0
education            0
default              0
housing              0
loan                 0
contact              0
month                0
day_of_week          0
campaign             0
pdays                0
previous             0
poutcome             0
emp.var.rate         0
cons.price.idx       0
cons.conf.idx        0
euribor3m            0
nr.employed          0
term_deposit         0
Ethnicity_African    0
dtype: int64

In [14]:
# Drop any columns that hold the same value in every row;
# such columns add no explanatory power.
# dropna=False makes NaN count as a value of its own, so a column containing
# one distinct value plus some missing entries is NOT mistaken for a constant
# column (the default nunique() ignores NaN and would drop it).
constant_cols = df.columns[df.nunique(dropna=False) == 1]
df.drop(columns=constant_cols, inplace=True)  # mutates df in place, as before

In [15]:
# Cardinality report: for each column position, print the position, the number of
# distinct values, and that number as a percentage of the total observations.

from numpy import unique
n_rows = df.shape[0]
for i in range(df.shape[1]):
    num = len(unique(df.iloc[:, i]))
    print('%d, %d, %.1f%%' % (i, num, float(num) / n_rows * 100))

0, 78, 0.2%
1, 12, 0.0%
2, 4, 0.0%
3, 8, 0.0%
4, 3, 0.0%
5, 3, 0.0%
6, 3, 0.0%
7, 2, 0.0%
8, 10, 0.0%
9, 5, 0.0%
10, 42, 0.1%
11, 27, 0.1%
12, 8, 0.0%
13, 3, 0.0%
14, 10, 0.0%
15, 26, 0.1%
16, 26, 0.1%
17, 316, 0.8%
18, 11, 0.0%
19, 2, 0.0%
20, 2, 0.0%


In [16]:
# Check for duplicate rows: show every row that repeats an earlier row.
# duplicated() uses keep='first' by default, so the first occurrence of each
# repeated row is not included in this view.
df[df.duplicated()]

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,term_deposit,Ethnicity_African
10,41,blue-collar,married,unknown,unknown,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
11,25,services,single,high.school,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
16,35,blue-collar,married,basic.6y,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
31,59,technician,married,unknown,no,yes,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
104,52,admin.,divorced,university.degree,no,no,no,telephone,may,mon,...,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39082,23,student,single,basic.9y,no,yes,no,cellular,dec,tue,...,999,0,nonexistent,-3.0,92.713,-33.0,0.708,5023.5,1,0
39435,33,admin.,single,university.degree,no,yes,no,cellular,apr,mon,...,999,0,nonexistent,-1.8,93.749,-34.6,0.642,5008.7,0,0
39985,27,admin.,single,high.school,no,no,no,cellular,jun,tue,...,999,0,nonexistent,-1.7,94.055,-39.8,0.761,4991.6,1,0
40806,35,technician,married,professional.course,no,yes,no,cellular,sep,thu,...,999,2,failure,-1.1,94.199,-37.5,0.878,4963.6,0,0


In [17]:
# Drop duplicate rows in place, keeping the first occurrence of each (pandas default);
# the index is not reset, so row labels keep gaps where rows were removed.
# NOTE(review): df_numeric / df_non_numeric were derived from df before this step and
# are not updated by it — re-derive them if they are used after this point.
df.drop_duplicates(inplace = True)

In [30]:
# Display the working frame.
# NOTE(review): the execution count jumps from In [17] to In [30], and the rendered output
# shows integer codes for job/marital/education/... whereas the earlier previews show strings.
# The output presumably comes from a kernel state produced by cells that ran in between
# (an encoding step not visible before this cell) — confirm with Restart Kernel → Run All.
df

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,term_deposit,Ethnicity_African
0,56,3,1,0,0,0,0,1,6,1,...,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0,0
1,57,7,1,3,1,0,0,1,6,1,...,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0,0
2,37,7,1,3,0,2,0,1,6,1,...,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0,0
3,40,0,1,1,0,0,0,1,6,1,...,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0,0
4,56,7,1,3,0,0,2,1,6,1,...,999,0,1,1.1,93.994,-36.4,4.857,5191.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41183,73,5,1,5,0,2,0,0,7,0,...,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1,0
41184,46,1,1,5,0,0,0,0,7,0,...,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0,0
41185,56,5,1,6,0,2,0,0,7,0,...,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,0,0
41186,44,9,1,5,0,0,0,0,7,0,...,999,0,1,-1.1,94.767,-50.8,1.028,4963.6,1,0
