In [1]:
import numpy as np
import pandas as pd

import seaborn as sns

In [2]:
df = pd.read_csv('bank/bank-full.csv', delimiter=';')

In [3]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
features = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
            'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']

df = df[features]
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45211 entries, 0 to 45210
Data columns (total 15 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   age        45211 non-null  int64 
 1   job        45211 non-null  object
 2   marital    45211 non-null  object
 3   education  45211 non-null  object
 4   balance    45211 non-null  int64 
 5   housing    45211 non-null  object
 6   contact    45211 non-null  object
 7   day        45211 non-null  int64 
 8   month      45211 non-null  object
 9   duration   45211 non-null  int64 
 10  campaign   45211 non-null  int64 
 11  pdays      45211 non-null  int64 
 12  previous   45211 non-null  int64 
 13  poutcome   45211 non-null  object
 14  y          45211 non-null  object
dtypes: int64(7), object(8)
memory usage: 5.2+ MB


In [6]:
df.education.mode()

0    secondary
Name: education, dtype: object

In [7]:
numerical = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

df[numerical].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [8]:
df.y = (df.y == 'yes').astype(int)
df.y.head()

0    0
1    0
2    0
3    0
4    0
Name: y, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

df_train_full, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_train_full, test_size=0.25, random_state=42)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

del df_train['y']
del df_val['y']
del df_test['y']

In [10]:
df_train.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome
0,32,technician,single,tertiary,1100,yes,cellular,11,aug,67,1,-1,0,unknown
1,38,entrepreneur,married,secondary,0,yes,cellular,17,nov,258,1,-1,0,unknown
2,49,blue-collar,married,secondary,3309,yes,cellular,15,may,349,2,-1,0,unknown
3,37,housemaid,married,primary,2410,no,cellular,4,aug,315,1,-1,0,unknown
4,31,self-employed,married,tertiary,3220,no,cellular,26,aug,74,4,-1,0,unknown


In [11]:
categorical = list(df_train.drop(numerical, axis=1).columns)

from sklearn.metrics import mutual_info_score

def mutual_info_y_score(series):
    return mutual_info_score(series, y_train)

mi = df_train[categorical].apply(mutual_info_y_score)
mi.sort_values(ascending=False).round(2)

poutcome     0.03
month        0.03
contact      0.01
housing      0.01
job          0.01
education    0.00
marital      0.00
dtype: float64

In [12]:
dict_train = df_train[categorical + numerical].to_dict(orient='records')

from sklearn.feature_extraction import DictVectorizer
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(dict_train)

In [13]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'liblinear'
,max_iter,1000


In [14]:
dict_val = df_val[categorical + numerical].to_dict(orient='records')
X_val = dv.transform(dict_val)
y_val_pred = model.predict_proba(X_val)[:,1]

df_pred = pd.DataFrame()
df_pred['probability'] = y_val_pred
df_pred['prediction'] = (y_val_pred >= 0.5).astype(int)
df_pred['actual'] = y_val
df_pred['correct'] = df_pred.prediction == df_pred.actual

df_pred_old = df_pred.correct.mean()

In [15]:
from IPython.display import display

col = list(df_train.columns)
df_train_old = df_train.copy()
df_val_old = df_val.copy()
accuracies = []

In [16]:
for c in col:
    df_train = df_train_old.drop(c, axis=1)
    df_val = df_val_old.drop(c, axis=1)
    
    dict_train = df_train.to_dict(orient='records')
    from sklearn.feature_extraction import DictVectorizer
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(dict_train)

    from sklearn.linear_model import LogisticRegression
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train, y_train)

    dict_val = df_val.to_dict(orient='records')
    X_val = dv.transform(dict_val)
    y_val_pred = model.predict_proba(X_val)[:,1]
    
    df_pred = pd.DataFrame()
    df_pred['probability'] = y_val_pred
    df_pred['prediction'] = (y_val_pred >= 0.5).astype(int)
    df_pred['actual'] = y_val
    df_pred['correct'] = df_pred.prediction == df_pred.actual

    accuracies.append(df_pred.correct.mean().round(3))

dict(zip(col, accuracies-df_pred_old))
    

{'age': np.float64(9.312099093128356e-05),
 'job': np.float64(9.312099093128356e-05),
 'marital': np.float64(9.312099093128356e-05),
 'education': np.float64(9.312099093128356e-05),
 'balance': np.float64(9.312099093128356e-05),
 'housing': np.float64(9.312099093128356e-05),
 'contact': np.float64(-0.0009068790090687173),
 'day': np.float64(9.312099093128356e-05),
 'month': np.float64(-0.0009068790090687173),
 'duration': np.float64(-0.010906879009068726),
 'campaign': np.float64(-0.0009068790090687173),
 'pdays': np.float64(9.312099093128356e-05),
 'previous': np.float64(9.312099093128356e-05),
 'poutcome': np.float64(-0.007906879009068724)}