In [1]:
#!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("./bank-full.csv", sep=";")

In [4]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


#### Data preparation

In [5]:
cols_needed = [
    "age", 
    "job", 
    "marital",
    "education",
    "balance",
    "housing",
    "contact",
    "day",
    "month",
    "duration",
    "campaign",
    "pdays",
    "previous",
    "poutcome",
    "y"
]

In [6]:
df = df[cols_needed]

In [7]:
df.isna().any()

age          False
job          False
marital      False
education    False
balance      False
housing      False
contact      False
day          False
month        False
duration     False
campaign     False
pdays        False
previous     False
poutcome     False
y            False
dtype: bool

#### Question 01

In [8]:
df['education'].mode()

0    secondary
Name: education, dtype: object

#### Question 02

In [9]:
d  = [
    ["age", "balance"],
    ["day", "campaign"],
    ["day", "pdays"],
    ["pdays", "previous"],
]

res = {}
for pair in d:
    res[f"{pair[0]} & {pair[1]}"] = df[pair].corr(numeric_only=True).loc[pair[0], pair[1]]


print(res)

{'age & balance': 0.09778273937134807, 'day & campaign': 0.1624902163261922, 'day & pdays': -0.09304407377294048, 'pdays & previous': 0.4548196354805043}


#### Target Encoding

In [10]:
df['y'] = df['y'].apply(lambda x: 1 if x == "yes" else 0)

In [11]:
df['y'].describe()

count    45211.000000
mean         0.116985
std          0.321406
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: y, dtype: float64

#### Split the data

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

In [14]:
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

In [15]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

In [16]:
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

del df_train['y']
del df_val['y']
del df_test['y']

#### Question 03

In [17]:
from sklearn.metrics import mutual_info_score

In [18]:
m_scores = {}
for feature in ["contact", "education", "housing", "poutcome"]:
    m_scores[feature] = mutual_info_score(df_train[feature], y_train)

In [19]:
sorted(m_scores.items(), key=lambda x:x[1], reverse=True)

[('poutcome', 0.029532821290436224),
 ('contact', 0.013356062198247219),
 ('housing', 0.010343105891750026),
 ('education', 0.0026967549991295282)]

#### Question 04

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer

In [21]:
dv = DictVectorizer(sparse=False)

train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [22]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [23]:
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict_proba(X_val)[:, 1]
y_pred

array([0.01384133, 0.00991416, 0.15855571, ..., 0.05156528, 0.0091288 ,
       0.27121895])

In [25]:
y_pred = model.predict(X_val)

In [26]:
from sklearn.metrics import accuracy_score

In [27]:
round(accuracy_score(y_val, y_pred), 2)

0.9

#### Question 05

In [28]:
feature_elemination = {}

for feature in df_train.columns:
    dv_t = DictVectorizer(sparse=False)

    train_dict_t = df_train.loc[:, ~df_train.columns.isin([feature])].to_dict(orient='records')
    X_train_t = dv_t.fit_transform(train_dict_t)

    val_dict_t = df_val.loc[:, ~df_train.columns.isin([feature])].to_dict(orient='records')
    X_val_t = dv_t.transform(val_dict_t)

    model_t = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

    model.fit(X_train_t, y_train)

    y_pred_t = model.predict(X_val_t)

    feature_elemination[feature] = abs(accuracy_score(y_val, y_pred) - accuracy_score(y_val, y_pred_t))


In [29]:
sorted(feature_elemination.items(), key=lambda x:x[1], reverse=True)

[('duration', 0.011059500110594978),
 ('poutcome', 0.007299270072992692),
 ('month', 0.0012165450121653931),
 ('housing', 0.0008847600088476293),
 ('job', 0.0005529750055297544),
 ('previous', 0.00044238000442375913),
 ('contact', 0.00033178500331787486),
 ('pdays', 0.00033178500331787486),
 ('age', 0.00022119000221187957),
 ('balance', 0.00022119000221187957),
 ('day', 0.00022119000221187957),
 ('marital', 0.0001105950011059953),
 ('campaign', 0.0001105950011059953),
 ('education', 0.0)]

#### Question 06

In [30]:
reg = {}
for C in [0.01, 0.1, 1, 10, 100]:
    dv_t = DictVectorizer(sparse=False)

    train_dict_t = df_train.to_dict(orient='records')
    X_train_t = dv_t.fit_transform(train_dict_t)

    val_dict_t = df_val.to_dict(orient='records')
    X_val_t = dv_t.transform(val_dict_t)

    model_t = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)

    model.fit(X_train_t, y_train)

    y_pred_t = model.predict(X_val_t)

    reg[C] = round(accuracy_score(y_val, y_pred_t), 3)

In [31]:
reg

{0.01: 0.901, 0.1: 0.901, 1: 0.901, 10: 0.901, 100: 0.901}