In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
# Load the bank dataset; the path is relative to the notebook's working directory.
data = pd.read_csv("datasets/bank-full.csv")

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


In [4]:
# Columns kept for the analysis; the final entry, 'y', is the target column.
features = [
    'age', 'job', 'marital', 'education', 'balance',
    'housing', 'contact', 'day', 'month', 'duration',
    'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

In [5]:
# Restrict the frame to the columns listed in `features`.
selected_columns = data[features]

In [6]:
selected_columns

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,825,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,1729,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,5715,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,668,no,telephone,17,nov,508,4,-1,0,unknown,no


In [7]:
selected_columns.education.value_counts()

education
secondary    23202
tertiary     13301
primary       6851
unknown       1857
Name: count, dtype: int64

In [8]:
# Numeric columns, previewed on the first five rows.
numerical = [
    'age',
    'balance',
    'day',
    'duration',
    'campaign',
    'pdays',
    'previous',
]
selected_columns[numerical].head()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
0,58,2143,5,261,1,-1,0
1,44,29,5,151,1,-1,0
2,33,2,5,76,1,-1,0
3,47,1506,5,92,1,-1,0
4,33,1,5,198,1,-1,0


In [9]:
# String-valued columns, previewed on the first five rows.
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]
selected_columns[categorical].head()

Unnamed: 0,job,marital,education,housing,contact,month,poutcome
0,management,married,tertiary,yes,unknown,may,unknown
1,technician,single,secondary,yes,unknown,may,unknown
2,entrepreneur,married,secondary,yes,unknown,may,unknown
3,blue-collar,married,unknown,yes,unknown,may,unknown
4,unknown,single,unknown,no,unknown,may,unknown


In [10]:
selected_columns.isnull().sum()

age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64

In [11]:
selected_columns[numerical].corr()

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


age and balance: 0.097783<br>
day and campaign: 0.162490<br>
day and pdays: -0.093044<br>
pdays and previous: 0.454820 (the largest correlation in the matrix)

In [12]:
# Encode the target as 0/1: "yes" -> 1, anything else -> 0.
# The dtype guard makes the cell safe to re-run: without it, a second execution
# would compare the already-encoded integers to "yes" and set every label to 0.
if not pd.api.types.is_numeric_dtype(data["y"]):
    data["y"] = (data["y"] == "yes").astype(int)

In [13]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,0


In [14]:
from sklearn.model_selection import train_test_split

In [15]:
# Two-stage split: hold out 20% as the test set, then take 25% of the remaining
# 80% (i.e. 20% of all rows) as the validation set.
data_full_train, data_test = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)
data_train, data_val = train_test_split(
    data_full_train,
    test_size=0.25,
    random_state=42,
)

In [16]:
len(data_train), len(data_val), len(data_test)

(27126, 9042, 9043)

In [17]:
# Replace the shuffled row labels left by the split with a fresh 0..n-1 index.
data_train, data_val, data_test = (
    part.reset_index(drop=True)
    for part in (data_train, data_val, data_test)
)

In [18]:
# Pull the target out of each split as a plain NumPy array.
y_train, y_val, y_test = (
    part["y"].values
    for part in (data_train, data_val, data_test)
)

In [19]:
# Remove the target column from the three feature frames.
# drop(..., errors="ignore") keeps the cell re-runnable: `del frame["y"]`
# raises KeyError the second time the cell is executed.
data_train = data_train.drop(columns="y", errors="ignore")
data_val = data_val.drop(columns="y", errors="ignore")
data_test = data_test.drop(columns="y", errors="ignore")

In [20]:
data_full_train[data_full_train['y'] == 1]

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
32192,56,blue-collar,married,primary,no,3498,no,no,cellular,15,apr,264,2,-1,0,unknown,1
44313,55,technician,married,tertiary,no,1320,no,no,cellular,28,jul,340,1,91,2,success,1
42319,35,technician,married,tertiary,no,3160,yes,no,cellular,16,nov,445,2,95,1,failure,1
42210,49,management,married,primary,no,3371,no,no,cellular,11,nov,269,2,-1,0,unknown,1
44862,54,services,single,secondary,no,0,no,no,cellular,23,sep,209,1,-1,0,unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43021,52,management,married,tertiary,no,4675,no,no,cellular,12,feb,181,3,-1,0,unknown,1
43323,54,services,divorced,secondary,no,0,no,no,cellular,18,mar,362,1,290,3,success,1
41606,25,management,single,tertiary,no,808,no,no,cellular,18,sep,267,2,114,2,failure,1
16023,35,technician,married,tertiary,no,328,yes,no,cellular,22,jul,654,2,-1,0,unknown,1


In [21]:
from sklearn.metrics import mutual_info_score

In [22]:
def mutual_info_score_y(series):
    """Return the mutual information score between ``series`` and ``data_full_train.y``.

    The target is read from the module-level ``data_full_train`` frame, so
    ``series`` is expected to have one value per row of that frame.
    """
    target = data_full_train.y
    return mutual_info_score(series, target)

In [23]:
# Score every categorical column against the target on data_full_train,
# then list the columns from most to least informative.
mi = data_full_train[categorical].apply(mutual_info_score_y)
mi.sort_values(ascending=False)

poutcome     0.029257
month        0.024774
contact      0.014164
housing      0.009800
job          0.007765
education    0.002458
marital      0.002019
dtype: float64

In [24]:
from sklearn.feature_extraction import DictVectorizer

In [41]:
# Turn each row into a {column: value} record and vectorize it as a dense matrix.
# The vectorizer is fitted on the training split only and then reused, unchanged,
# for the validation split.
dv = DictVectorizer(sparse=False)

train_dict = data_train[categorical + numerical].to_dict(orient='records')
val_dict = data_val[categorical + numerical].to_dict(orient='records')

X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [26]:
from sklearn.linear_model import LogisticRegression

In [27]:
# Baseline classifier: liblinear solver, C=1.0, fixed random_state so runs are repeatable.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

In [28]:
model.fit(X_train, y_train)

In [29]:
# Keep column 1 of predict_proba: the predicted probability of class 1 for each validation row.
y_pred = model.predict_proba(X_val)[:,1]

In [30]:
y_pred

array([0.01252233, 0.01036543, 0.14732673, ..., 0.05617135, 0.00940853,
       0.28917677])

In [31]:
# Threshold the predicted probabilities at 0.5 to get a boolean decision per row.
y_decision = (y_pred >= 0.5)
y_decision

array([False, False, False, ..., False, False, False])

In [32]:
y_decision.astype(int)

array([0, 0, 0, ..., 0, 0, 0])

In [33]:
round((y_val == y_decision).mean(), 2)

np.float64(0.9)

In [34]:
# Unrounded validation accuracy of the baseline model; the feature-elimination
# loop below subtracts each reduced model's accuracy from this value.
orig_mean = (y_val == y_decision).mean()

## Q5

In [35]:
# Each list is the full 14-feature set with exactly one feature left out:
# features_1 drops 'age', features_2 drops 'balance',
# features_3 drops 'marital', features_4 drops 'previous'.
features_1, features_2, features_3, features_4 = (
    [
        name
        for name in [
            'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
            'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
        ]
        if name != dropped
    ]
    for dropped in ('age', 'balance', 'marital', 'previous')
)

In [36]:
# The reduced feature sets to evaluate, in the order they will be printed.
features_reduced = [features_1, features_2, features_3, features_4]

In [37]:
# Leave-one-feature-out comparison: retrain the baseline configuration on each
# reduced feature set and print its validation accuracy followed by the
# difference from the baseline accuracy (orig_mean - accuracy).
for features_list in features_reduced:
    # Use a fresh vectorizer for every subset. Calling fit_transform on the
    # shared `dv` would overwrite the vocabulary fitted on the full feature set,
    # so any later dv.transform(...) would silently encode with the last
    # reduced feature set instead.
    dv_small = DictVectorizer(sparse=False)

    train_dict_1 = data_train[features_list].to_dict(orient='records')
    X_train_1 = dv_small.fit_transform(train_dict_1)
    val_dict_1 = data_val[features_list].to_dict(orient='records')
    X_val_1 = dv_small.transform(val_dict_1)

    model_small = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model_small.fit(X_train_1, y_train)
    y_pred_1 = model_small.predict_proba(X_val_1)[:, 1]
    # Accuracy at the same 0.5 threshold used for the baseline.
    s_mean = (y_val == (y_pred_1 >= 0.5)).mean()
    print(f"{s_mean}\t{orig_mean - s_mean}")

0.9012386640123866	0.00033178500331787486
0.9013492590134926	0.00022119000221187957
0.9001327140013271	0.0014377350143773837
0.9012386640123866	0.00033178500331787486


In [50]:
# Values of the LogisticRegression `C` parameter to compare.
c_values = [0.01, 0.1, 1, 10, 100]

In [51]:
# Train one model per C value on the full feature matrix and print
# "<C>\t<validation accuracy>" for each.
for c in c_values:
    model_c = LogisticRegression(solver='liblinear', C=c, max_iter=1000, random_state=42)
    model_c.fit(X_train, y_train)
    # Store predictions under a loop-specific name: assigning to `y_pred` here
    # would replace the baseline model's predictions computed earlier, leaving
    # that name pointing at the output of the last C value.
    y_pred_c = model_c.predict_proba(X_val)[:, 1]
    c_mean = (y_val == (y_pred_c >= 0.5)).mean()
    print(f"{c}\t{c_mean}")

0.01	0.8979208139792081
0.1	0.9009068790090687
1	0.9015704490157045
10	0.9009068790090687
100	0.9012386640123866
