In [1]:
import pandas as pd
import numpy as np

In [2]:
# Get the data from local drive.
# Requires the google.colab package, so this cell is Colab-specific. The
# recorded output shows the upload being saved as bank-full.csv, which is the
# path the read_csv cell expects.
from google.colab import files
uploaded = files.upload()

Saving bank-full.csv to bank-full.csv


In [3]:
# The file is semicolon-separated rather than comma-separated
df = pd.read_csv('bank-full.csv', delimiter=';')

print(df.head())

   age           job  marital  education default  balance housing loan  \
0   58    management  married   tertiary      no     2143     yes   no   
1   44    technician   single  secondary      no       29     yes   no   
2   33  entrepreneur  married  secondary      no        2     yes  yes   
3   47   blue-collar  married    unknown      no     1506     yes   no   
4   33       unknown   single    unknown      no        1      no   no   

   contact  day month  duration  campaign  pdays  previous poutcome   y  
0  unknown    5   may       261         1     -1         0  unknown  no  
1  unknown    5   may       151         1     -1         0  unknown  no  
2  unknown    5   may        76         1     -1         0  unknown  no  
3  unknown    5   may        92         1     -1         0  unknown  no  
4  unknown    5   may       198         1     -1         0  unknown  no  


In [4]:
# Only keep the columns mentioned in the homework.
# drop(..., errors='ignore') keeps the cell re-runnable: `del df['default']`
# raises KeyError on a second run because the column is already gone.
df = df.drop(columns=['default', 'loan'], errors='ignore')
df.head()

Unnamed: 0,age,job,marital,education,balance,housing,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,2143,yes,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,29,yes,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,2,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,1506,yes,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,1,no,unknown,5,may,198,1,-1,0,unknown,no


In [5]:
# (rows, columns) of the frame at this point in the notebook
df.shape

(45211, 15)

In [6]:
# Number of missing entries in each column
df.isna().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
balance,0
housing,0
contact,0
day,0
month,0
duration,0


**Question 1**

What is the most frequent observation (mode) for the column education?

In [7]:
# Most frequent value(s) of the education column
df['education'].mode()

Unnamed: 0,education
0,secondary


**Question 2**

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

In [8]:
# See which columns are numerical first:
# in the recorded output the int64 columns are numeric and the object columns
# hold strings.
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
balance,int64
housing,object
contact,object
day,int64
month,object
duration,int64


In [9]:
# Numeric feature columns used for the correlation matrix and the models
numerical = [
    'age',
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
    'duration',
]

In [10]:
# Pairwise Pearson correlation (pandas' default method) between numeric features
df[numerical].corr(method='pearson')

Unnamed: 0,age,balance,day,campaign,pdays,previous,duration
age,1.0,0.097783,-0.00912,0.00476,-0.023758,0.001288,-0.004648
balance,0.097783,1.0,0.004503,-0.014578,0.003435,0.016674,0.02156
day,-0.00912,0.004503,1.0,0.16249,-0.093044,-0.05171,-0.030206
campaign,0.00476,-0.014578,0.16249,1.0,-0.088628,-0.032855,-0.08457
pdays,-0.023758,0.003435,-0.093044,-0.088628,1.0,0.45482,-0.001565
previous,0.001288,0.016674,-0.05171,-0.032855,0.45482,1.0,0.001203
duration,-0.004648,0.02156,-0.030206,-0.08457,-0.001565,0.001203,1.0


In [11]:
# Target encoding: 'yes' -> 1, 'no' -> 0.
# Guarded so the cell is safe to re-run: on a second run the column is already
# integer, and comparing it to 'yes' again would silently turn every label into 0.
if not pd.api.types.is_integer_dtype(df['y']):
    df['y'] = (df['y'] == 'yes').astype(int)

In [None]:
# Class balance of the encoded target
df['y'].value_counts()

Unnamed: 0_level_0,count
y,Unnamed: 1_level_1
0,39922
1,5289


In [12]:
# Split the data
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 20% of the rows as the test set
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)

In [14]:
# 25% of the remaining 80% becomes validation, i.e. 20% of all rows
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=42,
)

In [15]:
# Row counts of the train / validation / test splits
tuple(len(part) for part in (df_train, df_val, df_test))

(27126, 9042, 9043)

In [16]:
# Replace the shuffled row labels with a clean 0..n-1 index in each split
# (the row order itself is left as produced by the split)
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

In [17]:
# Pull the target out of each split as a NumPy array
y_train = df_train['y'].to_numpy()
y_val = df_val['y'].to_numpy()
y_test = df_test['y'].to_numpy()

In [18]:
# Remove the target column from the feature frames so it cannot leak into X
for frame in (df_train, df_val, df_test):
    del frame['y']

**Question 3**

Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.
Round the scores to 2 decimals using round(score, 2).

In [20]:
from sklearn.metrics import mutual_info_score

In [19]:
# Re-inspect dtypes to pick out the string (object) columns
df.dtypes

Unnamed: 0,0
age,int64
job,object
marital,object
education,object
balance,int64
housing,object
contact,object
day,int64
month,object
duration,int64


In [21]:
# Categorical feature columns used for mutual information and one-hot encoding
categorical = [
    'job',
    'marital',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]

In [22]:
def mutual_info_y_score(series):
    """Return the mutual information between `series` and `df_full_train.y`.

    `series` must be row-aligned with `df_full_train`, since the target is read
    from that frame.

    NOTE(review): the target comes from the train+validation frame, not from
    the training split alone -- confirm this is what the caller intends.
    """
    return mutual_info_score(labels_true=series, labels_pred=df_full_train.y)

In [23]:
# Q3 asks for the training set only, so score each categorical column of
# df_train against y_train. df_full_train also contains the validation rows and
# would let validation data influence the ranking.
mi = df_train[categorical].apply(lambda col: mutual_info_score(col, y_train))
mi.round(2).sort_values(ascending=False)

Unnamed: 0,0
poutcome,0.03
month,0.02
job,0.01
housing,0.01
contact,0.01
marital,0.0
education,0.0


### **Question 4**

Now let's train a logistic regression.
Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
Fit the model on the training dataset.
To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
`model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [61]:
# One hot encoding for the categorical varibales
from sklearn.feature_extraction import DictVectorizer

In [62]:
# One dict per training row, restricted to the modelling columns
train_dicts = (
    df_train[categorical + numerical]
    .to_dict(orient='records')
)

In [63]:
# Inspect the first record to check the column-name -> value layout
train_dicts[0]

{'job': 'technician',
 'marital': 'single',
 'education': 'tertiary',
 'housing': 'yes',
 'contact': 'cellular',
 'month': 'aug',
 'poutcome': 'unknown',
 'age': 32,
 'balance': 1100,
 'day': 11,
 'campaign': 1,
 'pdays': -1,
 'previous': 0,
 'duration': 67}

In [64]:
# sparse=False: the recorded outputs below show plain NumPy arrays
dv = DictVectorizer(sparse=False)

In [65]:
# Display-only: fits the vectorizer and shows the encoded matrix; the result is
# not stored here.
dv.fit_transform(train_dicts)

array([[3.200e+01, 1.100e+03, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.800e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [4.900e+01, 3.309e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [5.400e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.500e+01, 2.311e+03, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.000e+01, 1.500e+01, 2.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [66]:
# Column names of the encoded matrix: numeric features keep their name,
# categorical ones appear as 'column=value' (see the recorded output)
dv.get_feature_names_out()

array(['age', 'balance', 'campaign', 'contact=cellular',
       'contact=telephone', 'contact=unknown', 'day', 'duration',
       'education=primary', 'education=secondary', 'education=tertiary',
       'education=unknown', 'housing=no', 'housing=yes', 'job=admin.',
       'job=blue-collar', 'job=entrepreneur', 'job=housemaid',
       'job=management', 'job=retired', 'job=self-employed',
       'job=services', 'job=student', 'job=technician', 'job=unemployed',
       'job=unknown', 'marital=divorced', 'marital=married',
       'marital=single', 'month=apr', 'month=aug', 'month=dec',
       'month=feb', 'month=jan', 'month=jul', 'month=jun', 'month=mar',
       'month=may', 'month=nov', 'month=oct', 'month=sep', 'pdays',
       'poutcome=failure', 'poutcome=other', 'poutcome=success',
       'poutcome=unknown', 'previous'], dtype=object)

In [67]:
# Training feature matrix; refits the vectorizer on the training records
X_train = dv.fit_transform(train_dicts)

In [68]:
# One dict per validation row, same columns as the training records
val_dicts = (
    df_val[categorical + numerical]
    .to_dict(orient='records')
)

In [69]:
# transform only (no refit): validation rows are encoded with the vectorizer
# that was fitted on the training records
X_val = dv.transform(val_dicts)

In [70]:
from sklearn.linear_model import LogisticRegression

In [71]:
# Q4 baseline: logistic regression with the parameters fixed by the assignment
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train, y_train)

In [72]:
# Keep the second probability column of predict_proba for every validation row
# (presumably the probability of class 1, given the 0/1 target encoding -- verify
# against model.classes_)
y_pred = model.predict_proba(X_val)[:,1]

In [73]:
# Predict the positive class when its probability reaches 0.5
decision = y_pred >= 0.5

In [74]:
# Share of validation rows predicted correctly, rounded to 2 decimals for Q4
orig_acc = round(np.mean(y_val == decision), 2)
print(orig_acc)

0.9


## **Question 5**


*   Let's find the least useful feature using the feature elimination technique
*   Train a model with all these features (using the same parameters as in Q4).
*   Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
*   For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Without Age

In [38]:
# Numeric features with 'age' left out
num_no_age = [
    'balance',
    'day',
    'campaign',
    'pdays',
    'previous',
    'duration',
]

In [39]:
# Training records without the 'age' column
train_dicts_no_age = (
    df_train[categorical + num_no_age]
    .to_dict(orient='records')
)

In [40]:
# Build the validation records from the same reduced column list as training.
# The original selected the full `numerical` list; a vectorizer fitted without
# 'age' ignores that key at transform time, so the mismatch went unnoticed.
val_dicts_no_age = df_val[categorical + num_no_age].to_dict(orient='records')

In [41]:
# Separate vectorizer for the feature set without 'age'.
# The fit_transform result is only displayed here, not stored.
dv_no_age = DictVectorizer(sparse=False)
dv_no_age.fit_transform(train_dicts_no_age)

array([[1.100e+03, 1.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [0.000e+00, 1.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [3.309e+03, 2.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       ...,
       [0.000e+00, 1.000e+00, 0.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [2.311e+03, 2.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00],
       [1.500e+01, 2.000e+00, 1.000e+00, ..., 0.000e+00, 1.000e+00,
        0.000e+00]])

In [42]:
# Training feature matrix without 'age'; refits dv_no_age on the training records
X_train_no_age = dv_no_age.fit_transform(train_dicts_no_age)

In [43]:
# Same estimator settings as the baseline, trained without 'age'.
# Rebinds `model`, replacing the previously fitted estimator.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_no_age, y_train)

In [44]:
# Encode validation records with the vectorizer fitted on the no-age training data
X_val_no_age = dv_no_age.transform(val_dicts_no_age)

In [45]:
# Second predict_proba column from the model trained without 'age'
y_pred_no_age = model.predict_proba(X_val_no_age)[:,1]

In [48]:
# Use >= 0.5, the same decision rule as the baseline and the other reduced
# models, so all accuracies are compared under one threshold (the original
# used a strict > here only).
decision_no_age = (y_pred_no_age >= 0.5).astype(int)

In [49]:
# Validation accuracy without 'age'
np.mean(y_val == decision_no_age)

0.9013492590134926

Without Balance

In [76]:
# Numeric features with 'balance' left out
num_no_bal = [
    'age',
    'day',
    'campaign',
    'pdays',
    'previous',
    'duration',
]

In [77]:
# Training records without the 'balance' column
train_dicts_no_bal = (
    df_train[categorical + num_no_bal]
    .to_dict(orient='records')
)

In [78]:
# Build the validation records from the same reduced column list as training.
# The original selected the full `numerical` list; a vectorizer fitted without
# 'balance' ignores that key at transform time, so the mismatch went unnoticed.
val_dicts_no_bal = df_val[categorical + num_no_bal].to_dict(orient='records')

In [79]:
# Separate vectorizer for the feature set without 'balance'.
# The fit_transform result is only displayed here, not stored.
dv_no_bal = DictVectorizer(sparse=False)
dv_no_bal.fit_transform(train_dicts_no_bal)

array([[32.,  1.,  1., ...,  0.,  1.,  0.],
       [38.,  1.,  1., ...,  0.,  1.,  0.],
       [49.,  2.,  1., ...,  0.,  1.,  0.],
       ...,
       [54.,  1.,  0., ...,  0.,  1.,  0.],
       [25.,  2.,  1., ...,  0.,  1.,  0.],
       [30.,  2.,  1., ...,  0.,  1.,  0.]])

In [80]:
# fit_transform rather than transform: a bare transform only works if the
# display-only fit in the previous cell has already run. This also matches how
# the other feature-elimination sections build their training matrix.
X_train_no_bal = dv_no_bal.fit_transform(train_dicts_no_bal)

In [81]:
# Same estimator settings as the baseline, trained without 'balance'.
# Rebinds `model`, replacing the previously fitted estimator.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_no_bal, y_train)

In [82]:
# Encode validation records with the vectorizer fitted on the no-balance training data
X_val_no_bal = dv_no_bal.transform(val_dicts_no_bal)

In [83]:
# Second predict_proba column from the model trained without 'balance'
y_pred_no_bal = model.predict_proba(X_val_no_bal)[:,1]

In [84]:
# Positive prediction when the probability reaches 0.5
decision_no_bal = y_pred_no_bal >= 0.5

In [85]:
# Validation accuracy without 'balance'
np.mean(y_val == decision_no_bal)

0.9010174740101747

Without Previous

In [86]:
# Numeric features with 'previous' left out
num_no_prev = [
    'age',
    'day',
    'campaign',
    'pdays',
    'balance',
    'duration',
]

In [87]:
# Training records without the 'previous' column
train_dicts_no_prev = (
    df_train[categorical + num_no_prev]
    .to_dict(orient='records')
)

In [88]:
# Validation records without the 'previous' column
val_dicts_no_prev = (
    df_val[categorical + num_no_prev]
    .to_dict(orient='records')
)

In [89]:
# Separate vectorizer for the feature set without 'previous'.
# The fit_transform result is only displayed here, not stored.
dv_no_prev = DictVectorizer(sparse=False)
dv_no_prev.fit_transform(train_dicts_no_prev)

array([[3.200e+01, 1.100e+03, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.800e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [4.900e+01, 3.309e+03, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       ...,
       [5.400e+01, 0.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [2.500e+01, 2.311e+03, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00],
       [3.000e+01, 1.500e+01, 2.000e+00, ..., 0.000e+00, 0.000e+00,
        1.000e+00]])

In [90]:
# Training feature matrix without 'previous'; refits dv_no_prev on the training records
X_train_no_prev = dv_no_prev.fit_transform(train_dicts_no_prev)

In [91]:
# Same estimator settings as the baseline, trained without 'previous'.
# Rebinds `model`, replacing the previously fitted estimator.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_no_prev, y_train)

In [92]:
# Encode validation records with the vectorizer fitted on the no-previous training data
X_val_no_prev = dv_no_prev.transform(val_dicts_no_prev)

In [93]:
# Second predict_proba column from the model trained without 'previous'
y_pred_no_prev = model.predict_proba(X_val_no_prev)[:,1]

In [94]:
# Positive prediction when the probability reaches 0.5
decision_no_prev = y_pred_no_prev >= 0.5

In [95]:
# Validation accuracy without 'previous'
np.mean(y_val == decision_no_prev)

0.9009068790090687

In [96]:
# Coefficients of the most recently fitted `model`, rounded to 3 decimals
np.round(model.coef_[0], 3)

array([ 1.000e-03,  0.000e+00, -8.100e-02,  2.620e-01,  5.800e-02,
       -1.318e+00,  1.000e-02,  4.000e-03, -4.300e-01, -2.550e-01,
       -6.600e-02, -2.470e-01, -1.440e-01, -8.540e-01,  9.100e-02,
       -1.990e-01, -2.640e-01, -3.660e-01, -8.400e-02,  2.230e-01,
       -2.890e-01, -1.150e-01,  3.150e-01, -1.470e-01,  5.000e-02,
       -2.140e-01, -3.400e-01, -4.800e-01, -1.790e-01, -9.000e-03,
       -7.280e-01,  4.520e-01, -3.200e-01, -1.285e+00, -1.040e+00,
        3.130e-01,  1.533e+00, -5.080e-01, -9.800e-01,  7.600e-01,
        8.160e-01, -1.000e-03, -8.250e-01, -5.970e-01,  1.484e+00,
       -1.060e+00])

Without Marital

In [97]:
# Categorical features with 'marital' left out
cat_no_mar = [
    'job',
    'education',
    'housing',
    'contact',
    'month',
    'poutcome',
]

In [98]:
# Training records without the 'marital' column
train_dicts_no_mar = (
    df_train[cat_no_mar + numerical]
    .to_dict(orient='records')
)

In [99]:
# Validation records without the 'marital' column
val_dicts_no_mar = (
    df_val[cat_no_mar + numerical]
    .to_dict(orient='records')
)

In [100]:
# Separate vectorizer for the feature set without 'marital'
dv_no_mar = DictVectorizer(sparse=False)


In [101]:
# Fit dv_no_mar on the training records and build the training feature matrix
X_train_no_mar = dv_no_mar.fit_transform(train_dicts_no_mar)

In [102]:
# Same estimator settings as the baseline, trained without 'marital'.
# Rebinds `model`, replacing the previously fitted estimator.
model = LogisticRegression(
    solver='liblinear',
    C=1.0,
    max_iter=1000,
    random_state=42,
)
model.fit(X_train_no_mar, y_train)

In [103]:
# Encode validation records with the vectorizer fitted on the no-marital training data
X_val_no_mar = dv_no_mar.transform(val_dicts_no_mar)

In [104]:
# Second predict_proba column from the model trained without 'marital'
y_pred_no_mar = model.predict_proba(X_val_no_mar)[:,1]

In [105]:
# Positive prediction when the probability reaches 0.5
decision_no_mar = y_pred_no_mar >= 0.5

In [106]:
# Validation accuracy without 'marital'
np.mean(y_val == decision_no_mar)

0.9009068790090687

In [108]:

# Compare every reduced model against the *unrounded* baseline accuracy.
# `orig_acc` is rounded to 2 decimals for the Q4 answer; subtracting from that
# rounded value would distort differences that are on the order of 1e-4.
baseline_acc = (y_val == decision).mean()
print(baseline_acc)
print("No Age", baseline_acc - (y_val == decision_no_age).mean())
print("No Bal", baseline_acc - (y_val == decision_no_bal).mean())
print("No Previous", baseline_acc - (y_val == decision_no_prev).mean())
print("No Marital", baseline_acc - (y_val == decision_no_mar).mean())

0.9009068790090687
No Age -0.00044238000442387015
No Bal -0.0001105950011059953
No Previous 0.0
No Marital 0.0


## **Question 6**

Now let's train a regularized logistic regression.

Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].

Train models using all the features as in Q4.

Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

In [110]:
# Refit the Q4 model for each candidate C and report its validation accuracy
for c in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(
        solver='liblinear',
        C=c,
        max_iter=1000,
        random_state=42,
    )
    model.fit(X_train, y_train)

    y_pred_c = model.predict_proba(X_val)[:, 1]
    decision_c = y_pred_c >= 0.5

    print(c, np.mean(y_val == decision_c))

0.01 0.8979208139792081
0.1 0.9007962840079629
1 0.9009068790090687
10 0.9009068790090687
100 0.9006856890068569
