In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("bank-full.csv", delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
cols = ["age", "job", "marital", "education", "balance", "housing", "contact",
        "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]
df = df[cols]

In [4]:
mf = df.isnull().sum()
mf[mf > 0]

Unnamed: 0,0


# Q1
What is the most frequent observation (mode) for the column `education`

In [5]:
df.groupby('education').size()

Unnamed: 0_level_0,0
education,Unnamed: 1_level_1
primary,6851
secondary,23202
tertiary,13301
unknown,1857


# Q2
What are the two features that have the biggest correlation

In [6]:
ndf = df.select_dtypes(include=['number'])
corr_m = ndf.corr()
corr_m

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [7]:
corr_m = corr_m.mask(np.eye(len(corr_m), dtype=bool))
corr_m

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,


In [8]:
print(corr_m.abs().stack().max(axis=0))
print(corr_m.abs().stack().idxmax(axis=0))

0.4548196354805043
('pdays', 'previous')


In [None]:
corr_m_lower = corr_m.where(np.triu(np.ones(corr_m.shape), k=1).astype(bool))
print(corr_m_lower.stack().idxmax())
print(corr_m_lower.stack().max())

('pdays', 'previous')
0.4548196354805043


In [9]:
df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)
df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)


Unnamed: 0,y
0,0
1,0
2,0
3,0
4,0
...,...
45206,1
45207,1
45208,1
45209,0


In [10]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
y_train = df_train['y']
df_train = df_train.drop('y', axis=1)
y_val = df_val['y']
df_val = df_val.drop('y', axis=1)
y_test = df_test['y']
df_test = df_test.drop('y', axis=1)
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

# Q3
Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.

In [11]:
cols_wo_y = cols[:-1]
scores = [mutual_info_score(df_train[col], y_train) for col in cols_wo_y]
max(zip(cols_wo_y, scores), key=lambda x: x[1])

('balance', 0.11661461717469257)

In [12]:
q3_cols = ['contact', 'education', 'housing', 'poutcome']
scores = [mutual_info_score(df_train[col], y_train) for col in q3_cols]
max(zip(q3_cols, scores), key=lambda x: x[1])

('poutcome', 0.029532821290436224)

# Q4
Calculate the accuracy on the validation dataset

In [13]:
def one_hot_encoding(df, dv, train=True):
    x_dict = df.to_dict(orient='records')
    if train:
        return dv.fit_transform(x_dict)
    else:
        return dv.transform(x_dict)

In [14]:
def accuracy_scoring(df_train, df_val, y_train, y_val, C=1.0):
    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    x_train = one_hot_encoding(df_train, dv)
    model.fit(x_train, y_train)
    x_val = one_hot_encoding(df_val, dv, train=False)
    y_val_pred = model.predict(x_val)
    return (y_val == y_val_pred).mean()

In [15]:
org_score = accuracy_scoring(df_train, df_val, y_train, y_val)
org_score

0.9009068790090687

In [16]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
dv = DictVectorizer(sparse=False)
x_train = one_hot_encoding(df_train, dv)
model.fit(x_train, y_train)
x_val = one_hot_encoding(df_val, dv, train=False)
y_val_pred = model.predict(x_val)
accuracy_score(y_val, y_val_pred)

0.9009068790090687

# Q5
 feature has the smallest difference

In [17]:
q5_cols = ['age', 'balance', 'marital', 'previous']

In [18]:
scores = []
for c in q5_cols:
    #a = [col for col in df_train.columns if col != c]
    df_train_a = df_train.drop(c, axis=1)
    df_val_a = df_val.drop(c, axis=1)
    scores.append(accuracy_scoring(df_train_a, df_val_a, y_train, y_val))

In [19]:
score_diff = [abs(org_score - s) for s in scores]
min(zip(q5_cols, score_diff), key=lambda x: x[1])

('marital', 0.0)

In [20]:
(q5_cols, score_diff)

(['age', 'balance', 'marital', 'previous'],
 [0.00044238000442387015, 0.0001105950011059953, 0.0, 0.0])

In [21]:
scores = []
for c in cols[:-1]:
    #a = [col for col in df_train.columns if col != c]
    df_train_a = df_train.drop(c, axis=1)
    df_val_a = df_val.drop(c, axis=1)
    scores.append(accuracy_scoring(df_train_a, df_val_a, y_train, y_val))


In [22]:
arrs = list(zip(cols[:-1], scores, [abs(org_score - s) for s in scores]))

In [23]:
pd.DataFrame(arrs, columns=['feature','accuracy_wo','accuracy_diff']).sort_values('accuracy_diff')

Unnamed: 0,feature,accuracy_wo,accuracy_diff
2,marital,0.900907,0.0
3,education,0.900907,0.0
11,pdays,0.900907,0.0
12,previous,0.900907,0.0
4,balance,0.901017,0.000111
1,job,0.901128,0.000221
5,housing,0.901128,0.000221
6,contact,0.900464,0.000442
0,age,0.901349,0.000442
7,day,0.901349,0.000442


# Q6
Which of these C leads to the best accuracy on the validation set?

In [24]:
C_list = [0.01, 0.1, 1, 10, 100]
scores = [accuracy_scoring(df_train, df_val, y_train, y_val, C=i) for i in C_list]
c_df = pd.DataFrame(list(zip(C_list, scores)), columns=['C', 'accuracy'])

In [25]:
c_df.sort_values('accuracy', ascending=False)

Unnamed: 0,C,accuracy
2,1.0,0.900907
3,10.0,0.900907
1,0.1,0.900796
4,100.0,0.900686
0,0.01,0.897921
