In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score, accuracy_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

In [2]:
df = pd.read_csv("data/bank-full.csv", delimiter=";")
df

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,unknown,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,unknown,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,unknown,no


In [3]:
cols = ["age", "job", "marital", "education", "balance", "housing", "contact",
        "day", "month", "duration", "campaign", "pdays", "previous", "poutcome", "y"]
df = df[cols]

In [4]:
mf = df.isnull().sum()
mf[mf > 0]

Series([], dtype: int64)

### Question 1

What is the most frequent observation (mode) for the column `education`?

- `unknown`
- `primary`
- `secondary`
- `tertiary`


In [5]:
df.groupby('education').size()

education
primary       6851
secondary    23202
tertiary     13301
unknown       1857
dtype: int64

### Question 2

Create the [correlation matrix](https://www.google.com/search?q=correlation+matrix) for the numerical features of your dataset. 
In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

- `age` and `balance`
- `day` and `campaign`
- `day` and `pdays`
- `pdays` and `previous`

### Target encoding

* Now we want to encode the `y` variable.
* Let's replace the values `yes`/`no` with `1`/`0`.

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the `train_test_split` function) and set the seed to `42`.
* Make sure that the target value `y` is not in your dataframe.


In [6]:
ndf = df.select_dtypes(include=['number'])
corr_m = ndf.corr()
corr_m

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [7]:
corr_m = corr_m.mask(np.eye(len(corr_m), dtype=bool))
corr_m

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,


In [8]:
print(corr_m.abs().stack().max(axis=0))
print(corr_m.abs().stack().idxmax(axis=0))

0.4548196354805043
('pdays', 'previous')


In [9]:
corr_m_lower = corr_m.where(np.triu(np.ones(corr_m.shape), k=1).astype(bool))
print(corr_m_lower.stack().idxmax())
print(corr_m_lower.stack().max())

('pdays', 'previous')
0.4548196354805043


In [10]:
df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)
df.iloc[:, -1]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['y'] = df['y'].map({'yes': 1, 'no': 0}).astype(int)


0        0
1        0
2        0
3        0
4        0
        ..
45206    1
45207    1
45208    1
45209    0
45210    0
Name: y, Length: 45211, dtype: int64

In [11]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
y_train = df_train['y']
df_train = df_train.drop('y', axis=1)
y_val = df_val['y']
df_val = df_val.drop('y', axis=1)
y_test = df_test['y']
df_test = df_test.drop('y', axis=1)
len(df_train), len(df_val), len(df_test)

(27126, 9042, 9043)

### Question 3

* Calculate the mutual information score between `y` and other categorical variables in the dataset. Use the training set only.
* Round the scores to 2 decimals using `round(score, 2)`.

Which of these variables has the biggest mutual information score?
  
- `contact`
- `education`
- `housing`
- `poutcome`


In [12]:
cols_wo_y = cols[:-1]
scores = [mutual_info_score(df_train[col], y_train) for col in cols_wo_y]
max(zip(cols_wo_y, scores), key=lambda x: x[1])

('balance', 0.11661461717469257)

In [13]:
q3_cols = ['contact', 'education', 'housing', 'poutcome']
scores = [mutual_info_score(df_train[col], y_train) for col in q3_cols]
max(zip(q3_cols, scores), key=lambda x: x[1])

('poutcome', 0.029532821290436224)

### Question 4

* Now let's train a logistic regression.
* Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.
* Fit the model on the training dataset.
    - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    - `model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)`
* Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

- 0.6
- 0.7
- 0.8
- 0.9


In [14]:
def one_hot_encoding(df, dv, train=True):
    x_dict = df.to_dict(orient='records')
    if train:
        return dv.fit_transform(x_dict)
    else:
        return dv.transform(x_dict)

In [15]:
def accuracy_scoring(df_train, df_val, y_train, y_val, C=1.0):
    dv = DictVectorizer(sparse=False)
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    x_train = one_hot_encoding(df_train, dv)
    model.fit(x_train, y_train)
    x_val = one_hot_encoding(df_val, dv, train=False)
    y_val_pred = model.predict(x_val)
    return (y_val == y_val_pred).mean()

In [16]:
org_score = accuracy_scoring(df_train, df_val, y_train, y_val)
org_score

0.9011280690112807

In [17]:
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
dv = DictVectorizer(sparse=False)
x_train = one_hot_encoding(df_train, dv)
model.fit(x_train, y_train)
x_val = one_hot_encoding(df_val, dv, train=False)
y_val_pred = model.predict(x_val)
accuracy_score(y_val, y_val_pred)

0.9011280690112807

### Question 5 

* Let's find the least useful feature using the *feature elimination* technique.
* Train a model using the same features and parameters as in Q4 (without rounding).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature. 

Which of following feature has the smallest difference?

- `age`
- `balance`
- `marital`
- `previous`

> **Note**: The difference doesn't have to be positive.


In [18]:
q5_cols = ['age', 'balance', 'marital', 'previous']

In [19]:
scores = []
for c in q5_cols:
    #a = [col for col in df_train.columns if col != c]
    df_train_a = df_train.drop(c, axis=1)
    df_val_a = df_val.drop(c, axis=1)
    scores.append(accuracy_scoring(df_train_a, df_val_a, y_train, y_val))

In [20]:
score_diff = [abs(org_score - s) for s in scores]
min(zip(q5_cols, score_diff), key=lambda x: x[1])

('previous', 0.0)

In [21]:
(q5_cols, score_diff)

(['age', 'balance', 'marital', 'previous'],
 [0.00011059500110588427, 0.00044238000442387015, 0.00022119000221187957, 0.0])

In [22]:
scores = []
for c in cols[:-1]:
    #a = [col for col in df_train.columns if col != c]
    df_train_a = df_train.drop(c, axis=1)
    df_val_a = df_val.drop(c, axis=1)
    scores.append(accuracy_scoring(df_train_a, df_val_a, y_train, y_val))


In [23]:
arrs = list(zip(cols[:-1], scores, [abs(org_score - s) for s in scores]))

In [24]:
pd.DataFrame(arrs, columns=['feature','accuracy_wo','accuracy_diff']).sort_values('accuracy_diff')

Unnamed: 0,feature,accuracy_wo,accuracy_diff
12,previous,0.901128,0.0
0,age,0.901239,0.000111
3,education,0.901017,0.000111
11,pdays,0.901017,0.000111
2,marital,0.901349,0.000221
7,day,0.901349,0.000221
1,job,0.900796,0.000332
6,contact,0.900796,0.000332
4,balance,0.900686,0.000442
5,housing,0.900464,0.000664


### Question 6

* Now let's train a regularized logistic regression.
* Let's try the following values of the parameter `C`: `[0.01, 0.1, 1, 10, 100]`.
* Train models using all the features as in Q4.
* Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these `C` leads to the best accuracy on the validation set?

- 0.01
- 0.1
- 1
- 10
- 100

> **Note**: If there are multiple options, select the smallest `C`.

## Submit the results

* Submit your results here: https://courses.datatalks.club/ml-zoomcamp-2024/homework/hw03
* If your answer doesn't match options exactly, select the closest one

In [25]:
C_list = [0.01, 0.1, 1, 10, 100]
scores = [accuracy_scoring(df_train, df_val, y_train, y_val, C=i) for i in C_list]
c_df = pd.DataFrame(list(zip(C_list, scores)), columns=['C', 'accuracy'])

In [26]:
c_df.sort_values('accuracy', ascending=False)

Unnamed: 0,C,accuracy
2,1.0,0.901128
1,0.1,0.900907
4,100.0,0.900575
3,10.0,0.89969
0,0.01,0.897589


In [32]:
scores

{0.01: 0.898, 0.1: 0.901, 1: 0.901, 10: 0.9, 100: 0.901}

In [31]:
max(scores, key=scores.get)

0.1

In [27]:
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

In [29]:
scores = {}
for C in [0.01, 0.1, 1, 10, 100]:
    model = LogisticRegression(solver='liblinear', max_iter=1000, C=C, random_state=42)
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_val)
    
    score = accuracy_score(y_val, y_pred)
    scores[C] = round(score, 3)
    print(f'C = {C}:\t Accuracy = {score}')

C = 0.01:	 Accuracy = 0.8975890289758903
C = 0.1:	 Accuracy = 0.9009068790090687
C = 1:	 Accuracy = 0.9011280690112807
C = 10:	 Accuracy = 0.8996903339969033
C = 100:	 Accuracy = 0.900575094005751


In [30]:
print(f'The smallest `C` is {max(scores, key=scores.get)}.')

The smallest `C` is 0.1.
