In [1]:
import pandas as pd
import numpy as np

### Preparing the dataset

In [2]:
!wget https://archive.ics.uci.edu/static/public/222/bank+marketing.zip

--2024-10-17 06:11:23--  https://archive.ics.uci.edu/static/public/222/bank+marketing.zip
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: unspecified
Saving to: ‘bank+marketing.zip’

bank+marketing.zip      [    <=>             ] 999.85K  1.20MB/s    in 0.8s    

2024-10-17 06:11:24 (1.20 MB/s) - ‘bank+marketing.zip’ saved [1023843]



In [3]:
!unzip bank+marketing.zip

Archive:  bank+marketing.zip
 extracting: bank.zip                
 extracting: bank-additional.zip     


In [4]:
!unzip bank.zip

Archive:  bank.zip
  inflating: bank-full.csv           
  inflating: bank-names.txt          
  inflating: bank.csv                


In [5]:
data = pd.read_csv("/content/bank-full.csv", sep=';')

In [6]:
# Select required columns
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 'day', 'month',
           'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
data = data[columns]

In [7]:
# Check missing values
data.isnull().sum()

Unnamed: 0,0
age,0
job,0
marital,0
education,0
balance,0
housing,0
contact,0
day,0
month,0
duration,0


In [8]:
# Get the mode of the 'education' column
print(f"The mode for the 'education' column is: {data['education'].mode()[0]}")

The mode for the 'education' column is: secondary


### Question 2

Create the correlation matrix for the numerical features of your dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features.

What are the two features that have the biggest correlation?

* age and balance
* day and campaign
* day and pdays
* pdays and previous

### Target encoding

* Now we want to encode the y variable.
* Let's replace the values yes/no with 1/0.

### Split the data

* Split your data in train/val/test sets with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value y is not in your dataframe.

In [9]:
df_numerical = data.copy().select_dtypes(include='number')

In [10]:
# Create the correlation matrix
correlation_matrix = df_numerical.corr()
correlation_matrix

Unnamed: 0,age,balance,day,duration,campaign,pdays,previous
age,1.0,0.097783,-0.00912,-0.004648,0.00476,-0.023758,0.001288
balance,0.097783,1.0,0.004503,0.02156,-0.014578,0.003435,0.016674
day,-0.00912,0.004503,1.0,-0.030206,0.16249,-0.093044,-0.05171
duration,-0.004648,0.02156,-0.030206,1.0,-0.08457,-0.001565,0.001203
campaign,0.00476,-0.014578,0.16249,-0.08457,1.0,-0.088628,-0.032855
pdays,-0.023758,0.003435,-0.093044,-0.001565,-0.088628,1.0,0.45482
previous,0.001288,0.016674,-0.05171,0.001203,-0.032855,0.45482,1.0


In [11]:
highest_corr = correlation_matrix.unstack().abs()[correlation_matrix.unstack().abs().lt(1)].idxmax()
print(f"The two features with the highest correlation are: {highest_corr}")

The two features with the highest correlation are: ('pdays', 'previous')


### Target encoding

In [12]:
# Replace the values `yes`/`no` with `1`/`0` in column y.
df = data.copy()
df['y'] = df['y'].map({'yes': '1', 'no': '0'})

### Split the data

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# split data into train, validation, and test (60%/20%/20%)

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

assert len(df) == (len(df_train) + len(df_val) + len(df_test))

In [15]:
# Reset_index for y_train, y_val and y_test.
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

# Define y_train, y_val and y_test.
y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

# Drop y from y_train, y_val and y_test.
df_train = df_train.drop('y', axis=1)
df_val = df_val.drop('y', axis=1)
df_test = df_test.drop('y', axis=1)

assert 'y' not in df_train.columns
assert 'y' not in df_val.columns
assert 'y' not in df_test.columns

In [16]:
print(len(df_train), len(df_val), len(df_test))

27126 9042 9043


### Question 3
Calculate the mutual information score between y and other categorical variables in the dataset. Use the training set only.

Round the scores to 2 decimals using round(score, 2).

Which of these variables has the biggest mutual information score?

* contact
* education
* housing
* poutcome

In [17]:
from sklearn.metrics import mutual_info_score

In [18]:
for col in ["contact", "education", "housing", "poutcome"]:
    print(round(mutual_info_score(y_train, df_train[col]), 2))

0.01
0.0
0.01
0.03


The variable with the highest mutual information score is poutcome with a score of 0.03.

### Question 4
Now let's train a logistic regression.

Remember that we have several categorical variables in the dataset. Include them using one-hot encoding.

Fit the model on the training dataset.

To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:

model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)

Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

What accuracy did you get?

* 0.6
* 0.7
* 0.8
* 0.9

In [19]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [20]:
# Derive X_train.
dv = DictVectorizer(sparse=False)
train_dict = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dict)

# Fit X_train and y_train to model.
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state = 42)
model.fit(X_train, y_train)

In [21]:
# Derive X_val.
val_dict = df_val.to_dict(orient='records')
X_val = dv.transform(val_dict)

# Obtain y_pred.
y_pred = model.predict(X_val)

In [22]:
# Calculate accuracy.
accuracy = np.round(accuracy_score(y_val, y_pred), 2)
print(f'Accuracy on the validation dataset: = {accuracy}')

Accuracy on the validation dataset: = 0.9


### Question 5
Let's find the least useful feature using the feature elimination technique.

Train a model with all these features (using the same parameters as in Q4).

Now exclude each feature from this set and train a model without it. Record the accuracy for each model.

For each feature, calculate the difference between the original accuracy and the accuracy without the feature.

Which of following feature has the smallest difference?

* age
* balance
* marital
* previous

In [23]:
#List the features
features = df_train.columns.to_list()
features

['age',
 'job',
 'marital',
 'education',
 'balance',
 'housing',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome']

In [24]:
# Store the original accuracy score of the model
original_accuracy = accuracy

# Create an empty DataFrame to store the results (eliminated feature, new accuracy, and the accuracy difference)
scores = pd.DataFrame(columns=['eliminate_ft', 'accuracy', 'difference'])

# Iterate over each feature to test its impact when removed
for feature in features:
    # Create a copy of the features and remove the current feature being tested
    subset = features.copy()
    subset.remove(feature)

    # Use DictVectorizer to convert the remaining features into a numeric format
    dv = DictVectorizer(sparse=False)
    train_dict = df_train[subset].to_dict(orient='records')
    X_train = dv.fit_transform(train_dict)

    # Train the Logistic Regression model with the modified feature set
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state = 42)
    model.fit(X_train, y_train)

    # Transform the validation data using the same feature set
    val_dict = df_val[subset].to_dict(orient='records')
    X_val = dv.transform(val_dict)

    # Make predictions on the validation set and calculate the accuracy
    y_pred = model.predict(X_val)
    score = accuracy_score(y_val, y_pred)

    # Record the feature that was removed, the new accuracy, and the difference compared to the original accuracy
    scores.loc[len(scores)] = [feature, score, original_accuracy - score]

In [25]:
# show scores
scores['difference'] = [abs(x) for x in scores['difference']]
scores

Unnamed: 0,eliminate_ft,accuracy,difference
0,age,0.901349,0.001349
1,job,0.901128,0.001128
2,marital,0.900907,0.000907
3,education,0.900907,0.000907
4,balance,0.901017,0.001017
5,housing,0.901128,0.001128
6,contact,0.900464,0.000464
7,day,0.901349,0.001349
8,month,0.899801,0.000199
9,duration,0.889737,0.010263


In [26]:
filtered_scores = scores[scores['eliminate_ft'].isin(['age', 'balance', 'marital', 'previous'])]
filtered_scores.sort_values(by='difference')

Unnamed: 0,eliminate_ft,accuracy,difference
2,marital,0.900907,0.000907
12,previous,0.900907,0.000907
4,balance,0.901017,0.001017
0,age,0.901349,0.001349


### Question 6
Now let's train a regularized logistic regression.

Let's try the following values of the parameter C: [0.01, 0.1, 1, 10, 100].

Train models using all the features as in Q4.

Calculate the accuracy on the validation dataset and round it to 3 decimal digits.

Which of these C leads to the best accuracy on the validation set?

* 0.01
* 0.1
* 1
* 10
* 100

Note: If there are multiple options, select the smallest C.

In [27]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

assert len(df) == (len(df_train) + len(df_val) + len(df_test))

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.y.values
y_val = df_val.y.values
y_test = df_test.y.values

df_train = df_train.drop('y', axis=1)
df_val = df_val.drop('y', axis=1)
df_test = df_test.drop('y', axis=1)

assert 'y' not in df_train.columns
assert 'y' not in df_val.columns
assert 'y' not in df_test.columns

In [32]:
# One-hot encode categorical variables in the training set
X_train_encoded = pd.get_dummies(df_train, drop_first=True)
X_val_encoded = pd.get_dummies(df_val, drop_first=True)

# Align the training and validation sets
X_train_encoded, X_val_encoded = X_train_encoded.align(X_val_encoded, join='left', axis=1, fill_value=0)

# Values of C to test
C_values = [0.01, 0.1, 1, 10, 100]

# Store the results
results = []

# Iterate over the values of C
for C in C_values:
    # Initialize and fit the logistic regression model with regularization
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_encoded, y_train)

    # Calculate accuracy on the validation dataset
    accuracy = accuracy_score(y_val, model.predict(X_val_encoded))

    # Round the accuracy to 3 decimal digits and store the result
    results.append((C, round(accuracy, 3)))

# Create a DataFrame from the results
df_results = pd.DataFrame(results, columns=["C", "Accuracy"])

# Identify the value of C with the best accuracy
best_accuracy_row = df_results.loc[df_results['Accuracy'].idxmax()]

# Print the results and the best C value
print(f"The best C value is: {best_accuracy_row['C']} with an accuracy of {best_accuracy_row['Accuracy']}")

The best C value is: 1.0 with an accuracy of 0.901
