In [82]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Load the dataset
df = pd.read_csv('bank-full.csv', sep=';')

# Select specified columns
columns = ['age', 'job', 'marital', 'education', 'balance', 'housing', 'contact', 
           'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome', 'y']
df = df[columns]

# Check for missing values
print("Missing values:\n", df.isna().sum())

Missing values:
 age          0
job          0
marital      0
education    0
balance      0
housing      0
contact      0
day          0
month        0
duration     0
campaign     0
pdays        0
previous     0
poutcome     0
y            0
dtype: int64


In [None]:
# Question 1: Find the most frequent value (mode) of 'education'
education_mode = df['education'].mode()[0]
print("Q1: Mode of education:", education_mode)  

Q1: Mode of education: secondary


In [None]:
# Question 2: Find the pair of numerical features with the highest correlation
numerical_features = ['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']
corr_matrix = df[numerical_features].corr()
corr_matrix_unstack = corr_matrix.abs().unstack()
corr_matrix_unstack = corr_matrix_unstack[corr_matrix_unstack < 1.0]  
max_corr_pair = corr_matrix_unstack.idxmax()
print("Q2: Features with highest correlation:", max_corr_pair) 

Q2: Features with highest correlation: ('pdays', 'previous')


In [85]:
# Encode target variable y: yes -> 1, no -> 0
df['y'] = df['y'].map({'yes': 1, 'no': 0})

# Separate features and target
X = df.drop('y', axis=1)
y = df['y']

# Split data: 60% train, 20% validation, 20% test
X_temp, X_test, y_temp, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, test_size=0.25, random_state=42)

In [86]:
# Question 3: Calculate mutual information for categorical variables
categorical_features = ['job', 'marital', 'education', 'housing', 'contact', 'month', 'poutcome']
X_train_encoded = X_train[categorical_features].copy()
for col in categorical_features:
    le = LabelEncoder()
    X_train_encoded[col] = le.fit_transform(X_train[col].astype(str))

mi_scores = mutual_info_classif(X_train_encoded, y_train, random_state=42)
mi_scores = dict(zip(categorical_features, np.round(mi_scores, 2)))
max_mi_feature = max(mi_scores, key=mi_scores.get)
print("Q3: Feature with highest mutual information:", max_mi_feature, mi_scores[max_mi_feature])  

Q3: Feature with highest mutual information: poutcome 0.04


In [87]:
# Prepare data for logistic regression with one-hot encoding
X_train_full = pd.get_dummies(X_train, columns=categorical_features)
X_val_full = pd.get_dummies(X_val, columns=categorical_features)
X_test_full = pd.get_dummies(X_test, columns=categorical_features)

# Align columns to ensure same features in train, validation, and test
X_val_full = X_val_full.reindex(columns=X_train_full.columns, fill_value=0)
X_test_full = X_test_full.reindex(columns=X_train_full.columns, fill_value=0)

In [None]:
# Question 4: Train logistic regression and calculate validation accuracy
model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
model.fit(X_train_full, y_train)
val_pred = model.predict(X_val_full)
val_accuracy = round(accuracy_score(y_val, val_pred), 2)
print("Q4: Validation accuracy:", val_accuracy)  

Q4: Validation accuracy: 0.9


In [None]:
# Question 5: Find the least important feature via feature elimination
baseline_accuracy = val_accuracy
features_to_test = ['age', 'balance', 'marital', 'previous']
accuracy_diffs = {}

for feature in features_to_test:
 
    if feature in categorical_features:
        drop_cols = [col for col in X_train_full.columns if col.startswith(feature + '_')]
    else:
        drop_cols = [feature]
    
    X_train_drop = X_train_full.drop(columns=drop_cols)
    X_val_drop = X_val_full.drop(columns=drop_cols)
    
    # Train model without the feature
    model = LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42)
    model.fit(X_train_drop, y_train)
    val_pred_drop = model.predict(X_val_drop)
    accuracy_drop = accuracy_score(y_val, val_pred_drop)
    
    # Calculate difference
    diff = baseline_accuracy - accuracy_drop
    accuracy_diffs[feature] = diff

least_important_feature = min(accuracy_diffs, key=lambda k: abs(accuracy_diffs[k]))
print("Q5: Least important feature:", least_important_feature, "with difference:", accuracy_diffs[least_important_feature]) 

Q5: Least important feature: marital with difference: -0.00013271400132708333


In [None]:
# Question 6: Test different C values for logistic regression
C_values = [0.01, 0.1, 1, 10, 100]
accuracies = {}

for C in C_values:
    model = LogisticRegression(solver='liblinear', C=C, max_iter=1000, random_state=42)
    model.fit(X_train_full, y_train)
    val_pred = model.predict(X_val_full)
    accuracy = round(accuracy_score(y_val, val_pred), 3)
    accuracies[C] = accuracy

best_C = min([C for C in C_values if accuracies[C] == max(accuracies.values())])
print("Q6: Best C value:", best_C, "with accuracy:", accuracies[best_C])  

Q6: Best C value: 0.1 with accuracy: 0.901


In [91]:
# Display all answers
print("Answers to all questions:")
print("Q1: Mode of education:", education_mode)
print("Q2: Features with highest correlation:", max_corr_pair)
print("Q3: Feature with highest mutual information:", max_mi_feature, "with score:", mi_scores[max_mi_feature])
print("Q4: Validation accuracy:", val_accuracy)
print("Q5: Least important feature:", least_important_feature, "with difference:", accuracy_diffs[least_important_feature])
print("Q6: Best C value:", best_C, "with accuracy:", accuracies[best_C])

Answers to all questions:
Q1: Mode of education: secondary
Q2: Features with highest correlation: ('pdays', 'previous')
Q3: Feature with highest mutual information: poutcome with score: 0.04
Q4: Validation accuracy: 0.9
Q5: Least important feature: marital with difference: -0.00013271400132708333
Q6: Best C value: 0.1 with accuracy: 0.901
