In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
import matplotlib as plt
import numpy as np
import seaborn as sns

In [None]:
# Columns kept for the analysis; any other column in the CSV is discarded.
base = [
    'age', 'job', 'marital', 'education', 'balance', 'housing', 'contact',
    'day', 'month', 'duration', 'campaign', 'pdays', 'previous', 'poutcome',
    'y',
]

# The file is semicolon-delimited; keep only the `base` columns, in that order.
df = pd.read_csv('bank-full.csv', sep=';').loc[:, base]
df.head()

In [None]:
# Most frequent value(s) of `education`; `mode` returns a Series, not a scalar.
df.loc[:, 'education'].mode()

In [None]:
# Show the dtype pandas assigned to each column of `df`.
df.dtypes

In [None]:
# Columns of `df` used for the correlation matrix.
numerical = [
    'age', 'balance', 'day', 'duration',
    'campaign', 'pdays', 'previous',
]

# Pairwise Pearson correlation between the listed columns.
df.loc[:, numerical].corr(method='pearson')

In [None]:
# Encode the target as integers: 'yes' -> 1, 'no' -> 0.
#
# `map` is used instead of `replace` because replacing strings with ints in an
# object column relies on silent downcasting, which pandas 2.2 deprecates
# (FutureWarning). The 1/0 keys keep the cell re-runnable once the column has
# already been encoded. Any other label maps to NaN, so `astype(int)` raises
# instead of letting an unexpected value through.
df['y'] = df['y'].map({'yes': 1, 'no': 0, 1: 1, 0: 0}).astype(int)
df['y'].head()

In [None]:
# Separate the target column from the predictors.
y = df['y']
X = df.drop(columns=['y'])

# First cut: 60% for training, 40% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=42
)

# Second cut: halve the held-out part into validation and test (20% each).
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42
)

In [None]:
# Columns whose mutual information with the target is compared.
categorical = ['education', 'housing', 'contact', 'poutcome']


def calculate_mi(series):
    """Return the mutual information score between `series` and `y_train`."""
    return mutual_info_score(series, y_train)


# Score each listed training column, then rank from highest to lowest score.
df_mi = (
    pd.Series({name: calculate_mi(X_train[name]) for name in categorical})
    .sort_values(ascending=False)
    .to_frame(name='MI')
)

print(df_mi.head())
print(df_mi.tail())

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# One-hot encode every non-numeric predictor. The list is derived from the
# training frame because the transformer uses `remainder='passthrough'`: any
# string column left out of the list would reach the classifier unencoded.
categorical_columns = X_train.select_dtypes(exclude='number').columns.tolist()

preprocessor = ColumnTransformer(
    transformers=[
        # drop='first' removes one dummy column per feature.
        ('cat', OneHotEncoder(drop='first'), categorical_columns)
    ],
    remainder='passthrough',  # remaining columns are forwarded unchanged
)

# Preprocessing and classifier chained, so both are fitted on training data only.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='liblinear', C=1.0, max_iter=1000, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Validate the model
accuracy = model.score(X_val, y_val)
print(round(accuracy, 2))

In [None]:
from sklearn.base import clone

# Feature elimination: retrain without one feature at a time and record how
# much validation accuracy changes relative to the model using all features.
base_accuracy = model.score(X_val, y_val)
feature_accuracies = {}

for column in X_train.columns:
    X_train_temp = X_train.drop(columns=[column])
    X_val_temp = X_val.drop(columns=[column])

    # Fit an unfitted copy so the baseline `model` keeps its full-feature fit.
    reduced_model = clone(model)

    # The column transformer refers to columns by name, so the dropped feature
    # must also be removed from its column lists; otherwise fitting fails when
    # that feature is one of the encoded columns.
    # Assumes each transformer's column spec is a list of column names.
    reduced_transformers = [
        (name, transformer, [col for col in columns if col != column])
        for name, transformer, columns
        in reduced_model.named_steps['preprocessor'].transformers
    ]
    reduced_model.set_params(preprocessor__transformers=reduced_transformers)

    # Train model without this feature
    reduced_model.fit(X_train_temp, y_train)
    accuracy = reduced_model.score(X_val_temp, y_val)

    # Signed difference: positive means removing the feature hurt accuracy.
    feature_accuracies[column] = base_accuracy - accuracy

# Identify the feature with the smallest (signed) difference
print(min(feature_accuracies, key=feature_accuracies.get))