In [10]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report

In [12]:
# Load the semicolon-delimited bank marketing CSV.
csv_path = "/content/bank.csv"
df = pd.read_csv(csv_path, sep=';')

# Encode the target as 1 ("yes") / 0 ("no"). Series.map converts every value
# that is not a key of the dict into NaN without complaining, so check for
# that here instead of letting a NaN target surface later in the split/fit.
label_map = {"yes": 1, "no": 0}
y_encoded = df["y"].map(label_map)
if y_encoded.isna().any():
    unexpected = df.loc[y_encoded.isna(), "y"].unique().tolist()
    raise ValueError(f"Unexpected target labels in column 'y': {unexpected}")
df["y"] = y_encoded

# Features are every column except the target.
X = df.drop(columns=["y"])
y = df["y"]

# Hold out 20% of the rows; stratify on y so both splits keep the same class
# ratio, and fix the seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [13]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
cat_cols = list(X.select_dtypes(include=["object"]).columns)
num_cols = list(X.select_dtypes(exclude=["object"]).columns)

# Numeric branch: fill gaps with the column median, then standardize.
numeric_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then one-hot
# encode. handle_unknown="ignore" keeps transform from raising on categories
# that were not present when the encoder was fitted.
categorical_transformer = Pipeline([
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# Route each column group through its own branch; numeric output columns come
# first, followed by the one-hot columns.
preprocessor = ColumnTransformer([
    ("num", numeric_transformer, num_cols),
    ("cat", categorical_transformer, cat_cols),
])

In [15]:
# Build the full model pipeline in this cell. No surviving cell defined `clf`,
# so a fresh "Restart Kernel -> Run All" failed with NameError on the fit call.
# The step names "preprocess" and "model" are the ones the coefficient
# inspection cell looks up via clf.named_steps.
# NOTE(review): the original estimator settings are not visible in the
# notebook. class_weight="balanced" and max_iter=1000 are assumptions (the
# recorded recall on the minority class suggests class re-weighting) --
# confirm against the original run before relying on the printed numbers.
clf = Pipeline(steps=[
    ("preprocess", preprocessor),
    ("model", LogisticRegression(max_iter=1000, class_weight="balanced")),
])

# Fit on the training split only, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [16]:
# Score the held-out predictions: raw error counts first, then the per-class
# precision / recall / F1 table with the two classes shown as "No" / "Yes".
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

report = classification_report(y_test, y_pred, target_names=["No", "Yes"])
print("\nClassification Report:\n")
print(report)

Confusion Matrix:
 [[664 137]
 [ 23  81]]

Classification Report:

              precision    recall  f1-score   support

          No       0.97      0.83      0.89       801
         Yes       0.37      0.78      0.50       104

    accuracy                           0.82       905
   macro avg       0.67      0.80      0.70       905
weighted avg       0.90      0.82      0.85       905



In [18]:
# Rebuild the feature names in the order the ColumnTransformer emits them:
# the numeric columns first, then the expanded one-hot columns.
preprocess_step = clf.named_steps["preprocess"]
ohe = preprocess_step.named_transformers_["cat"].named_steps["onehot"]
onehot_names = ohe.get_feature_names_out(cat_cols)
feature_names = np.concatenate([num_cols, onehot_names])

# Flatten the fitted coefficient array to one weight per feature.
coef = clf.named_steps["model"].coef_.ravel()

# Sort ascending, flip to descending, keep the three largest (most positive)
# weights -- the features that push predictions hardest toward 'Yes'.
top3_idx = np.argsort(coef)[::-1][:3]
print("\nTop 3 features most strongly influencing 'Yes':")
for rank, i in enumerate(top3_idx, start=1):
    name, weight = feature_names[i], coef[i]
    print(f"{rank}. {name} (coef = {weight:.4f})")


Top 3 features most strongly influencing 'Yes':
1. month_oct (coef = 1.8044)
2. poutcome_success (coef = 1.6752)
3. month_mar (coef = 1.6407)
