In [1]:
# ============================
# 1. Import libraries
# ============================
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [2]:
df = pd.read_csv("bank-additional-full.csv", sep=";")

print("Dataset shape:", df.shape)
print(df.head())


Dataset shape: (41188, 21)
   age        job  marital    education  default housing loan    contact  \
0   56  housemaid  married     basic.4y       no      no   no  telephone   
1   57   services  married  high.school  unknown      no   no  telephone   
2   37   services  married  high.school       no     yes   no  telephone   
3   40     admin.  married     basic.6y       no      no   no  telephone   
4   56   services  married  high.school       no      no  yes  telephone   

  month day_of_week  ...  campaign  pdays  previous     poutcome emp.var.rate  \
0   may         mon  ...         1    999         0  nonexistent          1.1   
1   may         mon  ...         1    999         0  nonexistent          1.1   
2   may         mon  ...         1    999         0  nonexistent          1.1   
3   may         mon  ...         1    999         0  nonexistent          1.1   
4   may         mon  ...         1    999         0  nonexistent          1.1   

   cons.price.idx  cons.conf.

In [3]:
# ============================================
# 3. Encode target (yes/no -> 1/0)
# ============================================
df['y'] = df['y'].map({'yes': 1, 'no': 0})

print("Target distribution:\n", df['y'].value_counts())


Target distribution:
 0    36548
1     4640
Name: y, dtype: int64


In [21]:
# ============================================
# 4. Encode categorical variables
# ============================================
categorical_cols = df.select_dtypes(include=['object']).columns
df = pd.get_dummies(df, columns=categorical_cols, drop_first=True)

print("Dataset shape after encoding:", df.shape)


Dataset shape after encoding: (41188, 54)


In [22]:
# ============================================
# 5. Train-test split
# ============================================
X = df.drop("y", axis=1)
y = df["y"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)


In [23]:
# ============================================
# 6. Train Logistic Regression
# ============================================
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [24]:
# ============================================
# 7. Evaluation
# ============================================
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))

Accuracy: 0.9150279193979121

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.98      0.95      7310
           1       0.70      0.43      0.53       928

    accuracy                           0.92      8238
   macro avg       0.82      0.70      0.74      8238
weighted avg       0.90      0.92      0.91      8238


Confusion Matrix:
 [[7140  170]
 [ 530  398]]
