In [5]:
# Ignore all warnings
import warnings
warnings.filterwarnings("ignore")

# Data imports
import pandas as pd
import numpy as np
from tqdm import tqdm

# Visualization imports
from ydata_profiling import ProfileReport
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [8, 4]
import plotly.express as px
import plotly
plotly.offline.init_notebook_mode(connected=True)
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import seaborn as sns


# Preprocessing
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_val_score, train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [7]:
# Load the raw Bank Marketing data; the file is semicolon-delimited, hence sep=';'
df = pd.read_csv('bank-full.csv', sep=';')

### **Bank Client Data**

The dataset contains information about bank clients. Here is the information organized in a table format:

#### **Client Information**
| Attribute | Description                   | Type        | Possible Values                                                                 |
|-----------|-------------------------------|-------------|---------------------------------------------------------------------------------|
| **Age**       | Client age                    | Numeric     |                                                                                 |
| **Job**       | Type of job                   | Categorical | *admin.*, *unknown*, *unemployed*, *management*, *housemaid*, *entrepreneur*, *student*, *blue-collar*, *self-employed*, *retired*, *technician*, *services* |
| **Marital**   | Marital status                | Categorical | *married*, *divorced* (means divorced or widowed), *single*                                                |
| **Education** | Education level               | Categorical | *unknown*, *secondary*, *primary*, *tertiary*                                   |
| **Default**   | Has credit in default?        | Binary      | *yes*, *no*                                                                     |
| **Balance**   | Average yearly balance, euros | Numeric     |                                                                                 |
| **Housing**   | Has housing loan?             | Binary      | *yes*, *no*                                                                     |
| **Loan**      | Has personal loan?            | Binary      | *yes*, *no*                                                                     |

#### **Related with the last contact of the current campaign**
| Attribute | Description                   | Type        | Possible Values                                                                 |
|-----------|-------------------------------|-------------|---------------------------------------------------------------------------------|
| **Contact**   | Contact communication type    | Categorical | *unknown*, *telephone*, *cellular*                                             |
| **Day**       | Last contact day of the month | Numeric     |                                                                                 |
| **Month**     | Last contact month of year    | Categorical | *jan*, *feb*, *mar*, *apr*, *may*, *jun*, *jul*, *aug*, *sep*, *oct*, *nov*, *dec*                                                |
| **Duration**  | Last contact duration, seconds| Numeric     |                                                                                 |

#### **Other attributes**
| Attribute | Description                   | Type        | Possible Values                                                                 |
|-----------|-------------------------------|-------------|---------------------------------------------------------------------------------|
| **Campaign**  | Number of contacts in campaign| Numeric     |                                                                                 |
| **Pdays**     | Days since last contact       | Numeric     | -1 means not previously contacted                                             |
| **Previous**  | Number of contacts prior to this campaign | Numeric |                                                                         |
| **Poutcome**  | Outcome of previous marketing campaign  | Categorical | *unknown*, *other*, *failure*, *success*                                       |

#### **Output variable (desired target)**
| Attribute | Description                   | Type        | Possible Values                                                                 |
|-----------|-------------------------------|-------------|---------------------------------------------------------------------------------|
| **Y**  | Has the client subscribed to a term deposit? | Binary     |  *yes*, *no*  |


In [3]:
# Build a profiling report for the raw dataframe.
# `interactions` and `duplicates` are passed as None -- presumably to skip those
# sections of the report; confirm against the ydata_profiling documentation.
profile = ProfileReport(
    df,
    title='Bank Marketing Profiling Report',
    interactions=None,
    duplicates=None
)

# Render the report as notebook widgets.
# Alternative rendering kept for reference: profile.to_notebook_iframe()
profile.to_widgets()

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

In [8]:
# Define columns
# `pdays` goes through an OrdinalEncoder whose category list is the sorted set
# of values observed in the full dataframe, i.e. each value is replaced by its
# rank among the observed values.
ordinal_feature = ['pdays']
nominal_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'poutcome']
numeric_features = ['age', 'balance', 'day', 'duration', 'campaign', 'previous']

# Create a ColumnTransformer for preprocessing.
# Output column order follows the transformer order: ordinal, nominal, numerical.
preprocessor = ColumnTransformer(
    transformers=[
        ('ordinal', OrdinalEncoder(categories=[sorted(df.pdays.unique())]), ordinal_feature),
        ('nominal', OneHotEncoder(drop='first'), nominal_features),
        ('numerical', StandardScaler(), numeric_features)
    ],
)

# Encode Y
# Define the mapping
mapping = {'yes': 1, 'no': 0}
# Apply the mapping only while the column still holds the raw 'yes'/'no' labels.
# Without this guard, re-running the cell would push the already-encoded 0/1
# values through the dict and turn the whole target column into NaN.
if not pd.api.types.is_numeric_dtype(df['y']):
    df['y'] = df['y'].map(mapping)

# SPLIT data: 80/20 hold-out, stratified on the target so both parts keep the
# same class ratio; seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    df.drop('y', axis=1),
    df['y'],
    test_size=0.2,
    stratify=df['y'],
    random_state=42,
)

In [9]:
# Create a list of model objects
models = [
    ('Random Forest', RandomForestClassifier()),
    ('XGBoost', XGBClassifier()),
    ('Logistic Regression', LogisticRegression()),
    ('SVC', SVC()),
    ('K Neighbors', KNeighborsClassifier())
]

In [10]:
# Define the KFold for cross-validation
split = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in tqdm(models):
    # Compose data preprocessing and model into a single pipeline
    steps = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('model', model)
    ])
    
    # Compute cross-validation accuracy for each model
    cv_results = cross_val_score(steps, X_train, y_train, cv=split)
    
    # Outputs rounded to 4 decimal places
    min_score = round(cv_results.min(), 4)
    max_score = round(cv_results.max(), 4)
    mean_score = round(cv_results.mean(), 4)
    std_dev = round(cv_results.std(), 4)
    
    print(f"[{name}] Cross Validation Accuracy Score: {mean_score} +/- {std_dev} (std) min: {min_score}, max: {max_score}")

 20%|██        | 1/5 [01:21<05:25, 81.50s/it]

[Random Forest] Cross Validation Accuracy Score: 0.9041 +/- 0.002 (std) min: 0.9016, max: 0.907


 40%|████      | 2/5 [01:26<01:49, 36.44s/it]

[XGBoost] Cross Validation Accuracy Score: 0.9045 +/- 0.0027 (std) min: 0.8995, max: 0.907


 60%|██████    | 3/5 [01:28<00:41, 20.55s/it]

[Logistic Regression] Cross Validation Accuracy Score: 0.9014 +/- 0.0029 (std) min: 0.8981, max: 0.905


 80%|████████  | 4/5 [02:56<00:47, 47.39s/it]

[SVC] Cross Validation Accuracy Score: 0.8829 +/- 0.0002 (std) min: 0.8825, max: 0.8832


100%|██████████| 5/5 [03:20<00:00, 40.09s/it]

[K Neighbors] Cross Validation Accuracy Score: 0.8922 +/- 0.0047 (std) min: 0.8843, max: 0.8976





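XGBoost wins by a small margin: mean CV accuracy 0.9045 vs. 0.9041 for Random Forest, a gap smaller than either model's standard deviation across folds.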

In [11]:
# Fit the preprocessing on the training split only, then apply it to both splits.
# The variables are overwritten in place, so each step is guarded: on a re-run
# the already-transformed array is left alone instead of being fed back into a
# ColumnTransformer that selects columns by name.
# NOTE(review): the hyper-parameter search below cross-validates on data whose
# preprocessing was fitted on the whole training split.
if isinstance(X_train, pd.DataFrame):
    X_train = preprocessor.fit_transform(X_train)
if isinstance(X_test, pd.DataFrame):
    X_test = preprocessor.transform(X_test)

In [12]:
# Candidate values for the XGBoost hyper-parameter search.
# Every entry is a finite sequence, so candidates are drawn from a discrete grid.
param_dist = dict(
    n_estimators=range(50, 500, 50),             # 50, 100, ..., 450
    learning_rate=[0.01, 0.1, 0.2, 0.3],
    max_depth=range(3, 10),                      # 3 .. 9
    min_child_weight=range(1, 6),                # 1 .. 5
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.7, 0.8, 0.9, 1.0],
    gamma=[0, 0.1, 0.2, 0.3, 0.4],
    reg_alpha=[0, 0.1, 0.2, 0.3],
    reg_lambda=[0, 0.1, 0.2, 0.3],
)


In [13]:
# Base XGBoost classifier, seeded for reproducibility
xgb = XGBClassifier(random_state=42)

# Randomized hyper-parameter search over `param_dist`
random_search = RandomizedSearchCV(
    xgb,
    param_dist,
    n_iter=100,          # number of sampled candidates
    cv=5,                # folds per candidate
    scoring='accuracy',
    random_state=42,     # seeds the candidate sampling
    n_jobs=-1,
    verbose=1,
)


In [14]:
# Run the randomized search on the preprocessed training split
random_search.fit(X_train, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [15]:
# Pull the winning model and its hyper-parameters out of the finished search
best_estimator = random_search.best_estimator_
best_params = random_search.best_params_

# Show the chosen hyper-parameters, then the estimator itself as the cell output
print(best_params)
best_estimator

{'subsample': 1.0, 'reg_lambda': 0.3, 'reg_alpha': 0, 'n_estimators': 150, 'min_child_weight': 4, 'max_depth': 5, 'learning_rate': 0.1, 'gamma': 0.3, 'colsample_bytree': 0.7}


In [17]:
# Predict class labels for the held-out test split with the tuned model
y_pred = best_estimator.predict(X_test)

In [21]:
# Generate and print the classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)

Classification Report:
               precision    recall  f1-score   support

           0       0.93      0.97      0.95      7985
           1       0.66      0.45      0.53      1058

    accuracy                           0.91      9043
   macro avg       0.79      0.71      0.74      9043
weighted avg       0.90      0.91      0.90      9043



In [36]:
# Create an interactive heatmap using Plotly
fig = px.imshow(cm, x=best_estimator.classes_, y=best_estimator.classes_,
                labels=dict(x="Predicted Labels", y="True Labels", color="Count"),
                title="Confusion Matrix")
fig.update_layout(width=800, height=500)

# Set count values on tiles
fig.update_traces(texttemplate='%{text}', text=cm.tolist())

# Display the interactive plot
fig.show()

In [53]:
# Get feature importances
importances = best_estimator.feature_importances_

# Get feature names
feature_names = list(ordinal_feature+preprocessor.transformers_[1][1].get_feature_names_out().tolist()+numeric_features)  # Replace with your actual feature names

# Create a DataFrame with feature names and importances
importance_df = pd.DataFrame({'Feature': feature_names, 'Importance': importances})

# Sort the DataFrame by importance
importance_df = importance_df.sort_values(by='Importance', ascending=False)

# Create an interactive bar plot using Plotly
fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h', title='Feature Importances from XGBoost Model')

# Display the interactive plot
fig.show()

In [59]:
import shap

In [60]:
# Create an explainer using SHAP
explainer = shap.Explainer(best_estimator)

# Calculate SHAP values for the testing data
shap_values = explainer.shap_values(X_test)

# Visualize feature importances using a summary plot
shap.summary_plot(shap_values, X_test, feature_names=feature_names)