In [74]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    average_precision_score,
    confusion_matrix)
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.base import clone
from scipy.stats import randint, uniform

In [75]:
df = pd.read_csv('../data/processed/bank_marketing_ml.csv')

In [58]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,poutcome_missing,target,pdays_contacted,has_previous_campaign
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45195 entries, 0 to 45194
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    45195 non-null  int64 
 1   job                    45195 non-null  object
 2   marital                45195 non-null  object
 3   education              45195 non-null  object
 4   default                45195 non-null  object
 5   balance                45195 non-null  int64 
 6   housing                45195 non-null  object
 7   loan                   45195 non-null  object
 8   contact                45195 non-null  object
 9   day                    45195 non-null  int64 
 10  month                  45195 non-null  object
 11  campaign               45195 non-null  int64 
 12  pdays                  45195 non-null  int64 
 13  previous               45195 non-null  int64 
 14  poutcome               45195 non-null  object
 15  y                  

In [48]:
df.target.value_counts()

target
0    39906
1     5289
Name: count, dtype: int64

In [76]:
numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

In [77]:
X = df[categorical_cols + numeric_cols].copy()
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [78]:
y_train.unique()

array([0, 1])

# Đánh giá Model Performance dựa trên 2 phương pháp xử lý class imbalance

## SMOTE

In [79]:
#Hàm preprocessing khi dùng smote
def build_tree_smote_components(cat_cols, num_cols, *, k_neighbors=5, random_state=42):
    def clip_only(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        return np.clip(X, lo, hi)

    # PRE-SMOTE: cat -> ordinal (kỹ thuật), num -> impute + clip
    pre_smote = ColumnTransformer(
        transformers=[
            ("cat_ord", Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ]), cat_cols),
            ("num", Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("clip", FunctionTransformer(clip_only, feature_names_out="one-to-one")),
            ]), num_cols),
        ],
        remainder="drop"
    )

    # cat indices là [0..len(cat_cols)-1] vì cat đặt trước num
    cat_idx = list(range(len(cat_cols)))

    smote = SMOTENC(
        categorical_features=cat_idx,
        k_neighbors=k_neighbors,
        random_state=random_state
    )

    # POST-SMOTE: onehot cat + passthrough num
    post = ColumnTransformer(
        transformers=[
            ("cat_oh", OneHotEncoder(handle_unknown="ignore"), cat_idx),
            ("num_passthrough", "passthrough", slice(len(cat_cols), None)),
        ],
        remainder="drop"
    )

    return pre_smote, smote, post


In [80]:
pre_smote, smote, post = build_tree_smote_components(
    categorical_cols, numeric_cols, k_neighbors=5, random_state=42
)

clf_gb_smote = ImbPipeline(steps=[
    ("pre_smote", pre_smote),
    ("smote", smote),
    ("post", post),
    ("model", GradientBoostingClassifier(
        random_state=42,
    ))
])

In [81]:
param_dist = {
    "smote__k_neighbors": [3, 5, 7],
    "model__n_estimators": randint(150, 501),      # 150..500
    "model__learning_rate": uniform(0.03, 0.12),   # 0.03..0.15
    "model__max_depth": [2, 3],
    "model__subsample": uniform(0.6, 0.4),         # 0.6..1.0
    "model__min_samples_leaf": [50, 100],
}

In [None]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=clf_gb_smote,
    param_distributions=param_dist,
    n_iter=25,                 # 25 combo -> 25*5=125 fits (nhanh hơn ~4x)
    scoring="average_precision",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    refit=True,
)

print(search.best_score_, search.best_params_)

Fitting 5 folds for each of 25 candidates, totalling 125 fits
0.4278683548416372 {'model__learning_rate': np.float64(0.14587584396894712), 'model__max_depth': 3, 'model__min_samples_leaf': 100, 'model__n_estimators': 414, 'model__subsample': np.float64(0.6063865008880857), 'smote__k_neighbors': 5}


In [87]:
best_model_smote = search.best_estimator_

y_pred_smote = best_model_smote.predict(X_test)
y_proba_smote = best_model_smote.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_smote))
print("ROC AUC:", roc_auc_score(y_test, y_proba_smote))
print("Average Precision:", average_precision_score(y_test, y_proba_smote))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7981
           1       0.58      0.29      0.39      1058

    accuracy                           0.89      9039
   macro avg       0.74      0.63      0.66      9039
weighted avg       0.87      0.89      0.88      9039

ROC AUC: 0.7963691650467593
Average Precision: 0.43855589925300476
Confusion Matrix:
 [[7753  228]
 [ 749  309]]


## Class-weight

In [27]:
def make_preprocess_tree(cat_cols, num_cols):
    def clip_only(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        return np.clip(X, lo, hi)

    return ColumnTransformer(
        transformers=[
            ("num",
             Pipeline([
                 ("imp", SimpleImputer(strategy="median")),
                 ("clip", FunctionTransformer(clip_only, feature_names_out="one-to-one")),
             ]),
             num_cols),
            ("cat",
             Pipeline([
                 ("imp", SimpleImputer(strategy="most_frequent")),
                 ("oh", OneHotEncoder(handle_unknown="ignore")),
             ]),
             cat_cols),
        ],
        remainder="drop"
    )


In [40]:
param_grid = [
    {
        "model__n_estimators": [200, 400],
        "model__learning_rate": [0.05, 0.1],
        "model__max_depth": [2, 3],
        "model__subsample": [0.7, 0.9],
        "model__min_samples_leaf": [50, 100]
    }
]


In [42]:
#class_weight:
pre_tree = make_preprocess_tree(categorical_cols, numeric_cols)

clf_gb_cw = Pipeline(steps=[
    ("prep", pre_tree),
    ("model", GradientBoostingClassifier(
        random_state=42,
    ))
])


In [43]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_score = -1
best_params = None

for params in ParameterGrid(param_grid):
    scores = []

    for tr_idx, va_idx in skf.split(X_train, y_train):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        model = clone(clf_gb_cw)
        model.set_params(**params)

        sw_tr = compute_sample_weight("balanced", y_tr)
        model.fit(X_tr, y_tr, model__sample_weight=sw_tr)

        proba = model.predict_proba(X_va)[:, 1]
        scores.append(average_precision_score(y_va, proba))

    mean_score = np.mean(scores)

    if mean_score > best_score:
        best_score = mean_score
        best_params = params


In [44]:
print(f"BEST CV PR-AUC: {best_score:.4f}")
print(f"BEST PARAMS: {best_params}")

BEST CV PR-AUC: 0.4511
BEST PARAMS: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_samples_leaf': 50, 'model__n_estimators': 400, 'model__subsample': 0.9}


In [45]:
best_model = clone(clf_gb_cw)
best_model.set_params(**best_params)

sw_full = compute_sample_weight("balanced", y_train)

best_model.fit(
    X_train,
    y_train,
    model__sample_weight=sw_full
)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function mak...00253990CD9E0>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,400
,subsample,0.9
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


In [47]:
# Predict
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC :", average_precision_score(y_test, y_proba))

Confusion matrix:
 [[6666 1315]
 [ 358  700]]

Classification report:
               precision    recall  f1-score   support

           0     0.9490    0.8352    0.8885      7981
           1     0.3474    0.6616    0.4556      1058

    accuracy                         0.8149      9039
   macro avg     0.6482    0.7484    0.6720      9039
weighted avg     0.8786    0.8149    0.8378      9039

ROC-AUC: 0.8053346925791856
PR-AUC : 0.4561032146157314


**Fine tune mô hình bằng StratifiedKFold và ParamGrid**
- FP = 1.315 khách hàng: Mô hình dự đoán yes nhưng thực tế là no
- FN = 358 khách hàng: Khách có khả năng yes nhưng mô hình bỏ sót
- Với class imbalace: 
    - Recall = 0.66: phát hiện phần lớn khách hàng tiềm năng 
    - Precision = 0.34: trade-off cho một cuộc gọi không thành công & bỏ sót khách tiềm năng

Trong phạm vi đề tài, mô hình phù hợp cho việc hỗ trợ ra quyết định, giúp xếp hạng và ưu tiên danh sách khách hàng cần liên hệ trong chiến dịch telemarketing sau này.

So sánh kết quả của 2 phương pháp xử lý mất cân bằng dữ liệu: **Class-weight** cho ra kết quả tốt hơn.

# Đánh giá Business Performance dựa sau khi đã fine tune model

In [88]:
y_proba

array([0.2524482 , 0.30133478, 0.40234654, ..., 0.42302173, 0.83131034,
       0.11671982], shape=(9039,))

In [89]:
df_rank = pd.DataFrame({
    "p_yes": y_proba,
    "y_true": y_test.values
})

In [90]:
df_rank = df_rank.sort_values("p_yes", ascending=False)
df_rank.head()

Unnamed: 0,p_yes,y_true
981,0.989294,1
3520,0.988273,1
1453,0.987408,0
3969,0.98603,1
5363,0.984065,1


In [95]:
K = 0.15
n_top = int(len(df_rank) * K)

top_k = df_rank.head(n_top)

In [96]:
CR_model = top_k["y_true"].mean()
CR_model

np.float64(0.44206642066420665)

→ Trong **15% khách hàng** được ưu tiên gọi, **44% thực sự đồng ý** (yes).

In [97]:
CR_random = df_rank["y_true"].mean()
lift = CR_model / CR_random
print(f"Lift @ {K*100:.0f}%: {lift:.2f}")

Lift @ 15%: 3.78


So với chiến lược gọi ngẫu nhiên, mô hình giúp **tăng hơn 3.78 lần tỷ lệ chuyển đổi** khi chỉ tập trung vào 30% khách hàng tiềm năng nhất.

Kết quả cho thấy mô hình mang lại hiệu quả rõ rệt. Khi chỉ tập trung vào 15% khách hàng tiềm năng nhất, tỷ lệ chuyển đổi đạt 44%, cao hơn khoảng 4 lần so với tỷ lệ chuyển đổi trung bình của toàn bộ tập dữ liệu (~11–12%).