In [13]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    average_precision_score,
    confusion_matrix)
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.base import clone
from scipy.stats import randint, uniform

In [14]:
df = pd.read_csv('../data/processed/bank_marketing_ml.csv')

In [15]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,poutcome_missing,target,pdays_contacted,has_previous_campaign
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,1,-1,0,no_previous_campaign,no,1,0,0,0


In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45195 entries, 0 to 45194
Data columns (total 20 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   age                    45195 non-null  int64 
 1   job                    45195 non-null  object
 2   marital                45195 non-null  object
 3   education              45195 non-null  object
 4   default                45195 non-null  object
 5   balance                45195 non-null  int64 
 6   housing                45195 non-null  object
 7   loan                   45195 non-null  object
 8   contact                45195 non-null  object
 9   day                    45195 non-null  int64 
 10  month                  45195 non-null  object
 11  campaign               45195 non-null  int64 
 12  pdays                  45195 non-null  int64 
 13  previous               45195 non-null  int64 
 14  poutcome               45195 non-null  object
 15  y                  

In [17]:
df.target.value_counts()

target
0    39906
1     5289
Name: count, dtype: int64

In [18]:
numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

In [19]:
X = df[categorical_cols + numeric_cols].copy()
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [20]:
y_train.unique()

array([0, 1])

# Đánh giá Model Performance dựa trên 2 phương pháp xử lý class imbalance

## SMOTE

In [21]:
#Hàm preprocessing khi dùng smote
def build_tree_smote_components(cat_cols, num_cols, *, k_neighbors=5, random_state=42):
    def clip_only(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        return np.clip(X, lo, hi)

    # PRE-SMOTE: cat -> ordinal (kỹ thuật), num -> impute + clip
    pre_smote = ColumnTransformer(
        transformers=[
            ("cat_ord", Pipeline([
                ("imp", SimpleImputer(strategy="most_frequent")),
                ("ord", OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
            ]), cat_cols),
            ("num", Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("clip", FunctionTransformer(clip_only, feature_names_out="one-to-one")),
            ]), num_cols),
        ],
        remainder="drop"
    )

    # cat indices là [0..len(cat_cols)-1] vì cat đặt trước num
    cat_idx = list(range(len(cat_cols)))

    smote = SMOTENC(
        categorical_features=cat_idx,
        k_neighbors=k_neighbors,
        random_state=random_state
    )

    # POST-SMOTE: onehot cat + passthrough num
    post = ColumnTransformer(
        transformers=[
            ("cat_oh", OneHotEncoder(handle_unknown="ignore"), cat_idx),
            ("num_passthrough", "passthrough", slice(len(cat_cols), None)),
        ],
        remainder="drop"
    )

    return pre_smote, smote, post


In [22]:
pre_smote, smote, post = build_tree_smote_components(
    categorical_cols, numeric_cols, k_neighbors=5, random_state=42
)

clf_gb_smote = ImbPipeline(steps=[
    ("pre_smote", pre_smote),
    ("smote", smote),
    ("post", post),
    ("model", GradientBoostingClassifier(
        random_state=42,
    ))
])

In [23]:
param_dist = {
    "smote__k_neighbors": [3, 5, 7],
    "model__n_estimators": randint(150, 501),      # 150..500
    "model__learning_rate": uniform(0.03, 0.12),   # 0.03..0.15
    "model__max_depth": [2, 3],
    "model__subsample": uniform(0.6, 0.4),         # 0.6..1.0
    "model__min_samples_leaf": [50, 100],
}

In [24]:
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

search = RandomizedSearchCV(
    estimator=clf_gb_smote,
    param_distributions=param_dist,
    n_iter=25,                 # 25 combo -> 25*5=125 fits (nhanh hơn ~4x)
    scoring="average_precision",
    cv=cv,
    n_jobs=-1,
    verbose=2,
    random_state=42,
    refit=True,
)

print(search.best_score_, search.best_params_)

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_score_'

In [None]:
best_model_smote = search.best_estimator_

y_pred_smote = best_model_smote.predict(X_test)
y_proba_smote = best_model_smote.predict_proba(X_test)[:, 1]
print(classification_report(y_test, y_pred_smote))
print("ROC AUC:", roc_auc_score(y_test, y_proba_smote))
print("Average Precision:", average_precision_score(y_test, y_proba_smote))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_smote))

              precision    recall  f1-score   support

           0       0.91      0.97      0.94      7981
           1       0.58      0.29      0.39      1058

    accuracy                           0.89      9039
   macro avg       0.74      0.63      0.66      9039
weighted avg       0.87      0.89      0.88      9039

ROC AUC: 0.7963691650467593
Average Precision: 0.43855589925300476
Confusion Matrix:
 [[7753  228]
 [ 749  309]]


## Class-weight

In [35]:
def make_preprocess_tree(cat_cols, num_cols):
    def clip_only(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        return np.clip(X, lo, hi)

    return ColumnTransformer(
        transformers=[
            ("num",
             Pipeline([
                 ("imp", SimpleImputer(strategy="median")),
                 ("clip", FunctionTransformer(clip_only, feature_names_out="one-to-one")),
             ]),
             num_cols),
            ("cat",
             Pipeline([
                 ("imp", SimpleImputer(strategy="most_frequent")),
                 ("oh", OneHotEncoder(handle_unknown="ignore")),
             ]),
             cat_cols),
        ],
        remainder="drop"
    )


In [36]:
param_grid = [
    {
        "model__n_estimators": [200, 400],
        "model__learning_rate": [0.05, 0.1],
        "model__max_depth": [2, 3],
        "model__subsample": [0.7, 0.9],
        "model__min_samples_leaf": [50, 100]
    }
]


In [27]:
#class_weight:
pre_tree = make_preprocess_tree(categorical_cols, numeric_cols)

clf_gb_cw = Pipeline(steps=[
    ("prep", pre_tree),
    ("model", GradientBoostingClassifier(
        random_state=42,
    ))
])


In [28]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

best_score = -1
best_params = None

for params in ParameterGrid(param_grid):
    scores = []

    for tr_idx, va_idx in skf.split(X_train, y_train):
        X_tr, X_va = X_train.iloc[tr_idx], X_train.iloc[va_idx]
        y_tr, y_va = y_train.iloc[tr_idx], y_train.iloc[va_idx]

        model = clone(clf_gb_cw)
        model.set_params(**params)

        sw_tr = compute_sample_weight("balanced", y_tr)
        model.fit(X_tr, y_tr, model__sample_weight=sw_tr)

        proba = model.predict_proba(X_va)[:, 1]
        scores.append(average_precision_score(y_va, proba))

    mean_score = np.mean(scores)

    if mean_score > best_score:
        best_score = mean_score
        best_params = params


KeyboardInterrupt: 

In [None]:
print(f"BEST CV PR-AUC: {best_score:.4f}")
print(f"BEST PARAMS: {best_params}")

BEST CV PR-AUC: 0.4511
BEST PARAMS: {'model__learning_rate': 0.1, 'model__max_depth': 3, 'model__min_samples_leaf': 50, 'model__n_estimators': 400, 'model__subsample': 0.9}


In [31]:
best_params = {
    "learning_rate": 0.1,
    "max_depth": 3,
    "min_samples_leaf": 50,
    "n_estimators": 400,
    "subsample": 0.9,
}


In [32]:
best_model = clone(clf_gb_cw)
best_model.set_params(**best_params)

sw_full = compute_sample_weight("balanced", y_train)

best_model.fit(
    X_train,
    y_train,
    model__sample_weight=sw_full
)

ValueError: Invalid parameter 'learning_rate' for estimator Pipeline(steps=[('prep',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('clip',
                                                                   FunctionTransformer(feature_names_out='one-to-one',
                                                                                       func=<function make_preprocess_tree.<locals>.clip_only at 0x17b4f1f30>))]),
                                                  ['age', 'balance', 'day',
                                                   'campaign', 'pdays',
                                                   'previous',
                                                   'poutcome_missing',
                                                   'pdays_contacted',
                                                   'has_previous_campaign']),
                                                 ('cat',
                                                  Pipeline(steps=[('imp',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('oh',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['job', 'marital',
                                                   'education', 'default',
                                                   'housing', 'loan', 'contact',
                                                   'month', 'poutcome'])])),
                ('model', GradientBoostingClassifier(random_state=42))]). Valid parameters are: ['memory', 'steps', 'transform_input', 'verbose'].

In [None]:
# Predict
y_pred = best_model.predict(X_test)
y_proba = best_model.predict_proba(X_test)[:, 1]

print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification report:\n", classification_report(y_test, y_pred, digits=4))

print("ROC-AUC:", roc_auc_score(y_test, y_proba))
print("PR-AUC :", average_precision_score(y_test, y_proba))

Confusion matrix:
 [[6666 1315]
 [ 358  700]]

Classification report:
               precision    recall  f1-score   support

           0     0.9490    0.8352    0.8885      7981
           1     0.3474    0.6616    0.4556      1058

    accuracy                         0.8149      9039
   macro avg     0.6482    0.7484    0.6720      9039
weighted avg     0.8786    0.8149    0.8378      9039

ROC-AUC: 0.8053346925791856
PR-AUC : 0.4561032146157314


**Fine tune mô hình bằng StratifiedKFold và ParamGrid**
- FP = 1.315 khách hàng: Mô hình dự đoán yes nhưng thực tế là no
- FN = 358 khách hàng: Khách có khả năng yes nhưng mô hình bỏ sót
- Với class imbalace: 
    - Recall = 0.66: phát hiện phần lớn khách hàng tiềm năng 
    - Precision = 0.34: trade-off cho một cuộc gọi không thành công & bỏ sót khách tiềm năng

Trong phạm vi đề tài, mô hình phù hợp cho việc hỗ trợ ra quyết định, giúp xếp hạng và ưu tiên danh sách khách hàng cần liên hệ trong chiến dịch telemarketing sau này.

So sánh kết quả của 2 phương pháp xử lý mất cân bằng dữ liệu: **Class-weight** cho ra kết quả tốt hơn.

# Đánh giá Business Performance dựa sau khi đã fine tune model

In [None]:
y_proba

array([0.2524482 , 0.30133478, 0.40234654, ..., 0.42302173, 0.83131034,
       0.11671982], shape=(9039,))

In [None]:
df_rank = pd.DataFrame({
    "p_yes": y_proba,
    "y_true": y_test.values
})

In [None]:
df_rank = df_rank.sort_values("p_yes", ascending=False)
df_rank.head()

Unnamed: 0,p_yes,y_true
981,0.989294,1
3520,0.988273,1
1453,0.987408,0
3969,0.98603,1
5363,0.984065,1


In [None]:
K = 0.15
n_top = int(len(df_rank) * K)

top_k = df_rank.head(n_top)

In [None]:
CR_model = top_k["y_true"].mean()
CR_model

np.float64(0.44206642066420665)

→ Trong **15% khách hàng** được ưu tiên gọi, **44% thực sự đồng ý** (yes).

In [None]:
CR_random = df_rank["y_true"].mean()
lift = CR_model / CR_random
print(f"Lift @ {K*100:.0f}%: {lift:.2f}")

Lift @ 15%: 3.78


So với chiến lược gọi ngẫu nhiên, mô hình giúp **tăng hơn 3.78 lần tỷ lệ chuyển đổi** khi chỉ tập trung vào 30% khách hàng tiềm năng nhất.

Kết quả cho thấy mô hình mang lại hiệu quả rõ rệt. Khi chỉ tập trung vào 15% khách hàng tiềm năng nhất, tỷ lệ chuyển đổi đạt 44%, cao hơn khoảng 4 lần so với tỷ lệ chuyển đổi trung bình của toàn bộ tập dữ liệu (~11–12%).

In [33]:
import numpy as np
from sklearn.utils.class_weight import compute_class_weight

def make_sample_weight(y):
    classes = np.unique(y)
    cw = compute_class_weight(
        class_weight="balanced",
        classes=classes,
        y=y
    )
    class_weight_dict = dict(zip(classes, cw))
    return np.array([class_weight_dict[yi] for yi in y])


In [39]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score
import mlflow
import mlflow.sklearn

BEST_PARAMS = {
    "learning_rate": 0.1,
    "max_depth": 3,
    "min_samples_leaf": 50,
    "n_estimators": 400,
    "subsample": 0.9,
}

numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

gb_clf = Pipeline(steps=[
    ("prep", make_preprocess_tree(categorical_cols, numeric_cols)),
    ("model", GradientBoostingClassifier(
        learning_rate=BEST_PARAMS["learning_rate"],
        max_depth=BEST_PARAMS["max_depth"],
        min_samples_leaf=BEST_PARAMS["min_samples_leaf"],
        n_estimators=BEST_PARAMS["n_estimators"],
        subsample=BEST_PARAMS["subsample"],
        random_state=42
    ))
])


sample_weight = make_sample_weight(y_train)

gb_clf.fit(
    X_train,
    y_train,
    model__sample_weight=sample_weight
)

y_proba = gb_clf.predict_proba(X_test)[:, 1]
pr_auc = average_precision_score(y_test, y_proba)

print("Gradient Boosting | TEST PR-AUC:", pr_auc)


Gradient Boosting | TEST PR-AUC: 0.4561032146157314


In [40]:
import numpy as np
import pandas as pd
from sklearn.metrics import (
    confusion_matrix,
    classification_report,
    precision_recall_curve,
    average_precision_score,
)

# ====== Predict & PR-AUC ======
y_proba = gb_clf.predict_proba(X_test)[:, 1]
pr_auc = average_precision_score(y_test, y_proba)
print("Gradient Boosting | TEST PR-AUC:", pr_auc)

# ====== Choose threshold (default 0.5 + option theo call_budget) ======
thr_default = 0.5

# Nếu bạn có call_budget (vd 10%), chọn ngưỡng để chỉ gọi top K%
call_budget = 0.10  # chỉnh theo bài của bạn
k = int(np.ceil(call_budget * len(y_proba)))
thr_budget = np.sort(y_proba)[-k] if k > 0 else 1.0

print(f"Threshold default: {thr_default:.3f}")
print(f"Threshold by call_budget={call_budget:.0%}: {thr_budget:.6f} (top {k} samples)")

def eval_at_threshold(thr: float, name: str):
    y_pred = (y_proba >= thr).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    print(f"\n=== {name} | thr={thr:.6f} ===")
    print("Confusion matrix [ [TN FP], [FN TP] ]:\n", cm)
    print(classification_report(y_test, y_pred, digits=4))
    return y_pred, cm

y_pred_05, cm_05 = eval_at_threshold(thr_default, "Default 0.5")
y_pred_b, cm_b = eval_at_threshold(thr_budget, f"Call budget {call_budget:.0%}")

# ====== Build error table to inspect bad cases ======
err = X_test.copy()
err = err.reset_index(drop=True)
err["y_true"] = np.array(y_test)
err["y_proba"] = y_proba
err["y_pred_05"] = y_pred_05
err["is_error_05"] = (err["y_true"] != err["y_pred_05"]).astype(int)

fn = err[(err["y_true"] == 1) & (err["y_pred_05"] == 0)].copy()
fp = err[(err["y_true"] == 0) & (err["y_pred_05"] == 1)].copy()

print("\n#Errors at thr=0.5")
print("FN count:", len(fn), "| FP count:", len(fp), "| Total:", len(err))


fn_hard = fn.sort_values("y_proba", ascending=True).head(15)
# FP “nặng”: model rất tự tin là 1 (proba cao) nhưng thực tế là 0
fp_hard = fp.sort_values("y_proba", ascending=False).head(15)

cols_show = (numeric_cols + categorical_cols)
cols_show = [c for c in cols_show if c in err.columns]  # an toàn nếu thiếu cột engineered

print("\n=== Hard False Negatives (missed conversions) ===")
print(fn_hard[cols_show + ["y_true", "y_pred_05", "y_proba"]].to_string(index=False))

print("\n=== Hard False Positives (wasted calls) ===")
print(fp_hard[cols_show + ["y_true", "y_pred_05", "y_proba"]].to_string(index=False))
def error_by(col):
    if col not in err.columns:
        return None
    g = (err.groupby(col)["is_error_05"]
         .agg(err_rate="mean", n="size")
         .sort_values(["err_rate", "n"], ascending=[False, False]))
    return g

group_cols = ["job", "month", "contact", "poutcome", "education", "marital", "housing", "loan", "default"]
for c in group_cols:
    g = error_by(c)
    if g is None:
        continue
    g2 = g[g["n"] >= 50].head(10)
    print(f"\n=== Top error groups by {c} (n>=50) ===")
    print(g2.to_string())


prec, rec, thr = precision_recall_curve(y_test, y_proba)
for t in [0.2, 0.3, 0.4, 0.5]:
    yp = (y_proba >= t).astype(int)
    cm = confusion_matrix(y_test, yp)
    tn, fp_, fn_, tp = cm.ravel()
    p = tp / (tp + fp_ + 1e-12)
    r = tp / (tp + fn_ + 1e-12)
    print(f"\nthr={t:.2f} | precision={p:.4f} | recall={r:.4f} | FP={fp_} | FN={fn_} | TP={tp}")


Gradient Boosting | TEST PR-AUC: 0.4561032146157314
Threshold default: 0.500
Threshold by call_budget=10%: 0.738337 (top 904 samples)

=== Default 0.5 | thr=0.500000 ===
Confusion matrix [ [TN FP], [FN TP] ]:
 [[6666 1315]
 [ 358  700]]
              precision    recall  f1-score   support

           0     0.9490    0.8352    0.8885      7981
           1     0.3474    0.6616    0.4556      1058

    accuracy                         0.8149      9039
   macro avg     0.6482    0.7484    0.6720      9039
weighted avg     0.8786    0.8149    0.8378      9039


=== Call budget 10% | thr=0.738337 ===
Confusion matrix [ [TN FP], [FN TP] ]:
 [[7542  439]
 [ 593  465]]
              precision    recall  f1-score   support

           0     0.9271    0.9450    0.9360      7981
           1     0.5144    0.4395    0.4740      1058

    accuracy                         0.8858      9039
   macro avg     0.7207    0.6923    0.7050      9039
weighted avg     0.8788    0.8858    0.8819      9039


#