In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("../data/processed/bank_marketing_ml.csv")

SMOTE - IMBALANCE - PROCESSING

In [3]:
numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

skew_cols = ["balance", "campaign", "pdays", "previous"]

In [4]:
from sklearn.model_selection import train_test_split


X = df[categorical_cols + numeric_cols].copy()
y = df['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [5]:
#classweight
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import FunctionTransformer, Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler


def make_preprocess_scaled(cat_cols, num_cols, skew_cols):
    def clip_and_log(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        X = np.clip(X, lo, hi)
        return np.log1p(np.maximum(X, 0))  # dành cho biến không âm

    def signed_log1p(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        X = np.clip(X, lo, hi)
        return np.sign(X) * np.log1p(np.abs(X))

    skew_signed = [c for c in skew_cols if c == "balance" and c in num_cols]
    skew_pos = [c for c in skew_cols if c != "balance" and c in num_cols]

    transformers = []

    if skew_pos:
        transformers.append((
            "num_skew_pos",
            Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("log", FunctionTransformer(clip_and_log, feature_names_out="one-to-one")),
                ("sc", RobustScaler()),
            ]),
            skew_pos
        ))

    if skew_signed:
        transformers.append((
            "num_skew_signed",
            Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("log", FunctionTransformer(signed_log1p, feature_names_out="one-to-one")),
                ("sc", RobustScaler()),
            ]),
            skew_signed
        ))

    rest_num = [c for c in num_cols if c not in set(skew_pos + skew_signed)]
    if rest_num:
        transformers.append((
            "num_rest",
            Pipeline([
                ("imp", SimpleImputer(strategy="median")),
                ("sc", RobustScaler()),
            ]),
            rest_num
        ))

    transformers.append((
        "cat",
        Pipeline([
            ("imp", SimpleImputer(strategy="most_frequent")),
            ("oh", OneHotEncoder(handle_unknown="ignore")),
        ]),
        cat_cols
    ))

    return ColumnTransformer(transformers=transformers, remainder="drop")


In [6]:
import mlflow
import os
import mlflow
from dotenv import load_dotenv

load_dotenv()
tracking_uri = os.getenv("MLFLOW_TRACKING_URI")
if not tracking_uri:
    raise ValueError("Không tìm thấy biến môi trường 'MLFLOW_TRACKING_URI'. Vui lòng kiểm tra lại cấu hình.")
print(f"-> Đang kết nối tới MLflow tại: {tracking_uri}")


-> Đang kết nối tới MLflow tại: https://mlflow.thonph.site/


In [7]:
import numpy as np

from sklearn.svm import LinearSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import average_precision_score, roc_auc_score

pre_svm = make_preprocess_scaled(categorical_cols, numeric_cols, skew_cols)  # KHuyên dùng cho SVM

base_svm = LinearSVC(max_iter=20000, random_state=42)


svm = CalibratedClassifierCV(estimator=base_svm, method="sigmoid", cv=3)

pipe = Pipeline(steps=[
    ("prep", pre_svm),
    ("model", svm),
])

# 4) grid params (tối ưu cho mất cân bằng lớp)
param_grid = {
    "model__estimator__C": [0.01, 0.1, 1, 3, 10],
    "model__estimator__class_weight": [None, "balanced"],  # tương đương ý tưởng class_weight trong RF
    "model__estimator__loss": ["hinge", "squared_hinge"],
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

gs = GridSearchCV(
    pipe,
    param_grid=param_grid,
    scoring="average_precision",   # PR-AUC (hợp lệch lớp)
    refit=True,
    cv=cv,
    n_jobs=-1,
    verbose=1,
)

gs.fit(X_train, y_train)

print("Best params:", gs.best_params_)
print("Best CV PR-AUC:", gs.best_score_)

best_model = gs.best_estimator_

proba = best_model.predict_proba(X_test)[:, 1]
print("SVM(best) | TEST PR-AUC:", average_precision_score(y_test, proba))
print("SVM(best) | TEST ROC-AUC:", roc_auc_score(y_test, proba))


Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best params: {'model__estimator__C': 0.01, 'model__estimator__class_weight': None, 'model__estimator__loss': 'squared_hinge'}
Best CV PR-AUC: 0.4053799933819756
SVM(best) | TEST PR-AUC: 0.4100377097547836
SVM(best) | TEST ROC-AUC: 0.7722198918082621


In [8]:
"""import json
from sklearn.metrics import (
    average_precision_score, roc_auc_score,
    precision_score, recall_score, f1_score,
    confusion_matrix, classification_report, accuracy_score
)

BEST_PARAMS = {
    "C": 0.01,
    "class_weight": None,
    "loss": "squared_hinge",
}


pre_svm = make_preprocess_scaled(categorical_cols, numeric_cols, skew_cols)  

base_svm = LinearSVC(
    C=BEST_PARAMS["C"],
    class_weight=BEST_PARAMS["class_weight"],
    loss=BEST_PARAMS["loss"],
    max_iter=20000,
    random_state=42,
)

svm_cal = CalibratedClassifierCV(
    estimator=base_svm,
    method="sigmoid",
    cv=3
)

best_pipe = Pipeline(steps=[
    ("prep", pre_svm),
    ("model", svm_cal),
])


mlflow.set_experiment("ModelComparing")  

with mlflow.start_run(run_name="svm_linear"):
    mlflow.log_param("model_family", "LinearSVC + CalibratedClassifierCV")
    mlflow.log_param("C", BEST_PARAMS["C"])
    mlflow.log_param("class_weight", str(BEST_PARAMS["class_weight"]))
    mlflow.log_param("loss", BEST_PARAMS["loss"])
    mlflow.log_param("calibration", "sigmoid_cv3")

    # fit
    best_pipe.fit(X_train, y_train)

    # predict proba + metrics threshold-free
    proba = best_pipe.predict_proba(X_test)[:, 1]
    pr_auc = float(average_precision_score(y_test, proba))
    roc_auc = float(roc_auc_score(y_test, proba))

    mlflow.log_metric("pr_auc", pr_auc)
    mlflow.log_metric("roc_auc", roc_auc)

    t0 = 0.5
    y_pred = (proba >= t0).astype(int)

    prec = float(precision_score(y_test, y_pred, zero_division=0))
    rec  = float(recall_score(y_test, y_pred, zero_division=0))
    f1   = float(f1_score(y_test, y_pred, zero_division=0))
    acc = float(accuracy_score(y_test, y_pred))

    mlflow.log_metric("precision", prec)
    mlflow.log_metric("recall", rec)
    mlflow.log_metric("f1", f1)
    mlflow.log_metric("accuracy", acc)


    cm = confusion_matrix(y_test, y_pred)
    report_txt = classification_report(y_test, y_pred, digits=3)

    os.makedirs("mlflow_artifacts_best", exist_ok=True)

    cm_path = "mlflow_artifacts_best/confusion_matrix_t0.5.json"
    with open(cm_path, "w", encoding="utf-8") as f:
        json.dump(cm.tolist(), f, ensure_ascii=False, indent=2)

    rep_path = "mlflow_artifacts_best/classification_report_t0.5.txt"
    with open(rep_path, "w", encoding="utf-8") as f:
        f.write(report_txt)

    mlflow.log_artifact(cm_path)
    mlflow.log_artifact(rep_path)


    # thresholds = np.linspace(0.05, 0.95, 19)
    # for t in thresholds:
    #     yp = (proba >= t).astype(int)
    #     step = int(round(t * 1000))  # để MLflow vẽ line theo step
    #     mlflow.log_metric("curve_precision", float(precision_score(y_test, yp, zero_division=0)), step=step)
    #     mlflow.log_metric("curve_recall", float(recall_score(y_test, yp, zero_division=0)), step=step)
    #     mlflow.log_metric("curve_f1", float(f1_score(y_test, yp, zero_division=0)), step=step)

    print("Logged to MLflow:", {"test_pr_auc": pr_auc, "test_roc_auc": roc_auc})
"""

'import json\nfrom sklearn.metrics import (\n    average_precision_score, roc_auc_score,\n    precision_score, recall_score, f1_score,\n    confusion_matrix, classification_report, accuracy_score\n)\n\nBEST_PARAMS = {\n    "C": 0.01,\n    "class_weight": None,\n    "loss": "squared_hinge",\n}\n\n\npre_svm = make_preprocess_scaled(categorical_cols, numeric_cols, skew_cols)  \n\nbase_svm = LinearSVC(\n    C=BEST_PARAMS["C"],\n    class_weight=BEST_PARAMS["class_weight"],\n    loss=BEST_PARAMS["loss"],\n    max_iter=20000,\n    random_state=42,\n)\n\nsvm_cal = CalibratedClassifierCV(\n    estimator=base_svm,\n    method="sigmoid",\n    cv=3\n)\n\nbest_pipe = Pipeline(steps=[\n    ("prep", pre_svm),\n    ("model", svm_cal),\n])\n\n\nmlflow.set_experiment("ModelComparing")  \n\nwith mlflow.start_run(run_name="svm_linear"):\n    mlflow.log_param("model_family", "LinearSVC + CalibratedClassifierCV")\n    mlflow.log_param("C", BEST_PARAMS["C"])\n    mlflow.log_param("class_weight", str(BEST_PA

In [9]:
proba


array([0.05147711, 0.08719906, 0.08629645, ..., 0.09093424, 0.3569419 ,
       0.06281367])

In [10]:
df_rank = pd.DataFrame({
    "p_yes": proba,
    "y_true": y_test.values
})

In [11]:
df_rank = df_rank.sort_values("p_yes", ascending=False)
df_rank.head()

Unnamed: 0,p_yes,y_true
6524,0.945151,0
3204,0.945054,1
1466,0.944728,1
7387,0.944465,1
7401,0.942397,0


In [12]:
K = 0.15
n_top = int(len(df_rank) * K)

top_k = df_rank.head(n_top)

In [13]:
CR_model = top_k["y_true"].mean()
CR_model

0.3933579335793358

Trong 15% khách hàng được ưu tiên gọi, có 39% khách hàng thực sự đồng ý

In [14]:
CR_random = df_rank["y_true"].mean()
lift = CR_model / CR_random
print(f"Lift @ {K*100:.0f}%: {lift:.2f}")

Lift @ 15%: 3.36


--> So với việc gọi ngẫu nhiên, mô hình giúp tăng 3.36 lần tỷ lệ chuyển đổi