In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import math
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler, FunctionTransformer, OrdinalEncoder
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTENC
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score,
    classification_report,
    average_precision_score,
    confusion_matrix)
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.model_selection import train_test_split, StratifiedKFold, ParameterGrid, GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.utils.class_weight import compute_sample_weight
from sklearn.base import clone
from scipy.stats import randint, uniform

# Train mô hình dùng toàn bộ dữ liệu

In [11]:
df_train = pd.read_csv('../data/processed/bank_marketing_ml.csv')

In [20]:
df_train.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'campaign', 'pdays', 'previous',
       'poutcome', 'y', 'poutcome_missing', 'target', 'pdays_contacted',
       'has_previous_campaign'],
      dtype='object')

In [None]:
numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

In [15]:
X = df_train[categorical_cols + numeric_cols].copy()
y = df_train['target']

In [14]:
def make_preprocess_tree(cat_cols, num_cols):
    def clip_only(X):
        X = X.astype(float)
        lo = np.nanpercentile(X, 1, axis=0)
        hi = np.nanpercentile(X, 99, axis=0)
        return np.clip(X, lo, hi)

    return ColumnTransformer(
        transformers=[
            ("num",
             Pipeline([
                 ("imp", SimpleImputer(strategy="median")),
                 ("clip", FunctionTransformer(clip_only, feature_names_out="one-to-one")),
             ]),
             num_cols),
            ("cat",
             Pipeline([
                 ("imp", SimpleImputer(strategy="most_frequent")),
                 ("oh", OneHotEncoder(handle_unknown="ignore")),
             ]),
             cat_cols),
        ],
        remainder="drop"
    )


In [17]:
pre_tree = make_preprocess_tree(categorical_cols, numeric_cols)

clf_gb_cw = Pipeline(steps=[
    ("prep", pre_tree),
    ("model", GradientBoostingClassifier(
        random_state=42,
        learning_rate= 0.1,
        max_depth= 3,
        min_samples_leaf=50,
        n_estimators=400,
        subsample=0.9,
    ))
])

In [18]:
sw_full = compute_sample_weight("balanced", y)

clf_gb_cw.fit(
    X,
    y,
    model__sample_weight=sw_full
)

0,1,2
,steps,"[('prep', ...), ('model', ...)]"
,transform_input,
,memory,
,verbose,False

0,1,2
,transformers,"[('num', ...), ('cat', ...)]"
,remainder,'drop'
,sparse_threshold,0.3
,n_jobs,
,transformer_weights,
,verbose,False
,verbose_feature_names_out,True
,force_int_remainder_cols,'deprecated'

0,1,2
,missing_values,
,strategy,'median'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,func,<function mak...001CDB895DF80>
,inverse_func,
,validate,False
,accept_sparse,False
,check_inverse,True
,feature_names_out,'one-to-one'
,kw_args,
,inv_kw_args,

0,1,2
,missing_values,
,strategy,'most_frequent'
,fill_value,
,copy,True
,add_indicator,False
,keep_empty_features,False

0,1,2
,categories,'auto'
,drop,
,sparse_output,True
,dtype,<class 'numpy.float64'>
,handle_unknown,'ignore'
,min_frequency,
,max_categories,
,feature_name_combiner,'concat'

0,1,2
,loss,'log_loss'
,learning_rate,0.1
,n_estimators,400
,subsample,0.9
,criterion,'friedman_mse'
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_depth,3
,min_impurity_decrease,0.0


# Xử lý cho giống input khi train mô hình

In [26]:
df = pd.read_csv("../data/processed/campaign_execution_plan.csv")

In [27]:
df = df.rename(columns={"day_of_week": "day"})

In [28]:
df.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,age_group,balance_group,customer_tier,campaign_action
0,56,retired,married,secondary,no,2624,yes,no,telephone,5,jul,145,3,30,3,success,29–58,very_high,Neutral,High Priority Call
1,69,retired,single,secondary,no,1794,no,no,telephone,12,nov,293,1,30,2,unknown,>58,very_high,Best,Call Immediately
2,46,blue-collar,married,secondary,no,3290,no,no,telephone,5,oct,471,5,60,0,failure,29–58,very_high,Neutral,Delay & Re-nurture
3,32,technician,single,secondary,no,7784,yes,no,telephone,17,oct,577,4,240,2,other,29–58,very_high,Neutral,Review
4,60,admin,married,secondary,no,1404,yes,no,telephone,17,jun,482,5,60,2,other,>58,high,Neutral,Review


In [29]:
# Thêm cột: 'poutcome_missing', 'pdays_contacted', 'has_previous_campaign'
df["poutcome_missing"] = df["poutcome"].isna().astype(int)
df["pdays_contacted"] = (~df["pdays"].isin([-1, 999])).astype(int)
df["has_previous_campaign"] = (df["previous"] > 0).astype(int)

In [30]:
df.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'day', 'month', 'duration', 'campaign', 'pdays',
       'previous', 'poutcome', 'age_group', 'balance_group', 'customer_tier',
       'campaign_action', 'poutcome_missing', 'pdays_contacted',
       'has_previous_campaign'],
      dtype='object')

In [31]:
numeric_cols = ['age','balance','day','campaign','pdays','previous','poutcome_missing','pdays_contacted','has_previous_campaign']
categorical_cols = ['job','marital','education','default','housing','loan','contact','month','poutcome']

In [None]:
X_plan = df[categorical_cols + numeric_cols].copy()

In [35]:
y_pred = clf_gb_cw.predict(X_plan)

df["y_pred"] = y_pred

In [38]:
df.to_csv("../data/processed/execution_plan_validation.csv", index=False)

In [36]:
df[['campaign_action','y_pred']].head()

Unnamed: 0,campaign_action,y_pred
0,High Priority Call,1
1,Call Immediately,1
2,Delay & Re-nurture,1
3,Review,1
4,Review,1


In [37]:
df.campaign_action.unique()

array(['High Priority Call', 'Call Immediately', 'Delay & Re-nurture',
       'Review', 'Nurture Campaign'], dtype=object)

In [44]:
print(df.campaign_action.value_counts())
print('\n',df.y_pred.value_counts())

campaign_action
Review                77
High Priority Call    57
Delay & Re-nurture    41
Call Immediately      15
Nurture Campaign      10
Name: count, dtype: int64

 y_pred
1    179
0     21
Name: count, dtype: int64


In [41]:
df.groupby(['campaign_action','y_pred']).size().unstack(fill_value=0)

y_pred,0,1
campaign_action,Unnamed: 1_level_1,Unnamed: 2_level_1
Call Immediately,0,15
Delay & Re-nurture,10,31
High Priority Call,0,57
Nurture Campaign,1,9
Review,10,67


Mô hình dự đoán khách không tiềm năng có sự tương đồng với khuyến nghị triển khai được rút ra từ tri thức. Khách mà mô hình dự đoán là **không đăng ký** đều thuộc về nhóm **"Delay & Nurture", "Review"**, điều này càng khẳng định ngân hàng chưa nên triển khai chiến dịch ở nhóm khách này nếu muốn tiết kiệm ngân sách hay tăng tỉ lệ chuyển đổi nhanh chóng.

In [50]:
summary = (
    df.groupby(["campaign_action", "y_pred"])
      .size()
      .unstack(fill_value=0)
      .rename(columns={0: "Pred_0", 1: "Pred_1"})
)

summary["Total"] = summary["Pred_0"] + summary["Pred_1"]
summary["Pct_Pred_1"] = (summary["Pred_1"] / summary["Total"] * 100).round(1)

summary = summary.reset_index()
summary.columns.name = None
summary

Unnamed: 0,campaign_action,Pred_0,Pred_1,Total,Pct_Pred_1
0,Call Immediately,0,15,15,100.0
1,Delay & Re-nurture,10,31,41,75.6
2,High Priority Call,0,57,57,100.0
3,Nurture Campaign,1,9,10,90.0
4,Review,10,67,77,87.0
