Import All Packages / Libraries Used

In [None]:
# LIB
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
import plotly.io as pio
import joblib
import json
import datetime as dt

from sklearn.metrics import roc_curve
from optuna.samplers import TPESampler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import precision_recall_curve, classification_report
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from imblearn.combine import SMOTETomek
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score,
    f1_score, roc_auc_score, confusion_matrix
)

# CatBoost
from catboost import CatBoostClassifier

#Parameter Tuning
import keras_tuner as kt



In [None]:
# Read the raw dataset; the file separates fields with ';' rather than ','.
df = pd.read_csv("data/bank-full.csv", delimiter=";")

Shape: (45211, 17)


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,unknown,5,may,261,1,-1,0,unknown,no
1,44,technician,single,secondary,no,29,yes,no,unknown,5,may,151,1,-1,0,unknown,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,unknown,5,may,76,1,-1,0,unknown,no
3,47,blue-collar,married,unknown,no,1506,yes,no,unknown,5,may,92,1,-1,0,unknown,no
4,33,unknown,single,unknown,no,1,no,no,unknown,5,may,198,1,-1,0,unknown,no


Data Preprocessing

In [None]:
# Remove the 'duration' column and convert the literal string 'unknown'
# into NaN so it is treated as a missing value from here on.
# NOTE(review): 'duration' is presumably dropped because it is not known
# before a call is made - confirm the intent.
df = df.drop(columns=['duration']).replace('unknown', np.nan)

# Encode the yes/no columns (the target 'y' included) as 1/0.
binary_cols = ['default', 'housing', 'loan', 'y']
df = df.assign(
    **{name: df[name].map({'yes': 1, 'no': 0}) for name in binary_cols}
)

# Store 'month' with the pandas categorical dtype.
df = df.astype({'month': 'category'})

# Names of the categorical columns of the cleaned frame.
cat_features = ['job', 'marital', 'education', 'contact', 'month', 'poutcome']

In [None]:
# Target balance: absolute counts first, then the share of each class in percent.
print("Class counts:", df['y'].value_counts(), sep="\n")

print(
    "\nClass percentage (%):",
    df['y'].value_counts(normalize=True).mul(100).round(2),
    sep="\n",
)

Class counts:
y
0    39922
1     5289
Name: count, dtype: int64

Class percentage (%):
y
0    88.3
1    11.7
Name: proportion, dtype: float64


In [None]:
# Build all engineered features on a copy so the cleaned frame `df` is left
# untouched (later cells copy `df` again for the recommendation table).
df_fe = df.copy()

# --- Squared terms ---------------------------------------------------------
df_fe['balance_squared'] = df_fe['balance'] ** 2
df_fe['age_squared'] = df_fe['age'] ** 2

# --- Balance / age ratio and product ---------------------------------------
df_fe['balance_per_age'] = df_fe['balance'] / (df_fe['age'] + 1)
df_fe['balance_age'] = df_fe['balance'] * df_fe['age']

# --- Contact history -------------------------------------------------------
# This code treats pdays == -1 as "not contacted before" and substitutes
# 365 days for it. The substitution is vectorised instead of a per-row lambda.
df_fe['contacted_before'] = (df_fe['pdays'] != -1).astype(int)
df_fe['days_since_contact'] = df_fe['pdays'].replace(-1, 365)
df_fe['contact_frequency'] = df_fe['previous'] / (df_fe['days_since_contact'] + 1)
df_fe['contact_intensity'] = df_fe['campaign'] / (df_fe['days_since_contact'] + 1)

# --- Campaign flags --------------------------------------------------------
df_fe['frequent_campaign'] = (df_fe['campaign'] > 3).astype(int)
df_fe['high_campaign'] = (df_fe['campaign'] > 5).astype(int)
df_fe['previous_contact'] = (df_fe['previous'] > 0).astype(int)
df_fe['campaign_per_previous'] = df_fe['campaign'] / (df_fe['previous'] + 1)

# --- Binned versions of age and campaign -----------------------------------
# The top bins are open-ended (np.inf): with a closed upper edge of 100 any
# larger value would fall outside every bin and silently become NaN.
df_fe['age_group'] = pd.cut(
    df_fe['age'],
    bins=[0, 30, 40, 50, 60, np.inf],
    labels=['young', 'middle_young', 'middle', 'senior', 'elderly']
)

df_fe['campaign_group'] = pd.cut(
    df_fe['campaign'],
    bins=[0, 1, 3, 5, np.inf],
    labels=['first', 'low', 'medium', 'high']
)

# --- Balance flags and buckets ---------------------------------------------
df_fe['has_positive_balance'] = (df_fe['balance'] > 0).astype(int)
df_fe['has_debt'] = (df_fe['balance'] < 0).astype(int)
# NOTE(review): this 75th-percentile cut-off is computed on the full dataset,
# i.e. before the train/validation/test split performed in a later cell.
df_fe['high_balance'] = (df_fe['balance'] > df_fe['balance'].quantile(0.75)).astype(int)
df_fe['balance_category'] = pd.cut(
    df_fe['balance'],
    bins=[-np.inf, 0, 500, 2000, np.inf],
    labels=['negative', 'low', 'medium', 'high']
)

# --- Loan indicators (housing and loan are 0/1 at this point) ---------------
df_fe['total_loans'] = df_fe['housing'] + df_fe['loan']
df_fe['any_loan'] = ((df_fe['housing'] == 1) | (df_fe['loan'] == 1)).astype(int)
df_fe['no_loans'] = ((df_fe['housing'] == 0) & (df_fe['loan'] == 0)).astype(int)

# --- Month flag --------------------------------------------------------------
# NOTE(review): the month list is hard-coded; how it was chosen is not shown
# in this notebook - confirm it was not derived from the test data.
high_success_months = ['mar', 'sep', 'oct', 'dec']
df_fe['high_success_month'] = df_fe['month'].isin(high_success_months).astype(int)

# --- Age-based flags ---------------------------------------------------------
df_fe['young_professional'] = ((df_fe['age'] >= 25) & (df_fe['age'] <= 40)).astype(int)
df_fe['retirement_age'] = (df_fe['age'] >= 60).astype(int)

# --- Interaction terms -------------------------------------------------------
df_fe['balance_x_contacted'] = df_fe['balance'] * df_fe['contacted_before']
df_fe['age_x_balance_pos'] = df_fe['age'] * df_fe['has_positive_balance']

print("Feature engineering complete")
print(f"Total features: {len(df_fe.columns)}")
print(df_fe.columns.tolist())

Feature engineering complete
Total features: 42
['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'pdays', 'previous', 'poutcome', 'y', 'balance_squared', 'age_squared', 'balance_per_age', 'balance_age', 'contacted_before', 'days_since_contact', 'contact_frequency', 'contact_intensity', 'frequent_campaign', 'high_campaign', 'previous_contact', 'campaign_per_previous', 'age_group', 'campaign_group', 'has_positive_balance', 'has_debt', 'high_balance', 'balance_category', 'total_loans', 'any_loan', 'no_loans', 'high_success_month', 'young_professional', 'retirement_age', 'balance_x_contacted', 'age_x_balance_pos']


In [None]:
# Separate the predictors from the target column 'y'.
X = df_fe.drop(columns=["y"])
y = df_fe["y"]

# Stage 1: hold out 20% of all rows as the test set, stratified on the target.
X_temp, X_test_df, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

# Stage 2: 25% of the remaining 80% (= 20% of the total) becomes the
# validation set, which leaves 60% of the rows for training.
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, stratify=y_temp, random_state=42
)


# Split the columns by dtype. "number" matches every numeric dtype, so integer
# columns are picked up whatever width `astype(int)` produced on this platform
# (selecting only int64/float64 would miss e.g. int32 columns).
categorical_cols = X_train_raw.select_dtypes(include=["object", "category"]).columns.tolist()
numeric_cols = X_train_raw.select_dtypes(include="number").columns.tolist()

# ColumnTransformer drops every column that is not listed in a transformer
# (remainder='drop' is its default), so fail loudly if a column matched
# neither group instead of silently losing it.
unassigned_cols = [
    name for name in X_train_raw.columns
    if name not in numeric_cols and name not in categorical_cols
]
if unassigned_cols:
    raise ValueError(
        f"Columns with an unhandled dtype would be dropped by the preprocessor: {unassigned_cols}"
    )

print("Numeric columns:", len(numeric_cols))
print("Categorical columns:", len(categorical_cols))


# One-hot encoder shared with the categorical pipeline below; categories not
# seen during fit are encoded as all zeros (handle_unknown="ignore").
ohe = OneHotEncoder(handle_unknown="ignore", sparse_output=False)

# Numeric columns: fill missing values with the column mean, then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', ohe)
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_cols),
        ("cat", categorical_transformer, categorical_cols),
    ]
)


# Fit the preprocessor on the training rows only, then reuse the fitted
# transformer for the validation and test rows.
X_train = preprocessor.fit_transform(X_train_raw)
X_val, X_test = (preprocessor.transform(part) for part in (X_val_raw, X_test_df))

print("-", "DATA SPLIT SUMMARY", "-", sep="\n")
for split_name, split_X, split_y in (
    ("Train:", X_train, y_train),
    ("Val:", X_val, y_val),
    ("Test:", X_test, y_test),
):
    # Labels are padded to a common width so the shapes line up.
    print(f"{split_name:<6} {split_X.shape} - {len(split_y)} samples")
print(f"Total features after encoding: {X_train.shape[1]}")

Numeric columns: 32
Categorical columns: 9
-
DATA SPLIT SUMMARY
-
Train: (27126, 79) - 27126 samples
Val:   (9042, 79) - 9042 samples
Test:  (9043, 79) - 9043 samples
Total features after encoding: 79


*Model*

Output

In [None]:
# Column 1 of predict_proba: the predicted probability of class 1 (y == 1)
# for every row of the held-out test set.
y_test_proba = final_model.predict_proba(X_test)[:, 1]

# Linear rescaling of the probability from [0, 1] onto a 1-10 score.
scores_1_to_10 = np.round(1 + (y_test_proba * 9), 3)

# Customer_Index keeps the original row index of each test customer.
results_df = pd.DataFrame({
    'Customer_Index': X_test_df.index,
    'Subscription_Score': scores_1_to_10,
    'Probability': y_test_proba.round(3)
})

# Number of highest-scoring test customers to list. The previous variable
# name said "top 10" although 30 rows were selected, so the count is named here.
N_TOP_CUSTOMERS = 30
top_customers = results_df.nlargest(N_TOP_CUSTOMERS, 'Subscription_Score')
# Alias kept so any cell still referring to the old name keeps working.
top_10_customers = top_customers

print("\n" + "="*20)
print("POTENTIAL CUSTOMERS")
print("="*20)
print(top_customers.to_string(index=False, float_format='{:.3f}'.format))


POTENTIAL CUSTOMERS
 Customer_Index  Subscription_Score  Probability
          43122               9.913        0.990
          43135               9.866        0.985
          43165               9.839        0.982
          44376               9.809        0.979
          43164               9.800        0.978
          43039               9.793        0.977
          45126               9.790        0.977
          43579               9.780        0.976
          34147               9.779        0.975
          33897               9.769        0.974
          41641               9.768        0.974
          34057               9.765        0.974
          42993               9.765        0.974
          34011               9.763        0.974
          43571               9.756        0.973
          43146               9.756        0.973
          43303               9.743        0.971
          45098               9.742        0.971
          45102               9.724        0.969

In [None]:
# Score every customer with the fitted preprocessor and model.
X_all_prepared = preprocessor.transform(X)

proba_all = final_model.predict_proba(X_all_prepared)[:, 1]

# `X` was derived from a copy of `df`, so `proba_all` lines up with the rows
# of `df` by position.
df_rekom = df.copy()

df_rekom["prob_subscription"] = proba_all.round(3)
df_rekom["score"] = (1 + (proba_all * 9)).round(3)

# Rank on the unrounded probability. Sorting on the 3-decimal column let
# customers that share a rounded value appear in arbitrary order (a customer
# with score 9.858 could be ranked above one with 9.859). The stable sort
# keeps the original row order for exact ties.
rank_order = np.argsort(-proba_all, kind="stable")
df_rekom = df_rekom.iloc[rank_order].reset_index(drop=True)
df_rekom["global_rank"] = df_rekom.index + 1

def get_priority_level(p, high_threshold=0.7, medium_threshold=0.4):
    """Bucket a subscription probability into a priority label.

    Args:
        p: Probability-like number to classify.
        high_threshold: Smallest value labelled "HIGH" (default 0.7).
        medium_threshold: Smallest value labelled "MEDIUM" (default 0.4).

    Returns:
        "HIGH" if p >= high_threshold, "MEDIUM" if p >= medium_threshold,
        otherwise "LOW". A value that compares false against both thresholds
        (for example NaN) is labelled "LOW".
    """
    if p >= high_threshold:
        return "HIGH"
    if p >= medium_threshold:
        return "MEDIUM"
    return "LOW"

# Label every customer with a priority bucket based on the rounded probability.
df_rekom["priority_level"] = df_rekom["prob_subscription"].apply(get_priority_level)

print("\n" + "="*60)
print("POTENTIAL CUSTOMERS FOR TERM DEPOSIT")
print("="*60)
print(df_rekom[['global_rank', 'age', 'job', 'marital', 'education',
                'balance', 'prob_subscription', 'score', 'priority_level']].head(10).to_string(index=False))

# One variable holds the file name so the confirmation message always matches
# the path that was written (the old message contained a stray space:
# "customer_recommendations. csv").
recommendations_path = 'customer_recommendations.csv'
df_rekom.to_csv(recommendations_path, index=False)
print(f"Full recommendations saved to '{recommendations_path}'")


POTENTIAL CUSTOMERS FOR TERM DEPOSIT
 global_rank  age        job  marital education  balance  prob_subscription  score priority_level
           1   63 technician  married secondary      973              0.995  9.952           HIGH
           2   71    retired divorced secondary        0              0.991  9.921           HIGH
           3   20    student   single secondary      215              0.990  9.913           HIGH
           4   35 management   single  tertiary      681              0.987  9.886           HIGH
           5   23    student   single secondary        0              0.986  9.877           HIGH
           6   25    student   single secondary      469              0.985  9.866           HIGH
           7   37 management  married secondary      565              0.985  9.864           HIGH
           8   19    student   single secondary      329              0.984  9.858           HIGH
           9   61    retired  married  tertiary     2557              0.984  9.8

In [None]:
# Score every customer with the fitted preprocessor and model.
X_all_prepared = preprocessor.transform(X)

proba_all = final_model.predict_proba(X_all_prepared)[:, 1]

# `X` was derived from a copy of `df`, so `proba_all` lines up with the rows
# of `df` by position.
df_rekom = df.copy()

df_rekom["prob_subscription"] = proba_all.round(3)
df_rekom["score"] = (1 + (proba_all * 9)).round(3)

# Rank on the unrounded probability. Sorting on the 3-decimal column let
# customers that share a rounded value appear in arbitrary order (a customer
# with score 9.858 could be ranked above one with 9.859). The stable sort
# keeps the original row order for exact ties.
rank_order = np.argsort(-proba_all, kind="stable")
df_rekom = df_rekom.iloc[rank_order].reset_index(drop=True)
df_rekom["global_rank"] = df_rekom.index + 1


def get_priority_level(p, high_threshold=0.7, medium_threshold=0.4):
    """Bucket a subscription probability into a priority label.

    Args:
        p: Probability-like number to classify.
        high_threshold: Smallest value labelled "HIGH" (default 0.7).
        medium_threshold: Smallest value labelled "MEDIUM" (default 0.4).

    Returns:
        "HIGH" if p >= high_threshold, "MEDIUM" if p >= medium_threshold,
        otherwise "LOW". A value that compares false against both thresholds
        (for example NaN) is labelled "LOW".
    """
    if p >= high_threshold:
        return "HIGH"
    if p >= medium_threshold:
        return "MEDIUM"
    return "LOW"

# Attach the priority bucket of each customer's rounded probability.
df_rekom["priority_level"] = df_rekom["prob_subscription"].map(get_priority_level)

# Banner followed by the ten best-ranked customers.
print("", "=" * 60, "TOP 10 POTENTIAL CUSTOMERS FOR TERM DEPOSIT", "=" * 60, sep="\n")
print(
    df_rekom.head(10)[[
        'global_rank', 'age', 'job', 'marital', 'education',
        'prob_subscription', 'score', 'priority_level',
    ]].to_string(index=False)
)

def get_daily_recommendation(df_sorted, date_str, calls_per_day=200,
                             base_date=dt.date(2025, 1, 1)):
    """Return the slice of ranked customers to call on a given date.

    The ranked list is consumed in consecutive batches of `calls_per_day`
    rows, one batch per day counted from `base_date`, wrapping around to the
    top of the list once the end is reached.

    Args:
        df_sorted: DataFrame of customers, already in ranked order.
        date_str: Target date; anything `pd.to_datetime` can parse.
        calls_per_day: Number of customers per day. Values larger than the
            number of rows are capped so no customer appears twice in one day.
        base_date: Date of the first batch (day offset 0); a `datetime.date`
            or anything `pd.to_datetime` can parse. Defaults to 2025-01-01.

    Returns:
        A new DataFrame with a fresh 0..k-1 index and an added `daily_rank`
        column (1 = first call of the day). Empty if `df_sorted` is empty or
        `calls_per_day` is 0.

    Raises:
        ValueError: If `calls_per_day` is negative.
    """
    if calls_per_day < 0:
        raise ValueError("calls_per_day must be non-negative")

    n = len(df_sorted)
    # Cap the batch at the list length: asking for more rows than exist used
    # to return the same customers twice.
    batch_size = min(calls_per_day, n)

    if batch_size == 0:
        # Empty input (or zero calls): nothing to schedule, and `% n` below
        # would otherwise divide by zero for an empty frame.
        df_day = df_sorted.iloc[0:0].copy()
    else:
        target_date = pd.to_datetime(date_str).date()
        first_day = pd.to_datetime(base_date).date()
        day_offset = (target_date - first_day).days

        # Python's % always yields a value in [0, n), so dates before
        # `base_date` (negative offsets) also map to a valid start position.
        start_idx = (day_offset * batch_size) % n
        # Row positions for the day, wrapping past the end of the list.
        positions = [(start_idx + step) % n for step in range(batch_size)]
        df_day = df_sorted.iloc[positions].copy()

    df_day = df_day.reset_index(drop=True)
    df_day["daily_rank"] = df_day.index + 1
    return df_day

# Build the call list for one date and show its first ten rows.
target_date = "2025-01-01"
df_hari_ini = get_daily_recommendation(df_rekom, target_date, calls_per_day=200)

print(
    "\n" + "=" * 60,
    f"DAILY RECOMMENDATIONS FOR {target_date}",
    f"Total customers to call: {len(df_hari_ini)}",
    "=" * 60,
    sep="\n",
)
print(
    df_hari_ini.head(10)[[
        'daily_rank', 'global_rank', 'age', 'job', 'marital',
        'prob_subscription', 'score', 'priority_level',
    ]].to_string(index=False)
)

# Number of customers per priority level in this day's list.
print(f"\n--- Priority Distribution for {target_date} ---")
print(df_hari_ini['priority_level'].value_counts().sort_index())

# Write the day's call list next to the notebook, named after the date.
df_hari_ini.to_csv(f'recommendations_{target_date}.csv', index=False)
print(f"\n✓ Daily recommendations saved to 'recommendations_{target_date}.csv'")



TOP 10 POTENTIAL CUSTOMERS FOR TERM DEPOSIT
 global_rank  age        job  marital education  prob_subscription  score priority_level
           1   63 technician  married secondary              0.995  9.952           HIGH
           2   71    retired divorced secondary              0.991  9.921           HIGH
           3   20    student   single secondary              0.990  9.913           HIGH
           4   35 management   single  tertiary              0.987  9.886           HIGH
           5   23    student   single secondary              0.986  9.877           HIGH
           6   25    student   single secondary              0.985  9.866           HIGH
           7   37 management  married secondary              0.985  9.864           HIGH
           8   19    student   single secondary              0.984  9.858           HIGH
           9   61    retired  married  tertiary              0.984  9.859           HIGH
          10   43   services  married secondary              0.98

SAVE

In [None]:
# Persist the fitted model and the fitted preprocessor.
joblib.dump(final_model, 'model.pkl')

joblib.dump(preprocessor, 'preprocessor.pkl')

# Store the names of the encoded feature columns produced by the preprocessor.
feature_names = preprocessor.get_feature_names_out().tolist()
with open('feature_names.json', 'w', encoding='utf-8') as f:
    json.dump(feature_names, f)

# Report success only after the `with` block has closed (and flushed) the file.
print("Model, preprocessor, and feature names saved")

Model, preprocessor, and feature names saved


In [None]:
# Report: what goes into the model (features, types, target, split sizes).
print("\n" + "="*80)
print("MODEL INPUT AND OUTPUT")
print("="*80)


print("\nMODEL INPUTS")
print("-" * 80)

print("\n1.  FEATURES USED FOR TRAINING:")
print(f"   Total number of features: {len(X.columns)}")
print(f"   Feature names: {list(X.columns)}")

print("\n2. FEATURE TYPES:")
# Use the column lists the preprocessor was built with, so the report
# describes what is actually fed to the ColumnTransformer rather than an
# independent re-derivation that could drift from it.
categorical_features = list(categorical_cols)
numerical_features = list(numeric_cols)

print(f"\n   Categorical features ({len(categorical_features)}):")
for feat in categorical_features:
    unique_count = df_fe[feat].nunique()
    print(f"      - {feat}: {unique_count} unique values")
    # Show at most five example values; note how many more exist.
    print(f"        Values: {df_fe[feat].unique()[:5].tolist()}" +
          (f" ...  (+{unique_count-5} more)" if unique_count > 5 else ""))

print(f"\n   Numerical features ({len(numerical_features)}):")
for feat in numerical_features:
    print(f"      - {feat}: min={df_fe[feat].min()}, max={df_fe[feat].max()}, mean={df_fe[feat].mean():.2f}")

print("\n3. TARGET VARIABLE:")
print(f"   Variable name: {y.name}")
print(f"   Type: Binary Classification")
print(f"   Classes: {y.unique().tolist()}")
print(f"   Distribution:")
print(f"      - Class 0 (No subscription): {(y==0).sum()} ({(y==0).sum()/len(y)*100:.2f}%)")
print(f"      - Class 1 (Subscription): {(y==1).sum()} ({(y==1).sum()/len(y)*100:.2f}%)")

print("\n4. DATASET SIZE:")
print(f"   Total samples: {len(df)}")
print(f"   Training set: {len(X_train)} samples ({len(X_train)/len(df)*100:.1f}%)")
# The validation split was missing from this summary, so the listed parts
# did not add up to the total.
print(f"   Validation set: {len(X_val)} samples ({len(X_val)/len(df)*100:.1f}%)")
print(f"   Test set: {len(X_test)} samples ({len(X_test)/len(df)*100:.1f}%)")

# ========== PREPROCESSING ==========
# How many columns go through each branch of the preprocessor, and its class.
print(
    "\n" + "=" * 30,
    "PREPROCESSING STEPS",
    "-" * 30,
    f"   - Categorical encoding: Applied to {len(categorical_features)} features",
    f"   - Numerical scaling: Applied to {len(numerical_features)} features",
    f"   - Preprocessor type: {preprocessor.__class__.__name__}",
    sep="\n",
)

# ========== MODEL ==========
# Class name and full parameter dictionary of the final estimator.
print(
    "\n" + "=" * 80,
    "MODEL INFO",
    "-" * 80,
    f"   Model type: {final_model.__class__.__name__}",
    f"   Model parameters: {final_model.get_params()}",
    sep="\n",
)

# ========== OUTPUTS ==========
# Describes what the model produces and how the recommendation table reads.
print("\n" + "=" * 80, "MODEL OUTPUTS", "-" * 80, sep="\n")

print(
    "\n1.  PREDICTION TYPES:",
    "   - Binary prediction: 0 (No subscription) or 1 (Subscription)",
    "   - Probability: Value between 0 and 1",
    "   - Score: Scaled to 1-10 range (with 3 decimals)",
    sep="\n",
)

print("\n2.  OUTPUT COLUMNS IN RECOMMENDATION DATAFRAME:")
# Column name -> human-readable description, printed one per line.
output_columns = {
    'prob_subscription': 'Probability of subscription (0-1, 3 decimals)',
    'score': 'Subscription score (1-10 scale, 3 decimals)',
    'global_rank': 'Overall rank among all customers (1 = highest)',
    'priority_level': 'Priority category (HIGH/MEDIUM/LOW)',
    'daily_rank': 'Rank within daily recommendation list'
}
print(*(f"   - {col}: {desc}" for col, desc in output_columns.items()), sep="\n")

print(
    "\n3. PRIORITY LEVEL DEFINITIONS:",
    "   - HIGH: prob_subscription ≥ 0.70 (70%)",
    "   - MEDIUM: 0.40 ≤ prob_subscription < 0.70 (40-70%)",
    "   - LOW: prob_subscription < 0.40 (<40%)",
    sep="\n",
)

print("\n4. SAMPLE PREDICTIONS (Top 5):")
# First five rows of the ranked recommendation table.
sample_output = df_rekom.head(5)[['global_rank', 'prob_subscription', 'score', 'priority_level']]
print(sample_output.to_string(index=False))



# Count of customers per priority level, printed in HIGH -> LOW order.
priority_dist = df_rekom['priority_level'].value_counts().sort_index()
print(f"\n   Priority distribution (all {len(df_rekom)} customers):")
for level in ('HIGH', 'MEDIUM', 'LOW'):
    # A level with no customers is reported as 0 rather than raising KeyError.
    count = priority_dist.get(level, 0)
    pct = count / len(df_rekom) * 100
    print(f"      {level:6s}: {count:5d} customers ({pct:5.2f}%)")




MODEL INPUT AND OUTPUT

MODEL INPUTS
--------------------------------------------------------------------------------

1.  FEATURES USED FOR TRAINING:
   Total number of features: 41
   Feature names: ['age', 'job', 'marital', 'education', 'default', 'balance', 'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'pdays', 'previous', 'poutcome', 'balance_squared', 'age_squared', 'balance_per_age', 'balance_age', 'contacted_before', 'days_since_contact', 'contact_frequency', 'contact_intensity', 'frequent_campaign', 'high_campaign', 'previous_contact', 'campaign_per_previous', 'age_group', 'campaign_group', 'has_positive_balance', 'has_debt', 'high_balance', 'balance_category', 'total_loans', 'any_loan', 'no_loans', 'high_success_month', 'young_professional', 'retirement_age', 'balance_x_contacted', 'age_x_balance_pos']

2. FEATURE TYPES:

   Categorical features (9):
      - job: 11 unique values
        Values: ['management', 'technician', 'entrepreneur', 'blue-collar', nan] ...