In [1]:
%load_ext autoreload
%autoreload 2
seed = 42

# Prepare data

In [2]:
# Data
from sklift.datasets import fetch_hillstrom
import pandas as pd

hillstrom_dataset = fetch_hillstrom()
full_data, full_target, full_treatment = fetch_hillstrom(target_col="visit",return_X_y_t=True)

mask_women = full_treatment != 'Womens E-Mail'
men_data_raw = full_data[mask_women]
men_target_raw = full_target[mask_women]
men_treatment_raw = full_treatment[mask_women]

# Xử lý biến Treatment (QUAN TRỌNG)
# Không dùng get_dummies ở đây vì dễ nhầm lẫn. Dùng map để kiểm soát 0 và 1.
# Mens E-Mail -> 1 (Treatment)
# No E-Mail   -> 0 (Control)
men_treatment_binary = men_treatment_raw.map({
    'Mens E-Mail': 1,
    'No E-Mail': 0
})
men_data = pd.get_dummies(men_data_raw, drop_first=True)
men_data_np, men_target_np, men_treatment_np = men_data.values.astype("float32"), men_target_raw.values.astype("float32"), men_treatment_binary.values.astype("float32")

# Women
mask_women = full_treatment != 'Mens E-Mail'
women_data_raw = full_data[mask_women]
women_target_raw = full_target[mask_women]
women_treatment_raw = full_treatment[mask_women]

women_treatment_binary = women_treatment_raw.map({
    'Womens E-Mail': 1,
    'No E-Mail': 0
})


women_data = pd.get_dummies(women_data_raw, drop_first=True)
women_data_np, women_target_np, women_treatment_np = women_data.values.astype("float32"), women_target_raw.values.astype("float32"), women_treatment_binary.values.astype("float32")

print("Splited data into women and men datasets")
print(f"Men: {men_data_np.shape}")
print(f"Women: {women_data_np.shape}")

Splited data into women and men datasets
Men: (42613, 15)
Women: (42693, 15)


  from .autonotebook import tqdm as notebook_tqdm


## 0.2. Split train-test and scale data

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Split train test sets
X_men_train, X_men_test, y_men_train, y_men_test, t_men_train, t_men_test = train_test_split(
    men_data_np, men_target_np, men_treatment_np, test_size=0.2, random_state=seed, stratify=men_treatment_np
)

X_women_train, X_women_test, y_women_train, y_women_test, t_women_train, t_women_test = train_test_split(
    women_data_np, women_target_np, women_treatment_np, test_size=0.2, random_state=seed, stratify=women_treatment_np
)
print("Splited men & women datasets into train & test sets")
print(f"MEN - Train size: {X_men_train.shape[0]}, Test size: {X_men_test.shape[0]}")
print(f"WOMEN - Train size: {X_women_train.shape[0]}, Test size: {X_women_test.shape[0]}\n")

# Features scaling
scaler = StandardScaler()
X_men_train_scaled = scaler.fit_transform(X_men_train)
X_men_test_scaled = scaler.transform(X_men_test)

X_women_train_scaled = scaler.fit_transform(X_women_train)
X_women_test_scaled = scaler.transform(X_women_test)
print("Feature scaled X_men_train, X_men_test, X_women_train, X_women_test")

Splited men & women datasets into train & test sets
MEN - Train size: 34090, Test size: 8523
WOMEN - Train size: 34154, Test size: 8539

Feature scaled X_men_train, X_men_test, X_women_train, X_women_test


# Optuna

In [None]:
import optuna
from pipeline import TLearnerPipeline
from optuna.samplers import TPESampler
from sklearn.model_selection import KFold
import numpy as np
from sklift.metrics import uplift_auc_score
from causalml.inference.meta import BaseTClassifier
from xgboost import XGBClassifier
import torch

optuna.logging.set_verbosity(optuna.logging.WARNING)
sampler = TPESampler(seed=seed)

def my_objective(trial, X, y, t):
    # 1. Định nghĩa Search Space theo bài báo (Table 7)
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1), # [cite: 1244]
        'max_depth': trial.suggest_float('max_depth', 0.7, 1),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0), # [cite: 1255]
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0), # [cite: 1261]
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 50.0, log=True), # [cite: 1267]
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True), # [cite: 1273]
        'gamma': trial.suggest_float('gamma', 0.001, 100.0, log=True), # [cite: 1279]
        'n_estimators': 500, # Giới hạn số cây như bài báo [cite: 1466]
        'seed': seed
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=seed) # [cite: 1070]
    cv_scores = []
    
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        t_train, t_val = t[train_idx], t[val_idx]

        t_learner = TLearnerPipeline(**params)
        t_learner.fit(X_train, y_train, t_train)

        y0_pred, y1_pred, uplift_pred = t_learner.predict(X_val)
        
        auuc = uplift_auc_score(y_val, uplift_pred, t_val)
        cv_scores.append(auuc)

    # Trả về trung bình loss của 5 folds
    return np.mean(cv_scores)


def lib_objective(trial, X, y, t):
    # 1. Định nghĩa Search Space theo bài báo (Table 7)
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_float('max_depth', 0.7, 1), # Bài báo ghi 0.7-1 (nghi vấn typo), ta dùng range chuẩn [2, 6]
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 50.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.001, 100.0, log=True),
        'n_estimators': 500,
        'seed': seed
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=seed)
    cv_scores = []
    
    for train_idx, val_idx in kf.split(X):
        X_train, X_val = X[train_idx], X[val_idx]
        y_train, y_val = y[train_idx], y[val_idx]
        t_train, t_val = t[train_idx], t[val_idx]

        t_learner = BaseTClassifier(XGBClassifier(**params))
        t_learner.fit(X_train, y_train, t_train)

        uplift_pred = t_learner.predict(X_val)
        print(type(uplift_pred))

        if len(uplift_pred.shape) > 1:
            uplift_pred = uplift_pred.flatten()
        
        auuc = uplift_auc_score(y_val, uplift_pred, t_val)
        cv_scores.append(auuc)

    # Trả về trung bình loss của 5 folds
    return np.mean(cv_scores)

## TPE Men

In [6]:
study_men = optuna.create_study(direction='maximize', sampler=sampler)
study_men.optimize(
    lambda trial: my_objective(trial, X_men_train, y_men_train, t_men_train), 
    n_trials=50, 
    timeout=10000
)
print(study_men.best_params)
print(study_men.best_value)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


{'learning_rate': 0.08938499947839136, 'max_depth': 0.7990020314987771, 'subsample': 0.8715228494662464, 'colsample_bytree': 0.7683588467792184, 'reg_lambda': 6.44507559893496, 'min_child_weight': 3.937538060016898, 'gamma': 65.68239932561528}
0.011291034534913463


In [None]:
study_men_lib = optuna.create_study(direction='maximize', sampler=sampler)
study_men_lib.optimize(
    lambda trial: lib_objective(trial, X_men_train, y_men_train, t_men_train), 
    n_trials=50, 
    timeout=10000
)
print(study_men_lib.best_params)
print(study_men_lib.best_value)

## TPE Women

In [7]:
study_women = optuna.create_study(direction='maximize')
study_women.optimize(
    lambda trial: my_objective(trial, X_women_train_scaled, y_women_train, t_women_train), 
    n_trials=50, 
    timeout=10000
)
print(study_men.best_params)
print(study_men.best_value)

{'learning_rate': 0.08938499947839136, 'max_depth': 0.7990020314987771, 'subsample': 0.8715228494662464, 'colsample_bytree': 0.7683588467792184, 'reg_lambda': 6.44507559893496, 'min_child_weight': 3.937538060016898, 'gamma': 65.68239932561528}
0.011291034534913463
