In [1]:
%load_ext autoreload
%autoreload 2
seed = 42

In [2]:
from sklift.datasets import fetch_hillstrom
from sklift.metrics import uplift_auc_score
from sklearn.model_selection import train_test_split
import pandas as pd
from pipeline_multi import MultiTreatmentTLearner
from sklearn.model_selection import KFold
import numpy as np
from optuna.samplers import TPESampler

# --- BƯỚC 0: Chuẩn bị dữ liệu ---
dataset = fetch_hillstrom(target_col='visit')
df = dataset.data
df['target'] = dataset.target
df['treatment_name'] = dataset.treatment

# Mã hóa treatment thành số để đưa vào model
# No E-Mail=0, Mens E-Mail=1, Womens E-Mail=2
treatment_map = {'No E-Mail': 0, 'Mens E-Mail': 1, 'Womens E-Mail': 2}
df['t_encoded'] = df['treatment_name'].map(treatment_map)

# Feature Engineering (One-hot encoding)
cat_cols = ['zip_code', 'channel', 'history_segment']
df_encoded = pd.get_dummies(df, columns=cat_cols, drop_first=True)

# Tách X, y, t
X = df_encoded.drop(['target', 'treatment_name', 't_encoded'], axis=1).values
y = df_encoded['target'].values
t = df_encoded['t_encoded'].values

# --- BƯỚC 1: Chia Train/Test (Chia 1 lần trên toàn bộ dữ liệu) ---
# Stratify theo cột treatment để đảm bảo tỷ lệ các nhóm ở train và test đều nhau
X_train, X_test, y_train, y_test, t_train, t_test = train_test_split(
    X, y, t, test_size=0.2, random_state=42, stratify=t
)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)
scores = []

def objective(trial, X, y, t):
    params = {
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.1),
        'max_depth': trial.suggest_float('max_depth', 0.7, 1),
        'subsample': trial.suggest_float('subsample', 0.7, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.1, 50.0, log=True),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.001, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0.001, 100.0, log=True),
        'n_estimators': trial.suggest_int('n_estimators', 10, 500),
        'device': "gpu",
        'seed': 42
    }
    
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    cv_scores = []
    
    for train_idx, val_idx in kf.split(X):
        X_train_cv, X_val_cv = X[train_idx], X[val_idx]
        y_train_cv, y_val_cv = y[train_idx], y[val_idx]
        t_train_cv, t_val_cv = t[train_idx], t[val_idx]
        
        model = MultiTreatmentTLearner(**params)
        model.fit(X_train_cv, y_train_cv, t_train_cv)
        
        # --- ĐÁNH GIÁ NHÓM MEN (Group 1 vs Control 0) ---
        uplift_men = model.predict(X_val_cv, treatment_group=1)
        # Mask: chỉ giữ Control (0) và Men (1)
        mask_men = np.isin(t_val_cv, [0, 1])
        # Tính AUUC Men
        score_men = uplift_auc_score(
            y_true=y_val_cv[mask_men], 
            uplift=uplift_men[mask_men], 
            treatment=t_val_cv[mask_men]
        )
        
        # --- ĐÁNH GIÁ NHÓM WOMEN (Group 2 vs Control 0) ---
        uplift_women = model.predict(X_val_cv, treatment_group=2)
        
        # Mask: chỉ giữ Control (0) và Women (2)
        mask_women = np.isin(t_val_cv, [0, 2])
        
        # Map treatment: Women (2) thành 1 để hàm metric hiểu
        t_val_women_binary = (t_val_cv[mask_women] == 2).astype(int)
        
        # Tính AUUC Women
        score_women = uplift_auc_score(
            y_true=y_val_cv[mask_women], 
            uplift=uplift_women[mask_women], 
            treatment=t_val_women_binary
        )
        
        final_score = (score_men + score_women)/2
        scores.append({'men': score_men, 'women': score_women, 'average': final_score})
        cv_scores.append(final_score)
        
    return np.mean(cv_scores)

In [5]:
sampler = TPESampler(seed=42)
study = optuna.create_study(direction="maximize", sampler=sampler)

print("--- Bắt đầu tìm kiếm Hyperparameter (TPE) ---")
study.optimize(lambda trial: objective(trial, X_train, y_train, t_train), n_trials=50)

print("\nBest params:", study.best_params)
print("Best CV AUUC:", study.best_value)

# --- 4. HUẤN LUYỆN LẠI VỚI BEST PARAMS ---
print("\n--- Retraining Final Model ---")
best_params = study.best_params
best_params['seed'] = 42

final_model = MultiTreatmentTLearner(**best_params)
final_model.fit(X_train, y_train, t_train)

# --- EVALUATE CHO NHÓM MEN (1) ---
uplift_test_men = final_model.predict(X_test, treatment_group=1)

# Lọc: Chỉ lấy Control (0) và Men (1)
mask_men = np.isin(t_test, [0, 1])
test_auuc_men = uplift_auc_score(
    y_true=y_test[mask_men], 
    uplift=uplift_test_men[mask_men], 
    treatment=t_test[mask_men] # t_test ở đây chỉ chứa 0 và 1 nên hợp lệ
)

# --- EVALUATE CHO NHÓM WOMEN (2) ---
uplift_test_women = final_model.predict(X_test, treatment_group=2)

# Lọc: Chỉ lấy Control (0) và Women (2)
mask_women = np.isin(t_test, [0, 2])
# Lưu ý: Cần map giá trị 2 về 1 để hàm metric hiểu đó là nhóm treatment
t_test_women_binary = (t_test[mask_women] == 2).astype(int)

test_auuc_women = uplift_auc_score(
    y_true=y_test[mask_women], 
    uplift=uplift_test_women[mask_women], 
    treatment=t_test_women_binary
)

print(f"Final Test AUUC (Hillstrom Men): {test_auuc_men:.4f}")
print(f"Final Test AUUC (Hillstrom Women): {test_auuc_women:.4f}")

--- Bắt đầu tìm kiếm Hyperparameter (TPE) ---
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model for group 1...
  -> Training model for group 2...
  -> Training model for group 0...
  -> Training model 

In [9]:
import json
with open('hillstrom_multi_treatment_tpe_results.json', 'w') as f:
    json.dump(scores, f)

In [15]:
best_run = max(scores, key=lambda x: x['average'])
print(f"Details -> Men: {best_run['men']:.4f}, Women: {best_run['women']:.4f}")

TypeError: unsupported format string passed to numpy.ndarray.__format__