In [1]:
!pip install numpy pandas torch scikit-learn matplotlib seaborn scikit-uplift causalml

Collecting scikit-uplift
  Downloading scikit_uplift-0.5.1-py3-none-any.whl.metadata (11 kB)
Collecting causalml
  Downloading causalml-0.15.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (10 kB)
Collecting forestci==0.6 (from causalml)
  Downloading forestci-0.6-py3-none-any.whl.metadata (1.3 kB)
Collecting pathos==0.2.9 (from causalml)
  Downloading pathos-0.2.9-py3-none-any.whl.metadata (11 kB)
Downloading scikit_uplift-0.5.1-py3-none-any.whl (42 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.1/42.1 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading causalml-0.15.5-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (8.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.4/8.4 MB[0m [31m63.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading forestci-0.6-py3-none-any.whl (12 kB)
Downloading pathos-0.2.9-py3-none-any.whl (76 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

# Prepare the data

In [2]:
from causalml.dataset import make_uplift_classification

# Define the treatment names based on the 6 treatments mentioned in the paper
treatment_names = ['control', 'treatment_1', 'treatment_2', 'treatment_3', 'treatment_4', 'treatment_5', 'treatment_6']

delta_uplift_increase = {
    'control': 0.0, # <--- Uplift của Control = 0
    'treatment_1': 0.05, 'treatment_2': 0.1, 'treatment_3': 0.12,
    'treatment_4': 0.17, 'treatment_5': 0.2, 'treatment_6': 0.2
}

delta_uplift_decrease = {
    'control': 0.0, # <--- Uplift của Control = 0
    'treatment_1': 0.01, 'treatment_2': 0.02, 'treatment_3': 0.03,
    'treatment_4': 0.05, 'treatment_5': 0.06, 'treatment_6': 0.07
}
n_uplift_increase_mix_informative = [0, 1, 2, 3, 4, 5, 6]
n_uplift_decrease_mix_informative = [0, 1, 1, 1, 1, 1, 1]
    
# Generate the data
df, x_names = make_uplift_classification(
    n_samples=10000 * 7, # Adjust total samples to account for control + 6 treatments
    treatment_name=treatment_names,
    n_classification_features=100,
    n_classification_informative=20,
    n_classification_redundant=10,
    n_classification_repeated=10,
    positive_class_proportion=0.2,
    delta_uplift_increase_dict=delta_uplift_increase,
    delta_uplift_decrease_dict=delta_uplift_decrease,
    n_uplift_increase_mix_informative_dict={t: v for t, v in zip(treatment_names, n_uplift_increase_mix_informative)},
    n_uplift_decrease_mix_informative_dict={t: v for t, v in zip(treatment_names, n_uplift_decrease_mix_informative)},
    random_seed=42
)

  if entities is not ():
Failed to import duecredit due to No module named 'duecredit'


In [3]:
X = df[x_names].values

Y = df['conversion'].values

treatment_map = {
    'control': 0,
    'treatment_1': 1,
    'treatment_2': 2,
    'treatment_3': 3,
    'treatment_4': 4,
    'treatment_5': 5,
    'treatment_6': 6
}
T = df['treatment_group_key'].map(treatment_map).values

# Verify shapes
print(f"X shape: {X.shape}")
print(f"Y shape: {Y.shape}")
print(f"T shape: {T.shape}")

X shape: (490000, 100)
Y shape: (490000,)
T shape: (490000,)


# End to end pipeline

In [4]:
# Building model
from functools import partial
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
import torch
from torch import nn
import torch.nn.functional as F
class DragonNetBase(nn.Module):
    """
    Parameters
    ----------
    input_dim: int
        input dimension for convariates
    shared_hidden: int
        layer size for hidden shared representation layers
    outcome_hidden: int
        layer size for conditional outcome layers
    """
    def __init__(self, input_dim, shared_hidden=200, outcome_hidden=100):
        super(DragonNetBase, self).__init__()
        # NOTE: Shared representation layers - Dragon Body
        self.full_connect_1 = nn.Linear(in_features=input_dim, out_features=shared_hidden)
        self.full_connect_2 = nn.Linear(in_features=shared_hidden, out_features=shared_hidden)
        self.full_connect_3 = nn.Linear(in_features=shared_hidden, out_features=shared_hidden)

        # NOTE: Output of the Dragon Body
        self.treat_out = nn.Linear(in_features=shared_hidden, out_features=1)
        #---------------------------------------------------#

        # NOTE: Prediction heads - 1st Dragon Head - Control
        self.control_head_full_connect_1 = nn.Linear(in_features=shared_hidden, out_features=outcome_hidden)
        self.control_head_full_connect_2 = nn.Linear(in_features=outcome_hidden, out_features=outcome_hidden)
        self.control_head_full_connect_out = nn.Linear(in_features=outcome_hidden, out_features=1)

        # NOTE: Prediction heads - 2nd Dragon Head - Treatment
        self.treatment_head_full_connect_1 = nn.Linear(in_features=shared_hidden, out_features=outcome_hidden)
        self.treatment_head_full_connect_2 = nn.Linear(in_features=outcome_hidden, out_features=outcome_hidden)
        self.treatment_head_full_connect_out = nn.Linear(in_features=outcome_hidden, out_features=1)

        # NOTE: Propensity score head - 3rd Dragon Head - uses linear epsilon
        self.epsilon = nn.Linear(in_features=1, out_features=1)
        torch.nn.init.xavier_normal_(self.epsilon.weight)

    def forward(self, inputs):
        """
        forward method to train model.

        Parameters
        ----------
        inputs: torch.Tensor
            covariates

        Returns
        -------
        y0: torch.Tensor
            outcome under control
        y1: torch.Tensor
            outcome under treatment
        t_pred: torch.Tensor
            predicted treatment
        eps: torch.Tensor
            trainable epsilon parameter
        """
        #shared layer
        x = F.elu(self.full_connect_1(inputs))
        x = F.elu(self.full_connect_2(x))
        z = F.elu(self.full_connect_3(x))

        #propensity
        t_pred = torch.sigmoid(self.treat_out(z))

        y0 = F.elu(self.control_head_full_connect_1(z))
        y0 = F.elu(self.control_head_full_connect_2(y0))
        y0 = self.control_head_full_connect_out(y0)

        y1 = F.elu(self.treatment_head_full_connect_1(z))
        y1 = F.elu(self.treatment_head_full_connect_2(y1))
        y1 = self.treatment_head_full_connect_out(y1)

        eps = self.epsilon(torch.ones_like(t_pred)[:, 0:1])

        return y0, y1, t_pred, eps

def default_loss(y_true, t_true, t_pred, y0_pred, y1_pred, eps, alpha=1.0):
    """
    Generic loss function for dragonnet

    Parameters
    ----------
    y_true: torch.Tensor
        Actual target variable
    t_true: torch.Tensor
        Actual treatment variable
    t_pred: torch.Tensor
        Predicted treatment
    y0_pred: torch.Tensor
        Predicted target variable under control
    y1_pred: torch.Tensor
        Predicted target variable under treatment
    eps: torch.Tensor
        Trainable epsilon parameter
    alpha: float
        loss component weighting hyperparameter between 0 and 1
    Returns
    -------
    loss: torch.Tensor
    """
    t_pred = (t_pred + 0.01) / 1.02
    loss_t = torch.sum(F.binary_cross_entropy(t_pred, t_true))

    loss0 = torch.sum((1. - t_true) * torch.square(y_true - y0_pred))
    loss1 = torch.sum(t_true * torch.square(y_true - y1_pred))

    loss = loss0 + loss1 + alpha * loss_t

    return loss

def tarreg_loss(y_true, t_true, t_pred, y0_pred, y1_pred, eps, alpha=1.0, beta=1.0):
    """
    Targeted regularisation loss function for dragonnet

    Parameters
    ----------
    y_true: torch.Tensor
        Actual target variable
    t_true: torch.Tensor
        Actual treatment variable
    t_pred: torch.Tensor
        Predicted treatment
    y0_pred: torch.Tensor
        Predicted target variable under control
    y1_pred: torch.Tensor
        Predicted target variable under treatment
    eps: torch.Tensor
        Trainable epsilon parameter
    alpha: float
        loss component weighting hyperparameter between 0 and 1
    beta: float
        targeted regularization hyperparameter between 0 and 1
    Returns
    -------
    loss: torch.Tensor
    """
    vanilla_loss = default_loss(y_true, t_true, t_pred, y0_pred, y1_pred, alpha)
    t_pred = (t_pred + 0.01) / 1.02

    y_pred = t_true * y1_pred + (1 - t_true) * y0_pred

    # clever covariate
    h = (t_true / t_pred) - ((1 - t_true) / (1 - t_pred))

    y_pert = y_pred + eps * h
    targeted_regularization = torch.sum((y_true - y_pert)**2)

    # final
    loss = vanilla_loss + beta * targeted_regularization
    return loss

class EarlyStopper:
    def __init__(self, patience=15, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf

    def early_stop(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

class Dragonnet:
    """
    Main class for the Dragonnet model

    Parameters
    ----------
    input_dim: int
        Input demension for convariates (X - features)
    shared_hidden: int, default=200
        The number of hidden layers in the dragon body
    outcome_hidden: int, default=100
        The number of hidden layers in the dragon accuracy head
    alpha: float, default=1.0
        loss component weighting hyperparameter between 0 and 1
    beta: float, default=1.0
        targeted regularization hyperparameter between 0 and 1
    epochs: int, default=200
        Number training epochs
    batch_size: int, default=64
        Training batch size
    learning_rate: float, default=1e-3
        Learning rate
    data_loader_num_workers: int, default=4
        Number of workers for data loader
    loss_type: str, {'tarreg', 'default'}, default='tarreg'
        Loss function to use
    device=None
        Whether we use the CPU or GPU to train
    """
    def __init__(
            self,
            input_dim, # Input demension for convariates (X - features)
            shared_hidden=200, # The number of hidden layers in the dragon body
            outcome_hidden=100, # The number of hidden layers in the dragon accuracy head
            alpha=1.0, #
            beta=1.0,
            epochs=30,
            batch_size=32,
            learning_rate=0.0005,
            data_loader_num_workers=2,
            loss_type="tarreg",
            device=None,
            seed=42
    ):
        # 1. Thiết lập Device
        if device:
            self.model_device = device
        else:
            self.model_device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        # 2. Khởi tạo Model và đưa lên Device ngay lập tức
        self.model = DragonNetBase(input_dim=input_dim, shared_hidden=shared_hidden, outcome_hidden=outcome_hidden)
        self.model.to(self.model_device) # Move model to GPU
        self.epochs = epochs
        self.batch_size = batch_size
        self.num_workers = data_loader_num_workers
        self.seed = seed

        # Optimizer phải được khởi tạo SAU KHI model đã được move lên GPU
        self.optim = torch.optim.Adam(self.model.parameters(), lr=learning_rate)
        self.train_dataloader = None
        self.valid_dataloader = None

        if loss_type == "tarreg":
            self.loss_f = partial(tarreg_loss, alpha=alpha, beta=beta)
        elif loss_type == "default":
            self.loss_f = partial(default_loss, alpha=alpha)

    def create_dataloaders(self, X, y, T, valid_perc=None):
        """
        Utility function to create train and validation data loader:

        Parameters
        ----------
        X: np.array
            covariates
        y: np.array
            target variable
        T: np.array
            treatment
        """
        if valid_perc:
            X_train, X_test, y_train, y_test, T_train, T_test = train_test_split(
                X, y, T, test_size=valid_perc, random_state=self.seed
            )
            # Không cần .to(device) ở đây để tiết kiệm VRAM, sẽ move theo batch
            X_train = torch.Tensor(X_train)
            X_test = torch.Tensor(X_test)
            y_train = torch.Tensor(y_train).reshape(-1, 1)
            y_test = torch.Tensor(y_test).reshape(-1, 1)
            T_train = torch.Tensor(T_train).reshape(-1, 1)
            T_test = torch.Tensor(T_test).reshape(-1, 1)
            train_dataset = TensorDataset(X_train, T_train, y_train)
            valid_dataset = TensorDataset(X_test, T_test, y_test)
            self.train_dataloader = DataLoader(train_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
            self.valid_dataloader = DataLoader(valid_dataset, batch_size=self.batch_size, num_workers=self.num_workers)
        else:
            X = torch.Tensor(X)
            T = torch.Tensor(T).reshape(-1, 1)
            y = torch.Tensor(y).reshape(-1, 1)
            train_dataset = TensorDataset(X, T, y)
            self.train_dataloader = DataLoader(
                train_dataset, batch_size=self.batch_size, num_workers=self.num_workers
            )

    def fit(self, X, y, T, valid_perc=None):
        """
        Function used to train the dragonnet model

        Parameters
        ----------
        x: np.array
            covariates
        y: np.array
            target variable
        t: np.array
            treatment
        valid_perc: float
            Percentage of data to allocate to validation set
        """
        self.train_losses, self.valid_losses = [], []
        self.create_dataloaders(X, y, T, valid_perc)
        early_stopper = EarlyStopper(patience=10, min_delta=0)
        for epoch in range(self.epochs):
            running_loss_train = 0.0
            for batch, (X, tr, y1) in enumerate(self.train_dataloader):
                # <--- QUAN TRỌNG: Move batch data lên GPU
                X = X.to(self.model_device)
                tr = tr.to(self.model_device)
                y1 = y1.to(self.model_device)

                self.optim.zero_grad()

                y0_pred, y1_pred, t_pred, eps = self.model(X)
                loss = self.loss_f(y1, tr, t_pred, y0_pred, y1_pred, eps)

                loss.backward()
                self.optim.step()
                running_loss_train += loss.item()

            train_loss = running_loss_train/len(self.train_dataloader)
            self.train_losses.append(train_loss)
            if self.valid_dataloader:
                self.model.eval()
                valid_loss = self.validate_step()
                self.valid_losses.append(valid_loss)
                # print(
                #     f"epoch: {epoch}--------- train_loss: {train_loss:.4f} ----- valid_loss: {valid_loss}"
                # )
                self.model.train()
                if early_stopper.early_stop(valid_loss):
                    print("Early stopping activated")
                    break
            else:
                # print(f"epoch: {epoch}--------- train_loss: {train_loss:.4f}")
                pass

    def validate_step(self):
        """
        Calculates validation loss

        Returns
        -------
        valid_loss: torch.Tensor
            validation loss
        """
        self.model.eval()
        valid_loss = []
        with torch.no_grad():
            for batch, (X, tr, y1) in enumerate(self.valid_dataloader):
                # <--- QUAN TRỌNG: Move batch data lên GPU
                X = X.to(self.model_device)
                tr = tr.to(self.model_device)
                y1 = y1.to(self.model_device)

                y0_pred, y1_pred, t_pred, eps = self.model(X)
                loss = self.loss_f(y1, tr, t_pred, y0_pred, y1_pred, eps)
                valid_loss.append(loss)
        return torch.Tensor(valid_loss).mean()


    def predict(self, X):
        """
        Function used to predict on covariates.

        Parameters
        ----------
        X: torch.Tensor or numpy.array
            covariates

        Returns
        -------
        y0_pred: torch.Tensor
            outcome under control
        y1_pred: torch.Tensor
            outcome under treatment
        t_pred: torch.Tensor
            predicted treatment
        eps: torch.Tensor
            trainable epsilon parameter
        """
        self.model.eval()
        X = torch.Tensor(X).to(self.model_device) # <--- Move input lên GPU
        with torch.no_grad():
            y0_pred, y1_pred, t_pred, eps = self.model(X)
        return (
            y0_pred.cpu().numpy(),
            y1_pred.cpu().numpy(),
            t_pred.cpu().numpy(),
            eps.cpu().numpy()
        )

# Running

In [5]:
import torch
import numpy as np
from sklearn.metrics import mean_squared_error
from sklift.metrics import uplift_auc_score
from sklearn.model_selection import train_test_split
models = {} # Store one model per treatment

treatment_list = [1, 2, 3, 4, 5, 6]
models = {} # Store one model per treatment

for treat_id in treatment_list:
    print(f"\n=== Training DragonNet for Control vs Treatment {treat_id} ===")
    
    # 1. Filter Data: Keep only Control (0) and Current Treatment (treat_id)
    mask = np.isin(T, [0, treat_id])
    X_sub = X[mask]
    Y_sub = Y[mask]
    T_sub = T[mask]

    # 2. Binarize Treatment: Map treat_id -> 1, Control -> 0
    T_sub_binary = np.where(T_sub == treat_id, 1, 0)
    
    # Split train test
    X_train, X_test, Y_train, Y_test, T_train, T_test = train_test_split(
        X_sub, Y_sub, T_sub_binary, test_size=0.2, random_state=42, stratify=T_sub
    )
    
    # 2. Binarize Treatment: Map treat_id -> 1, Control -> 0
    # This is crucial for your model's binary propensity head
    T_sub_binary = np.where(T_sub == treat_id, 1, 0)
    
    # 3. Initialize Model
    dragonnet_model = Dragonnet(
        input_dim=X_sub.shape[1],
        seed=42
    )
    
    # 4. Train
    dragonnet_model.fit(X_train, Y_train, T_train, valid_perc=0.2)
    
    # 5. Predict and Evaluate
    y0_pred, y1_pred, t_pred, eps = dragonnet_model.predict(X_test)
    y_pred_final = T_test.flatten() * y1_pred.flatten() + (1 - T_test.flatten()) * y0_pred.flatten()
    
    mse = mean_squared_error(Y_test, y_pred_final)
    uplift = y1_pred - y0_pred
    auc = uplift_auc_score(Y_test, uplift, T_test)
    
    print("MSE: ", mse)
    print("Uplift AUC: ", auc)
    
    # 6. Store model
    models[f"treatment_{treat_id}"] = dragonnet_model

print("\nAll models trained.")


=== Training DragonNet for Control vs Treatment 1 ===
Early stopping activated
MSE:  0.024297721258878587
Uplift AUC:  0.014250053336391014

=== Training DragonNet for Control vs Treatment 2 ===
Early stopping activated
MSE:  0.026008259443829413
Uplift AUC:  0.002905677829647605

=== Training DragonNet for Control vs Treatment 3 ===
Early stopping activated
MSE:  0.024332807142681167
Uplift AUC:  -0.004758107745192709

=== Training DragonNet for Control vs Treatment 4 ===
Early stopping activated
MSE:  0.02542913326268863
Uplift AUC:  0.004621955306152028

=== Training DragonNet for Control vs Treatment 5 ===
Early stopping activated
MSE:  0.02557554413435204
Uplift AUC:  -0.007403355088457807

=== Training DragonNet for Control vs Treatment 6 ===
Early stopping activated
MSE:  0.024991057266101034
Uplift AUC:  0.003961910657454211

All models trained.
