# Datasets benchmarks

In this notebook we will benchmark the data-generating processes.

In [1]:

import numpy as np
import pandas as pd

# If available in your environment:
from causalkit.data import CausalDatasetGenerator

# --- Reproducibility --------------------------------------------------------
# The same seed is applied to NumPy's global RNG and handed to the generator.
SEED = 42
np.random.seed(SEED)

# --- Scalar simulation settings ----------------------------------------------
n = 10_000           # number of observations
theta_arppu = 4.0    # true causal effect on ARPPU (on the natural/mean scale)
sigma_y = 8.0        # outcome noise; ARPPU is positive and often heavy-tailed,
                     # but a simple Gaussian noise is used here
target_t_rate = 0.4  # target share of users exposed to the feature

# --- Confounders schema -------------------------------------------------------
# Illustrative product/behavior/user attributes.
# NOTE(review): the normal spec for tenure_months (mu=18, sd=12) produces
# negative tenures (the recorded head() output contains -5.41) -- confirm that
# this is acceptable for the benchmark.
confounder_specs = [
    {"name": "tenure_months", "dist": "normal", "mu": 18, "sd": 12},    # app tenure
    {"name": "sessions_per_week", "dist": "normal", "mu": 6, "sd": 3},  # engagement
    {"name": "spend_last_30d", "dist": "uniform", "a": 0, "b": 300},    # recent spend
    {"name": "premium_user", "dist": "bernoulli", "p": 0.2},            # subscription
    {"name": "urban_resident", "dist": "bernoulli", "p": 0.65},         # geography
]

# --- Coefficients (order must match confounder_specs) -------------------------
# Baseline, additive effect of each confounder on ARPPU:
#   tenure month, weekly session, recent spend unit, premium user, urban resident
beta_y = np.array([0.08, 0.60, 0.03, 6.00, 2.00], dtype=float)

# Effect of each confounder on feature exposure (log-odds scale):
#   tenure, sessions, recent spend, premium users prioritized, urban residents
beta_t = np.array([0.02, 0.15, 0.004, 1.00, 0.35], dtype=float)

# --- Generator and dataset ----------------------------------------------------
gen = CausalDatasetGenerator(
    theta=theta_arppu,
    outcome_type="continuous",
    sigma_y=sigma_y,
    target_t_rate=target_t_rate,
    seed=SEED,
    confounder_specs=confounder_specs,
    beta_y=beta_y,
    beta_t=beta_t,
)

# Every confounder declared in the schema is passed on, in schema order.
causal_data = gen.to_causal_data(
    n=n,
    confounders=[spec["name"] for spec in confounder_specs],
)

# Show first few rows
causal_data.df.head()


Unnamed: 0,y,t,tenure_months,sessions_per_week,spend_last_30d,premium_user,urban_resident
0,8.406833,0.0,21.656605,6.528831,108.828852,1.0,0.0
1,20.729293,0.0,5.520191,8.69737,282.721931,1.0,0.0
2,21.206321,1.0,27.005414,1.583188,76.56066,0.0,1.0
3,12.788396,1.0,29.286777,5.144106,250.488359,0.0,1.0
4,25.289485,0.0,-5.412422,8.48746,269.056688,0.0,1.0


In [2]:
from causalkit.inference.ate import dml_ate

# Estimate the Average Treatment Effect (ATE) with dml_ate, requesting a 95%
# confidence interval. The dataset was generated with theta_arppu = 4.0, so
# that is the ground-truth value the estimate should be compared against.
# n_folds is presumably the number of cross-fitting folds -- TODO confirm in
# the causalkit documentation.
ate_result = dml_ate(causal_data, n_folds=4, confidence_level=0.95)
# Display the result (in the recorded run: a dict with 'coefficient',
# 'std_error', 'p_value', 'confidence_interval' and 'model').
ate_result

{'coefficient': 4.090571940507052,
 'std_error': 0.22198970848317795,
 'p_value': 0.0,
 'confidence_interval': (3.6554801069414777, 4.525663774072627),
 'model': <causalkit.inference.estimators.irm.IRM at 0x153352120>}

In [3]:
from causalkit.inference.att import dml_att

# Estimate the Average Treatment effect on the Treated (ATT) with dml_att,
# requesting a 95% confidence interval (same call shape as the dml_ate cell).
att_result = dml_att(causal_data, n_folds=4, confidence_level=0.95)
# Display the result (in the recorded run: a dict with 'coefficient',
# 'std_error', 'p_value', 'confidence_interval' and 'model').
att_result

{'coefficient': 3.9104555564866956,
 'std_error': 0.23592251483262563,
 'p_value': 0.0,
 'confidence_interval': (3.4480559242726327, 4.3728551887007585),
 'model': <causalkit.inference.estimators.irm.IRM at 0x10769b250>}

In [4]:

import numpy as np
import pandas as pd

# If available in your environment:
from causalkit.data import CausalDatasetGenerator

# NOTE(review): this cell repeats the first generator cell; the only value that
# differs is the spend_last_30d entry of beta_t (0.9 here vs 0.004 there).
# Consider factoring the shared setup into a helper that takes beta_t.

# --- Reproducibility --------------------------------------------------------
# The same seed is applied to NumPy's global RNG and handed to the generator.
SEED = 42
np.random.seed(SEED)

# --- Scalar simulation settings ----------------------------------------------
n = 10_000           # number of observations
theta_arppu = 4.0    # true causal effect on ARPPU (on the natural/mean scale)
sigma_y = 8.0        # outcome noise; ARPPU is positive and often heavy-tailed,
                     # but a simple Gaussian noise is used here
target_t_rate = 0.4  # target share of users exposed to the feature

# --- Confounders schema -------------------------------------------------------
# Illustrative product/behavior/user attributes.
# NOTE(review): the normal spec for tenure_months (mu=18, sd=12) produces
# negative tenures (the recorded head() output contains -5.41) -- confirm that
# this is acceptable for the benchmark.
confounder_specs = [
    {"name": "tenure_months", "dist": "normal", "mu": 18, "sd": 12},    # app tenure
    {"name": "sessions_per_week", "dist": "normal", "mu": 6, "sd": 3},  # engagement
    {"name": "spend_last_30d", "dist": "uniform", "a": 0, "b": 300},    # recent spend
    {"name": "premium_user", "dist": "bernoulli", "p": 0.2},            # subscription
    {"name": "urban_resident", "dist": "bernoulli", "p": 0.65},         # geography
]

# --- Coefficients (order must match confounder_specs) -------------------------
# Baseline, additive effect of each confounder on ARPPU:
#   tenure month, weekly session, recent spend unit, premium user, urban resident
beta_y = np.array([0.08, 0.60, 0.03, 6.00, 2.00], dtype=float)

# Effect of each confounder on feature exposure (log-odds scale):
#   tenure, sessions, recent spend, premium users prioritized, urban residents
# The recent-spend coefficient is 0.9 per spend unit while spend_last_30d is
# uniform on [0, 300], so exposure is driven almost entirely by spend (the
# recorded diagnostics further down report ROC AUC 0.9998 and a raised
# positivity flag).
beta_t = np.array([0.02, 0.15, 0.9, 1.00, 0.35], dtype=float)

# --- Generator and dataset ----------------------------------------------------
gen = CausalDatasetGenerator(
    theta=theta_arppu,
    outcome_type="continuous",
    sigma_y=sigma_y,
    target_t_rate=target_t_rate,
    seed=SEED,
    confounder_specs=confounder_specs,
    beta_y=beta_y,
    beta_t=beta_t,
)

# Every confounder declared in the schema is passed on, in schema order.
causal_data = gen.to_causal_data(
    n=n,
    confounders=[spec["name"] for spec in confounder_specs],
)

# Show first few rows
causal_data.df.head()


Unnamed: 0,y,t,tenure_months,sessions_per_week,spend_last_30d,premium_user,urban_resident
0,12.406833,1.0,21.656605,6.528831,108.828852,1.0,0.0
1,24.729293,1.0,5.520191,8.69737,282.721931,1.0,0.0
2,21.206321,1.0,27.005414,1.583188,76.56066,0.0,1.0
3,12.788396,1.0,29.286777,5.144106,250.488359,0.0,1.0
4,29.289485,1.0,-5.412422,8.48746,269.056688,0.0,1.0


In [5]:
from causalkit.inference.ate import dml_ate

# Estimate the Average Treatment Effect (ATE) on the dataset generated with the
# 0.9 spend coefficient in beta_t; the ground truth is still theta_arppu = 4.0.
# In the recorded run the estimate is about 6.69 with a 95% CI of roughly
# (6.14, 7.24), i.e. the interval does not cover the true effect.
# NOTE(review): likely a consequence of the near-deterministic treatment
# assignment in this dataset -- confirm before drawing conclusions.
ate_result = dml_ate(causal_data, n_folds=4, confidence_level=0.95)
# Display the result dict.
ate_result

{'coefficient': 6.691277054034963,
 'std_error': 0.2811688205220031,
 'p_value': 0.0,
 'confidence_interval': (6.14019629223623, 7.242357815833695),
 'model': <causalkit.inference.estimators.irm.IRM at 0x153545f90>}

In [6]:
from causalkit.inference.att import dml_att

# Estimate the Average Treatment effect on the Treated (ATT) on the dataset
# generated with the 0.9 spend coefficient in beta_t.
# In the recorded run the estimate is about 5.56 with a 95% CI of roughly
# (5.24, 5.88).
att_result = dml_att(causal_data, n_folds=4, confidence_level=0.95)
# Display the result dict.
att_result

{'coefficient': 5.557809684780822,
 'std_error': 0.16240854670087557,
 'p_value': 0.0,
 'confidence_interval': (5.239494782465615, 5.8761245870960295),
 'model': <causalkit.inference.estimators.irm.IRM at 0x153362c40>}

In [7]:
from causalkit.eda import CausalEDA
# Wrap the current causal_data (the one produced by the most recently executed
# generator cell) in the EDA helper used by the diagnostic cells below.
eda = CausalEDA(causal_data)

# Shape of the data; the recorded run reports 10000 rows and 7 columns
# (y, t and the five confounders).
eda.data_shape()

{'n_rows': 10000, 'n_columns': 7}

In [8]:
# Shows means of confounders for control/treated groups, absolute differences,
# and SMD (standardized mean difference) values.
# In the recorded run spend_last_30d stands out with smd of about 2.93, while
# every other confounder has |smd| below 0.04.
confounders_balance_df = eda.confounders_means()
display(confounders_balance_df)

Unnamed: 0_level_0,mean_t_0,mean_t_1,abs_diff,smd
confounders,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
spend_last_30d,26.81953,177.132999,150.313468,2.927622
urban_resident,0.662323,0.647359,0.014964,-0.031474
premium_user,0.193768,0.198907,0.005139,0.012936
sessions_per_week,6.052819,6.062821,0.010002,0.003322
tenure_months,17.883618,17.875583,0.008035,-0.00067


In [9]:
# Fit the propensity model through the EDA helper; the returned object is
# reused by the positivity and SHAP cells below.
ps_model = eda.fit_propensity()

# ROC AUC - shows how predictable treatment is from confounders.
# The recorded run prints 0.9998, i.e. treatment is almost perfectly
# predictable from the confounders in this dataset.
# NOTE(review): the variable name roc_auc_score matches the scikit-learn
# function of the same name and would shadow it if that function is imported
# elsewhere in the notebook -- consider renaming.
roc_auc_score = ps_model.roc_auc
print("ROC AUC from PropensityModel:", round(roc_auc_score, 4))

ROC AUC from PropensityModel: 0.9998


In [10]:
# Positivity check - assess overlap between treatment groups.
# The recorded result is a dict with 'bounds' (0.05, 0.95), 'share_below'
# (0.1701), 'share_above' (0.8183) and 'flag' (True).
# Presumably the shares are the fractions of units whose estimated propensity
# falls outside the bounds -- TODO confirm against the causalkit docs.
positivity_result = ps_model.positivity_check()
print("Positivity check from PropensityModel:", positivity_result)

Positivity check from PropensityModel: {'bounds': (0.05, 0.95), 'share_below': 0.1701, 'share_above': 0.8183, 'flag': True}


In [11]:
# SHAP values - feature importance for treatment assignment from confounders.
# ps_model.shap is read as an attribute and displayed as a table; in the
# recorded run it has one row per confounder with the columns feature,
# shap_mean, shap_mean_abs, odds_mult_abs, exact_pp_change_abs and
# exact_pp_change_signed, and spend_last_30d has by far the largest
# shap_mean_abs (about 5.30).
shap_values_df = ps_model.shap
display(shap_values_df)

Unnamed: 0,feature,shap_mean,shap_mean_abs,odds_mult_abs,exact_pp_change_abs,exact_pp_change_signed
0,num__spend_last_30d,-0.000424,5.302985,200.935675,0.175716,-6.2e-05
1,num__tenure_months,-0.000442,0.254931,1.290372,0.034106,-6.4e-05
2,num__urban_resident,0.000445,0.178274,1.195153,0.02447,6.5e-05
3,num__sessions_per_week,0.00034,0.177695,1.194461,0.024395,5e-05
4,num__premium_user,8.1e-05,0.094717,1.099348,0.013365,1.2e-05
