# Datasets benchmarks

In this notebook we well benchmark data generators processes

In [1]:

import numpy as np
import pandas as pd

# If available in your environment:
from causalkit.data import CausalDatasetGenerator

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Number of observations
n = 10000

# Confounders schema
# These are illustrative product/behavior/user attributes
confounder_specs = [
    {"name": "tenure_months", "dist": "normal", "mu": 18, "sd": 12},  # app tenure
    {"name": "sessions_per_week", "dist": "normal", "mu": 6, "sd": 3},  # engagement
    {"name": "spend_last_30d", "dist": "uniform", "a": 0, "b": 300},  # recent spend
    {"name": "premium_user", "dist": "bernoulli", "p": 0.2},  # subscription
    {"name": "urban_resident", "dist": "bernoulli", "p": 0.65},  # geography
]

# True causal effect on ARPPU (on the natural/mean scale)
theta_arppu = 4.0

# Outcome noise (ARPPU is positive and often heavy-tailed; this is a simple Gaussian noise)
sigma_y = 8.0

# Target share of users exposed to the feature
target_t_rate = 0.4

# How confounders affect ARPPU (baseline; additive)
# Order must match confounder_specs
beta_y = np.array([
    0.08,  # + per tenure month
    0.60,  # + per weekly session
    0.03,  # + per recent spend unit (scaled here)
    6.00,  # premium users have higher ARPPU
    2.00,  # urban residents slightly higher ARPPU
], dtype=float)

# How confounders affect feature exposure (log-odds scale)
beta_t = np.array([
    0.02,  # tenure increases likelihood of exposure
    0.15,  # more sessions -> more likely to get feature
    0.004,  # recent spend -> more likely to get feature
    1.00,  # premium users prioritized
    0.35,  # urban residents slightly more likely
], dtype=float)

gen = CausalDatasetGenerator(
    theta=theta_arppu,
    outcome_type="continuous",
    sigma_y=sigma_y,
    target_t_rate=target_t_rate,
    seed=SEED,
    confounder_specs=confounder_specs,
    beta_y=beta_y,
    beta_t=beta_t,
)

# Create dataset
causal_data = gen.to_causal_data(
    n=n,
    confounders = [
    "tenure_months",
    "sessions_per_week",
    "spend_last_30d",
    "premium_user",
    "urban_resident",
]
)

# Show first few rows
causal_data.df.head()


TypeError: CausalDatasetGenerator.__init__() got an unexpected keyword argument 'target_t_rate'. Did you mean 'target_d_rate'?

In [2]:
from causalkit.inference.ate import dml_ate

# Estimate Average Treatment Effect (ATE)
ate_result = dml_ate(causal_data, n_folds=4, confidence_level=0.95)
ate_result

{'coefficient': 3.959355952103991,
 'std_error': 0.2232106706596863,
 'p_value': 0.0,
 'confidence_interval': (3.521871076645975, 4.3968408275620074),
 'model': <causalkit.inference.estimators.irm.IRM at 0x149939010>}

In [None]:
from causalkit.inference.atte import dml_atte

# Estimate Average Treatment Effect (ATE)
att_result = dml_atte(causal_data, n_folds=4, confidence_level=0.95)
att_result

In [None]:

import numpy as np
import pandas as pd

# If available in your environment:
from causalkit.data import CausalDatasetGenerator

# Reproducibility
SEED = 42
np.random.seed(SEED)

# Number of observations
n = 10000

# Confounders schema
# These are illustrative product/behavior/user attributes
confounder_specs = [
    {"name": "tenure_months", "dist": "normal", "mu": 18, "sd": 12},  # app tenure
    {"name": "sessions_per_week", "dist": "normal", "mu": 6, "sd": 3},  # engagement
    {"name": "spend_last_30d", "dist": "uniform", "a": 0, "b": 300},  # recent spend
    {"name": "premium_user", "dist": "bernoulli", "p": 0.2},  # subscription
    {"name": "urban_resident", "dist": "bernoulli", "p": 0.65},  # geography
]

# True causal effect on ARPPU (on the natural/mean scale)
theta_arppu = 4.0

# Outcome noise (ARPPU is positive and often heavy-tailed; this is a simple Gaussian noise)
sigma_y = 8.0

# Target share of users exposed to the feature
target_t_rate = 0.4

# How confounders affect ARPPU (baseline; additive)
# Order must match confounder_specs
beta_y = np.array([
    0.08,  # + per tenure month
    0.60,  # + per weekly session
    0.03,  # + per recent spend unit (scaled here)
    6.00,  # premium users have higher ARPPU
    2.00,  # urban residents slightly higher ARPPU
], dtype=float)

# How confounders affect feature exposure (log-odds scale)
beta_t = np.array([
    0.02,  # tenure increases likelihood of exposure
    0.15,  # more sessions -> more likely to get feature
    0.9,  # recent spend -> more likely to get feature
    1.00,  # premium users prioritized
    0.35,  # urban residents slightly more likely
], dtype=float)

gen = CausalDatasetGenerator(
    theta=theta_arppu,
    outcome_type="continuous",
    sigma_y=sigma_y,
    target_t_rate=target_t_rate,
    seed=SEED,
    confounder_specs=confounder_specs,
    beta_y=beta_y,
    beta_t=beta_t,
)

# Create dataset
causal_data = gen.to_causal_data(
    n=n,
    confounders = [
    "tenure_months",
    "sessions_per_week",
    "spend_last_30d",
    "premium_user",
    "urban_resident",
]
)

# Show first few rows
causal_data.df.head()


In [None]:
from causalkit.inference.ate import dml_ate

# Estimate Average Treatment Effect (ATE)
ate_result = dml_ate(causal_data, n_folds=4, confidence_level=0.95)
ate_result

In [None]:
from causalkit.inference.atte import dml_atte

# Estimate Average Treatment Effect (ATE)
att_result = dml_atte(causal_data, n_folds=4, confidence_level=0.95)
att_result

In [None]:
from causalkit.eda import CausalEDA
eda = CausalEDA(causal_data)

# shape of data
eda.data_shape()

In [None]:
# Shows means of confounders for control/treated groups, absolute differences, and SMD values
confounders_balance_df = eda.confounders_means()
display(confounders_balance_df)

In [None]:
# Propensity model fit
ps_model = eda.fit_propensity()

# ROC AUC - shows how predictable treatment is from confounders
roc_auc_score = ps_model.roc_auc
print("ROC AUC from PropensityModel:", round(roc_auc_score, 4))

In [None]:
# Positivity check - assess overlap between treatment groups
positivity_result = ps_model.positivity_check()
print("Positivity check from PropensityModel:", positivity_result)

In [None]:
# SHAP values - feature importance for treatment assignment from confounders
shap_values_df = ps_model.shap
display(shap_values_df)