# Using CausalKit to estimate the effect of a binary treatment on continuous and binary outcomes

## Continuous outcome

### Dataset 1

In [9]:
# 1. Generate data
import numpy as np
from causalkit.data import CausalData, CausalDatasetGenerator

# Domain setup:
# - outcome_type='continuous' for LTV
# - theta: average causal lift in LTV from receiving the feature
# - target_t_rate: expected adoption/exposure rate of the feature
# - beta_y/beta_t: how confounders influence outcome and treatment (introduces confounding)

gen = CausalDatasetGenerator(
    theta=2.5,                      # average uplift in LTV from receiving the feature
    outcome_type="continuous",
    sigma_y=5.0,                    # noise level for LTV
    target_t_rate=0.1,              # ~10% of users receive the feature
    seed=123,                       # fixed seed so the generated dataset is reproducible

    # Confounders (5 total)
    confounder_specs=[
        {"name": "tenure_days",       "dist": "normal",   "mu": 120, "sd": 60},   # days since install
        {"name": "prior_30d_spend",   "dist": "uniform",  "a": 0.0,  "b": 100.0}, # prior spend in last 30 days
        {"name": "engagement_rate",   "dist": "uniform",  "a": 0.0,  "b": 1.0},   # daily active ratio
        {"name": "premium_user",      "dist": "bernoulli","p": 0.15},             # subscription flag
        {"name": "android",           "dist": "bernoulli","p": 0.55},             # OS flag
    ],

    # Linear effects of confounders
    # Order matches the confounder_specs above
    beta_y=np.array([ 0.02,  0.10,  8.0,  5.0,  0.5]),   # effect on LTV baseline
    beta_t=np.array([ 0.01,  0.02,  1.0,  0.8, -0.3]),   # effect on probability of receiving feature (log-odds)
)

# Create CausalData directly (treatment='t', outcome='y' are set internally)
# Optionally pass the confounder names to keep only a subset; here we include all.
# NOTE(review): the keyword is spelled `cofounders` in this call; keep it in sync
# with the installed CausalKit version if the API is ever renamed.
causal_data = gen.to_causal_data(
    n=50_000,
    cofounders=["tenure_days", "prior_30d_spend", "engagement_rate", "premium_user", "android"]
)

# Example access
causal_data.df.head()

Unnamed: 0,y,t,tenure_days,prior_30d_spend,engagement_rate,premium_user,android
0,8.698079,0.0,60.652719,26.563015,0.757982,0.0,1.0
1,16.124146,1.0,97.932801,65.694021,0.48761,0.0,0.0
2,10.631496,0.0,197.275516,19.183593,0.752212,0.0,0.0
3,7.171113,0.0,131.638465,25.349005,0.144235,0.0,1.0
4,17.188792,0.0,175.213854,34.416637,0.833235,0.0,0.0


In [10]:
# 3. Run EDA - eda.py
from causalkit.eda.eda import CausalEDA

# One EDA helper for every diagnostic below (3 splits, fixed random state).
eda = CausalEDA(causal_data, n_splits=3, random_state=42)

# Reports that take no propensity input.
health = eda.data_health_check()
missing = eda.missingness_report()
summary = eda.summaries()

# fit_propensity() is run once; its result is reused by each diagnostic that follows.
ps = eda.fit_propensity()
auc = eda.treatment_predictability_auc(ps)
positivity = eda.positivity_check(ps)
balance = eda.balance_table(ps, estimand='ATE')
weights = eda.weight_diagnostics(ps, estimand='ATE')

# Headline numbers only; `summary` and `positivity` are displayed in full in later cells.
print('Health:', health)
print('Treatment rate:', summary['treatment_rate'])
print('AUC:', auc)
print('Positivity flag:', positivity['flag'])
print('Balance (head):', balance.head(), sep='\n')



Health: {'constant_columns': [], 'n_duplicates': 0, 'n_rows': 50000}
Treatment rate: 0.10066
AUC: 0.7472880072315301
Positivity flag: True
Balance (head):
         covariate  SMD_unweighted  SMD_weighted  flag_unw  flag_w
0      tenure_days        0.587157     -0.002728      True   False
1  prior_30d_spend        0.540571     -0.017009      True   False
2  engagement_rate        0.294137      0.014214      True   False
3     premium_user        0.294061     -0.003439      True   False
4          android       -0.168108      0.016012      True   False


In [11]:
# Full result of eda.summaries(): the treatment rate, outcome count/mean/std per
# treatment arm, and `naive_diff` (treated mean minus control mean, unadjusted).
summary

{'treatment_rate': np.float64(0.10066),
 'outcome_by_treatment':      count       mean       std
 t                              
 0.0  44967  12.101917  6.523806
 1.0   5033  18.101835  6.471955,
 'naive_diff': np.float64(5.999918135630265)}

In [12]:
# Full result of eda.positivity_check(ps): the bounds, the shares below/above them,
# and the flag printed in the EDA cell.
# NOTE(review): presumably the shares are fractions of units whose propensity falls
# outside the bounds - confirm against the CausalEDA documentation.
positivity

{'bounds': (0.05, 0.95),
 'share_below': 0.3451,
 'share_above': 0.0,
 'flag': True}

In [13]:
# 4. Run inference - inference (DoubleML)
from causalkit.inference.att import dml

# ATT (optional): 3 folds, a single repetition, 95% confidence level.
res_att = dml(
    causal_data,
    n_folds=3,
    n_rep=1,
    confidence_level=0.95,
)
print(
    'ATT:', res_att['coefficient'],
    'CI:', res_att['confidence_interval'],
)


ATT: 2.6140962339787324 CI: (np.float64(2.4530997174335343), np.float64(2.7750927505239305))


In [7]:
# ATE estimate. NOTE: this import rebinds the notebook-level name `dml`, which the
# ATT cell imported from causalkit.inference.att; re-run that cell's import before
# calling the ATT estimator again.
from causalkit.inference.ate import dml

res_ate = dml(causal_data)
# Report the interval that belongs to this estimate (res_ate), not the ATT interval.
print('ATE', res_ate['coefficient'], 'CI:', res_ate['confidence_interval'])



ATE 2.5204425167871167 CI: (2.1412362716623714, 2.8678701593925475)
