# Observational data with binary treatment

This notebook demonstrates a complete workflow for observational data with a binary treatment using the `causalkit` library (NumPy is used only to pass coefficient arrays to the data generator).

Steps:
1. Gather data (synthetic observational)
2. Make CausalData object
3. Run EDA
4. Run inference (ATT via DoubleML)
5. Do GATE analysis


In [16]:
# 1. Gather data - generators.py
from causalkit.data import CausalData, CausalDatasetGenerator
import numpy as np

# Simulate an observational study in which four confounders drive both the
# treatment assignment and the outcome. Each coefficient vector has one entry
# per confounder, in the same order as `confounder_specs`.
gen = CausalDatasetGenerator(
    theta=1.5,  # treatment effect (the `cate` column of the preview is 1.5 in every row)
    beta_y=np.array([0.5, -0.4, 0.15, 0.5]),  # effect of confounders on outcome
    beta_t=np.array([0.2, 0.5, -0.3, 0.6]),  # effect of confounders on treatment (observational)
    target_t_rate=0.34,
    outcome_type="continuous",  # "continuous" or "binary"
    sigma_y=1.0,  # noise for continuous outcomes
    seed=42,
    confounder_specs=[
        dict(name="age", dist="normal", mu=45, sd=12),  # numeric
        dict(name="smoker", dist="bernoulli", p=0.25),  # binary
        dict(name="bmi", dist="normal", mu=27, sd=4),  # numeric
        dict(name="urban", dist="bernoulli", p=0.6),  # binary
    ],
)

# Generate 10,000 rows and preview the first few.
df = gen.generate(10_000)


df.head()


Unnamed: 0,y,t,age,smoker,bmi,urban,propensity,mu0,mu1,cate
0,26.895838,0.0,48.656605,1.0,20.651225,0.0,0.797173,27.025986,28.525986,1.5
1,21.437461,0.0,32.520191,0.0,32.502713,1.0,0.004898,21.635502,23.135502,1.5
2,32.789667,1.0,54.005414,1.0,32.180623,1.0,0.396435,31.929801,33.429801,1.5
3,34.69238,0.0,56.286777,0.0,35.945336,1.0,0.168895,34.035189,35.535189,1.5
4,14.699042,0.0,21.587578,0.0,31.046206,0.0,0.000469,15.45072,16.95072,1.5


In [17]:
# 2. Make CausalData object
# Tell the library which column is the outcome, which is the treatment, and
# which columns to treat as confounders.
# NOTE(review): the keyword is spelled `cofounders` (sic); presumably this is
# the library's parameter name at this version -- confirm before renaming.
causal_data = CausalData(
    df=df,
    outcome="y",
    treatment="t",
    cofounders=["age", "smoker", "bmi", "urban"],
)

In [18]:
# 3. Run EDA - eda.py
from causalkit.eda.eda import CausalEDA

eda = CausalEDA(causal_data, n_splits=3, random_state=42)

# Reports that are requested without propensity scores.
health = eda.data_health_check()
missing = eda.missingness_report()
summary = eda.summaries()

# Fit propensity scores once; every diagnostic below takes `ps` as its input,
# so this call has to come first.
ps = eda.fit_propensity()
auc = eda.treatment_predictability_auc(ps)
positivity = eda.positivity_check(ps)
balance = eda.balance_table(ps, estimand="ATE")
weights = eda.weight_diagnostics(ps, estimand="ATE")

# Headline diagnostics. `missing` and `weights` are computed above but are not
# displayed in this cell.
print("Health:", health)
print("Treatment rate:", summary["treatment_rate"])
print("AUC:", auc)
print("Positivity flag:", positivity["flag"])
print("Balance (head):")
print(balance.head())


Health: {'constant_columns': [], 'n_duplicates': 0, 'n_rows': 10000}
Treatment rate: 0.3435
AUC: 0.907425576282855
Positivity flag: True
Balance (head):
  covariate  SMD_unweighted  SMD_weighted  flag_unw  flag_w
0       age        1.506023      0.337686      True    True
2       bmi       -0.631012     -0.156250      True    True
3     urban        0.169672      0.020909      True   False
1    smoker        0.103040      0.008485      True   False


In [14]:
# 4. Run inference - inference (DoubleML)
from causalkit.inference.att import dml


# ATT (optional)
# This cell estimates the ATT only; the result is indexed like a mapping with
# 'coefficient' and 'confidence_interval' entries.
res_att = dml(
    causal_data,
    n_folds=3,
    n_rep=1,
    confidence_level=0.95,
)
print("ATT:", res_att["coefficient"], "CI:", res_att["confidence_interval"])


ATT: 1.805562139331003 CI: (np.float64(1.6975502351676948), np.float64(1.9135740434943114))


In [15]:
# 5. Do GATE analysis - inference
# The function name is spelled `gate_esimand` (sic); it matches the import
# that this notebook executed, so it is kept as-is.
from causalkit.inference import gate_esimand

# NOTE(review): the data was generated with a constant effect (theta=1.5), yet
# the printed group estimates range from about -2.0 to 6.3 -- worth
# investigating before interpreting them as real heterogeneity.
gate_df = gate_esimand(
    causal_data,
    n_groups=5,
    n_folds=3,
    n_rep=1,
    confidence_level=0.95,
)
print(gate_df.head())


   group     n     theta  std_error  p_value  ci_lower  ci_upper
0      0  2000 -2.018809   0.195789      0.0 -2.402547 -1.635070
1      1  2000  0.940914   0.006223      0.0  0.928717  0.953110
2      2  2000  1.828226   0.005560      0.0  1.817329  1.839123
3      3  2000  2.805419   0.007507      0.0  2.790706  2.820132
4      4  2000  6.252275   0.100588      0.0  6.055127  6.449424


Notes:
- The dataset here is synthetic and small for demonstration.
- All steps use only the `causalkit` library interfaces.
- For real data, ensure appropriate feature engineering and diagnostics.
