In [1]:
!pip install -e ../CauseML

Obtaining file:///home/jovyan/work/CauseML
Installing collected packages: cause-ml
  Found existing installation: cause-ml 0.0.11
    Uninstalling cause-ml-0.0.11:
      Successfully uninstalled cause-ml-0.0.11
  Running setup.py develop for cause-ml
Successfully installed cause-ml


In [2]:
from maccabee.data_generation import DataGeneratingProcess, data_generating_method

In [3]:
from maccabee.parameters import build_parameters_from_axis_levels
from maccabee.constants import Constants
import maccabee.data_sources as data_sources
from maccabee.data_generation import DataGeneratingProcessSampler
from maccabee.modeling.models import LinearRegressionCausalModel
from maccabee.benchmarking import run_sampled_dgp_benchmark, run_concrete_dgp_benchmark
from maccabee.utilities import evaluate_expression



In [4]:
import pandas as pd
import sympy as sp

## Model Demo

In [5]:
# Covariate source: 10 random-normal covariates, 2000 observations.
covar_data_source = data_sources.load_random_normal_covariates(
    n_covars=10,
    n_observations=2000,
)

# Pin both nonlinearity axes to LOW. Axes not listed here are left to
# build_parameters_from_axis_levels (presumably library defaults — confirm).
dgp_params = build_parameters_from_axis_levels(
    {
        Constants.AxisNames.OUTCOME_NONLINEARITY: Constants.AxisLevels.LOW,
        Constants.AxisNames.TREATMENT_NONLINEARITY: Constants.AxisLevels.LOW,
    }
)

# The sampler combines the parameters with the covariate source; analysis_mode
# is switched off for the DGPs it produces.
dgp_sampler = DataGeneratingProcessSampler(
    parameters=dgp_params,
    data_source=covar_data_source,
    dgp_kwargs=dict(analysis_mode=False),
)

# Draw a single DGP from the sampler, then a single dataset from that DGP.
dgp = dgp_sampler.sample_dgp()
dataset = dgp.generate_dataset()

In [6]:
# Display the generated dataset's observed frame (columns X0..X9, T and Y).
dataset.observed_data

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,T,Y
233,0.646652,-0.005477,0.304405,-0.021740,0.419244,-0.060445,0.456098,0.172520,-0.429364,-0.371017,1,-1.189186
1798,-0.241170,0.138598,-0.467572,0.257349,-0.056115,0.150686,0.063894,0.235212,-0.303095,0.229418,0,-0.026098
1987,0.311417,0.048014,0.021688,-0.311008,0.166351,-0.290156,-0.014122,0.559657,0.357604,-0.469322,1,0.182806
522,-0.228109,-0.314905,-0.439178,-0.360470,-0.106045,0.233543,0.574437,-0.227116,-0.297400,0.226301,1,-2.159997
383,-0.188641,-0.469164,0.254269,-0.760608,0.227939,0.292531,-0.212031,0.111299,0.161028,0.084267,0,1.435689
...,...,...,...,...,...,...,...,...,...,...,...,...
1780,-0.111548,-0.345937,0.148628,0.038056,0.083210,0.302151,0.028320,0.056581,-0.060523,-0.314784,1,0.209440
898,-0.407591,-0.829459,-0.108428,0.340119,-0.158264,0.318645,0.613911,-0.367022,0.076102,0.019889,1,-1.610043
1494,-0.289327,-0.291490,-0.203806,-0.087229,-0.156469,-0.108436,0.444104,0.306043,-0.131186,-0.082318,0,-0.457372
964,-0.295636,-0.308108,0.314402,0.146426,0.609600,0.321162,-0.271904,0.292936,-0.214338,0.401297,1,2.630337


In [7]:
# Display the dataset's ATE attribute (presumably the ground-truth average
# treatment effect of the sampled DGP — confirm against the library docs).
dataset.ATE

-0.02499999999999998

In [8]:
# Fit a LinearRegressionCausalModel on the sampled dataset and display its
# estimate for the ATE estimand.
model = LinearRegressionCausalModel(dataset)
model.fit()
model.estimate(estimand=Constants.Model.ATE_ESTIMAND)

-0.04450373023316426

## Sampled DGP Benchmarking Demo

In [9]:
%%time

HIGH, MEDIUM, LOW = Constants.AxisLevels.HIGH, Constants.AxisLevels.MEDIUM, Constants.AxisLevels.LOW
param_grid = dgp_params = {
    Constants.AxisNames.TREATMENT_NONLINEARITY: [MEDIUM, LOW],
    Constants.AxisNames.OUTCOME_NONLINEARITY: [LOW]
#     Constants.AxisNames.TREATMENT_NONLINEARITY: [HIGH, MEDIUM, LOW],
#     Constants.AxisNames.OUTCOME_NONLINEARITY: [HIGH, MEDIUM, LOW]
}

covar_data_source = data_sources.load_random_normal_covariates(
    n_covars=10,
    n_observations=750)

result = run_sampled_dgp_benchmark(
    model_class=LinearRegressionCausalModel,
    estimand=Constants.Model.ATE_ESTIMAND,
    data_source=covar_data_source,
    param_grid=param_grid,
    num_dgp_samples=1,
    num_data_samples_per_dgp=1,
    dgp_kwargs={"analysis_mode": False},
    enable_ray_multiprocessing=False)

CPU times: user 1.69 s, sys: 90 ms, total: 1.78 s
Wall time: 1.71 s


In [10]:
# Tabulate the sampled-DGP benchmark results (one row per parameter combination).
pd.DataFrame(result)

Unnamed: 0,param_outcome_nonlinearity,param_treatment_nonlinearity,absolute mean percentage bias,root mean squared error
0,LOW,MEDIUM,13.066101,0.007317
1,LOW,LOW,16.496678,0.036953


### Concrete DGP Benchmarking Demo

TODO: It would be better to show a very simple DGP example here rather than using GenMatch.

#### GenMatch Synthetic DGP

In [11]:
# Enable the autoreload extension in mode 1: before executing code, reload only
# the modules that have been registered with %aimport.
%load_ext autoreload
%autoreload 1

In [12]:
%aimport maccabee.examples.genmatch

In [13]:
from maccabee.examples.genmatch import GenmatchDataGeneratingProcess, GENMATCH_SPECS
from maccabee.examples.genmatch import LogisticPropensityMatchingCausalModel

In [14]:
# Build the GenMatch DGP from spec "E" with 100 observations in analysis mode,
# then display its treatment logit expression.
# NOTE(review): the variable is named `genmatch_A_dgp` but the spec used is "E".
# The name is kept because the next cell references it — confirm which spec is
# intended and rename both cells together.
genmatch_A_dgp = GenmatchDataGeneratingProcess(
    *GENMATCH_SPECS["E"],
    n_observations=100, analysis_mode=True)
genmatch_A_dgp.treatment_logit_expression

0.4*X_1*X_3 + 0.8*X_1 - 0.25*X_2**2 - 0.175*X_2*X_4 - 0.25*X_2 + 0.6*X_3 - 0.2*X_4*X_5 - 0.4*X_4 - 0.4*X_5*X_6 - 0.8*X_5 - 0.5*X_6 + 0.7*X_7

In [15]:
# Generate one dataset from the GenMatch DGP and display its observed frame
# (columns X_0..X_10, T and Y).
genmatch_dataset = genmatch_A_dgp.generate_dataset()
genmatch_dataset.observed_data

Unnamed: 0,X_0,X_1,X_2,X_3,X_4,X_5,X_6,X_7,X_8,X_9,X_10,T,Y
0,1.0,1.0,-0.232965,1.0,-0.808452,1.0,1.0,-1.562857,0.0,1.0,-0.808036,0,-4.434531
1,1.0,1.0,0.170274,0.0,0.704646,1.0,1.0,1.066438,0.0,1.0,0.772706,1,-4.141324
2,1.0,1.0,1.981547,1.0,-0.332395,0.0,1.0,0.768983,1.0,1.0,0.673765,1,-4.631699
3,1.0,0.0,-0.035995,0.0,0.344226,1.0,1.0,-0.633510,0.0,1.0,-0.874435,0,-4.323240
4,1.0,0.0,0.236985,0.0,0.850624,1.0,1.0,-0.604283,0.0,1.0,0.226504,0,-4.236548
...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,1.0,1.0,-0.564505,1.0,1.097291,0.0,0.0,0.145542,0.0,1.0,0.973414,0,-4.233149
96,1.0,0.0,1.505190,0.0,0.166493,0.0,0.0,-1.662765,0.0,0.0,0.544410,0,-4.283620
97,1.0,0.0,-0.408158,0.0,-0.119618,1.0,1.0,1.660629,0.0,0.0,-1.000997,0,-3.939399
98,1.0,0.0,1.298080,1.0,-0.274254,0.0,0.0,-1.491888,1.0,0.0,0.157328,1,-4.641553


In [16]:
# Display the GenMatch dataset's ATE attribute (presumably the ground-truth
# effect — confirm against the library docs).
genmatch_dataset.ATE

-0.40000000000000013

In [17]:
# Fit the logistic propensity matching model on the GenMatch dataset.
# Note: this rebinds `model`, which previously held the linear regression model.
model = LogisticPropensityMatchingCausalModel(genmatch_dataset)
model.fit()

In [18]:
# Display the fitted model's ATE estimate.
# NOTE(review): the Model Demo calls `estimate(estimand=Constants.Model.ATE_ESTIMAND)`
# instead — presumably equivalent; confirm and use one form consistently.
model.estimate_ATE()

-0.06359618844879382

#### Benchmark

In [39]:
%%time

# Concrete Benchmark
dgp = GenmatchDataGeneratingProcess(
    *GENMATCH_SPECS["E"],
    n_observations=1000, analysis_mode=True)

result = run_concrete_dgp_benchmark(
    dgp=dgp,
    model_class=LogisticPropensityMatchingCausalModel,
    estimand=Constants.Model.ATE_ESTIMAND,
    num_samples_from_dgp=1000,
    enable_ray_multiprocessing=False)

CPU times: user 5min 56s, sys: 6min 18s, total: 12min 14s
Wall time: 2min 4s


In [40]:
# Display the concrete-DGP benchmark metrics (a dict keyed by metric name).
result

{'absolute mean percentage bias': 7.95932280928424,
 'root mean squared error': 0.06958073391560818}