In [2]:
!pip install -e ../Maccabee

Obtaining file:///home/jovyan/work/Maccabee
Installing collected packages: maccabee
  Found existing installation: maccabee 0.0.12
    Uninstalling maccabee-0.0.12:
      Successfully uninstalled maccabee-0.0.12
  Running setup.py develop for maccabee
Successfully installed maccabee


In [1]:
from maccabee.data_generation import DataGeneratingProcess, data_generating_method

In [2]:
from maccabee.parameters import build_parameters_from_axis_levels
from maccabee.constants import Constants
import maccabee.data_sources as data_sources
from maccabee.data_generation import DataGeneratingProcessSampler
from maccabee.modeling.models import LinearRegressionCausalModel
from maccabee.benchmarking import run_sampled_dgp_benchmark, run_concrete_dgp_benchmark
from maccabee.utilities import evaluate_expression



In [3]:
import pandas as pd
import sympy as sp

## Sampled DGP

In [4]:
# Covariate source: random-normal data with 10 covariates and 2000 observations.
covar_data_source = data_sources.build_random_normal_datasource(
    n_covars=10,
    n_observations=2000,
)

# Pin both nonlinearity axes to their LOW level and turn that choice into
# sampling parameters.
axis_levels = {
    Constants.AxisNames.OUTCOME_NONLINEARITY: Constants.AxisLevels.LOW,
    Constants.AxisNames.TREATMENT_NONLINEARITY: Constants.AxisLevels.LOW,
}
dgp_params = build_parameters_from_axis_levels(axis_levels)

# The sampler combines the parameters with the covariate source;
# data_analysis_mode is switched off for every DGP it creates.
dgp_sampler = DataGeneratingProcessSampler(
    parameters=dgp_params,
    data_source=covar_data_source,
    dgp_kwargs={"data_analysis_mode": False},
)

# Draw one DGP from the sampler, then one dataset from that DGP.
dgp = dgp_sampler.sample_dgp()
dataset = dgp.generate_dataset()

In [5]:
# Show the observed data of the sampled dataset (the recorded output has the
# covariate columns X0-X9 followed by the T and Y columns).
dataset.observed_data

Unnamed: 0,X0,X1,X2,X3,X4,X5,X6,X7,X8,X9,T,Y
600,-0.405318,0.100812,0.579347,0.212204,0.548969,-0.056869,-0.514997,0.219140,-0.253726,-0.762086,1,1.745364
1066,0.586723,0.000387,-0.005994,0.061997,-0.397050,0.254824,0.331157,0.054626,0.228711,0.396712,0,-0.487985
1802,-0.105629,-0.709602,0.363026,0.025431,0.094169,0.404510,0.409021,-0.200269,0.388250,-0.379195,0,0.650681
1633,-0.245857,0.484996,0.219001,-0.379394,-0.021773,-0.109740,-0.244159,0.016285,-0.435720,0.309933,0,-0.896683
1495,0.309470,0.244898,-0.514285,-0.451192,-0.398230,0.171943,0.107324,-0.192402,-0.414204,-0.027093,1,0.603394
...,...,...,...,...,...,...,...,...,...,...,...,...
483,0.446754,0.051512,0.215045,0.058827,-0.006468,0.174854,-0.019864,0.217381,0.088307,-0.211365,1,1.532349
77,0.130201,0.178595,0.152411,-0.045492,0.007897,0.125439,0.048031,0.047398,0.015854,-0.148821,0,0.087759
896,-0.275717,0.160019,0.140193,-0.125477,0.353370,0.028176,-0.271221,0.234343,-0.121098,-0.380013,0,1.004778
113,0.017979,-0.121396,0.136636,-0.101744,0.229127,0.336448,0.047269,0.356059,0.082027,-0.182870,1,2.180456


In [6]:
# ATE value stored on the sampled dataset; the next cell produces a model
# estimate to compare against it.
dataset.ATE

0.9139999999999999

In [7]:
# Fit a linear-regression causal model on the sampled dataset and request its
# estimate for the ATE estimand.
model = LinearRegressionCausalModel(dataset)
model.fit()
model.estimate(estimand=Constants.Model.ATE_ESTIMAND)

0.9227952594822544

## Sampled DGP Benchmarking Demo

In [5]:
# The available axis levels; the benchmark cell below unpacks this value as
# LOW, MEDIUM, HIGH.
Constants.AxisLevels.LEVELS

('LOW', 'MEDIUM', 'HIGH')

In [17]:
%%time

# Unpack the level names in the order the LEVELS tuple provides them.
LOW, MEDIUM, HIGH = Constants.AxisLevels.LEVELS
# Grid: treatment nonlinearity is held at LOW while outcome nonlinearity is
# listed at HIGH, MEDIUM and LOW.
param_grid = {
    Constants.AxisNames.TREATMENT_NONLINEARITY: [LOW],
    Constants.AxisNames.OUTCOME_NONLINEARITY: [HIGH, MEDIUM, LOW]
}

# Rebinds covar_data_source (the random-normal source from the earlier cell)
# to the Lalonde data source.
covar_data_source = data_sources.build_lalonde_datasource()

# NOTE(review): presumably 15 DGPs are sampled per grid entry and 50 datasets
# are drawn from each - confirm against the run_sampled_dgp_benchmark docs.
# Only avg_result is used later in this notebook; rr is not read again here.
avg_result, rr = run_sampled_dgp_benchmark(
    model_class=LinearRegressionCausalModel,
    estimand=Constants.Model.ATE_ESTIMAND,
    # The generator hands back the same, already-built data source each call.
    data_source_generator=lambda: covar_data_source,
    param_grid=param_grid,
    num_dgp_samples=15,
    num_data_samples_per_dgp=50,
    dgp_kwargs={"data_analysis_mode": False},
    enable_ray_multiprocessing=False)

CPU times: user 1min 23s, sys: 19.1 s, total: 1min 42s
Wall time: 1min 5s


In [18]:
# Tabulate the averaged benchmark results, leaving out the two
# absolute-mean-bias columns.
pd.DataFrame(avg_result).drop(
    columns=["absolute mean bias %", "absolute mean bias % (std)"])

Unnamed: 0,param_outcome_nonlinearity,param_treatment_nonlinearity,root mean squared error,root mean squared error (std)
0,HIGH,LOW,0.090013,0.046491
1,MEDIUM,LOW,0.065801,0.079909
2,LOW,LOW,0.012933,0.00127


## Concrete DGP Benchmarking Demo

#### Concrete DGP

In [89]:
from maccabee.data_generation import DataGeneratingProcess, data_generating_method
from maccabee.constants import Constants
from maccabee.utilities import evaluate_expression
from maccabee.modeling.models import CausalModel
import numpy as np
import sympy as sp
import pandas as pd

class CustomConcreteDataGeneratingProcess(DataGeneratingProcess):
    """Hand-specified DGP over three covariates named A, B and C.

    Every component is fixed explicitly rather than sampled:

    * covariates: draws from a normal distribution with mean 0 and scale 1;
    * propensity score: ``1 / (1 + exp(-(A + B)))``;
    * outcome without treatment: ``4 * C``;
    * treatment effect: the constant ``2``;
    * outcome noise: draws from a normal distribution with mean 0 and
      scale 100.

    Each generator below is registered with ``data_generating_method``,
    passing the name of a DGP variable and a list of the DGP variables it
    reads from ``input_vars``.
    """

    def __init__(self, n_observations):
        """Set up the covariate symbols and the symbolic functions.

        Args:
            n_observations: forwarded to the base class; also used by the
                generators as the number of rows/samples to draw.
        """
        super().__init__(n_observations, data_analysis_mode=False)

        # The covariate names serve both as DataFrame column names and as
        # the names of the sympy symbols used in the expressions below.
        self.covar_names = ["A", "B", "C"]
        self.n_vars = len(self.covar_names)
        self.A, self.B, self.C = sp.symbols(self.covar_names)

        # Logistic function of A + B.
        linear_index = self.A + self.B
        self.treatment_assignment_function = 1 / (1 + sp.exp(-1 * linear_index))

        # Untreated outcome depends on C only.
        self.base_outcome_function = 4 * self.C

    @data_generating_method(Constants.DGPVariables.COVARIATES_NAME, [])
    def _generate_observed_covars(self, input_vars):
        """Return a DataFrame of normal draws, one column per covariate name."""
        sample_shape = (self.n_observations, self.n_vars)
        draws = np.random.normal(loc=0.0, scale=1.0, size=sample_shape)

        return pd.DataFrame(draws, columns=self.covar_names)

    @data_generating_method(Constants.DGPVariables.PROPENSITY_SCORE_NAME,
                            [Constants.DGPVariables.COVARIATES_NAME])
    def _generate_true_propensity_scores(self, input_vars):
        """Evaluate the treatment assignment function on the covariates."""
        covariates = input_vars[Constants.DGPVariables.COVARIATES_NAME]

        return evaluate_expression(
            self.treatment_assignment_function, covariates)

    @data_generating_method(
        Constants.DGPVariables.POTENTIAL_OUTCOME_WITHOUT_TREATMENT_NAME,
        [Constants.DGPVariables.COVARIATES_NAME])
    def _generate_outcomes_without_treatment(self, input_vars):
        """Evaluate the base outcome function on the covariates."""
        covariates = input_vars[Constants.DGPVariables.COVARIATES_NAME]

        return evaluate_expression(self.base_outcome_function, covariates)

    @data_generating_method(Constants.DGPVariables.OUTCOME_NOISE_NAME, [])
    def _generate_outcome_noise_samples(self, input_vars):
        """Draw one noise value per observation (mean 0, scale 100).

        NOTE(review): the rows recorded in this notebook show Y equal to
        4*C + 2*T with no visible noise - confirm whether these samples
        actually enter the observed outcome.
        """
        return np.random.normal(loc=0, scale=100, size=self.n_observations)

    @data_generating_method(
        Constants.DGPVariables.TREATMENT_EFFECT_NAME,
        [Constants.DGPVariables.COVARIATES_NAME])
    def _generate_treatment_effects(self, input_vars):
        """Return the constant treatment effect; ``input_vars`` is not read."""
        return 2

In [90]:
# Instantiate the hand-written DGP with 100 observations, generate one dataset
# (this rebinds `dataset` from the sampled-DGP section) and preview it.
concrete_dgp = CustomConcreteDataGeneratingProcess(n_observations=100)
dataset = concrete_dgp.generate_dataset()
dataset.observed_data.head()

Unnamed: 0,A,B,C,T,Y
0,0.800261,-0.040398,-0.217283,1,1.13087
1,-2.850955,1.476802,0.944254,0,3.777016
2,0.304231,0.561959,-0.813668,1,-1.254671
3,-1.359757,0.076948,-1.380953,0,-5.523811
4,0.539034,0.207883,-0.98907,1,-1.956281


In [91]:
# from maccabee.examples.genmatch import GenmatchDataGeneratingProcess, GENMATCH_SPECS
# from maccabee.examples.genmatch import LogisticPropensityMatchingCausalModel

In [92]:
# ATE stored on the concrete dataset; the recorded value is 2.0, the constant
# returned by _generate_treatment_effects.
dataset.ATE

2.0

In [93]:
# Fit a linear-regression causal model on the concrete dataset (rebinds `model`).
model = LinearRegressionCausalModel(dataset)
model.fit()

In [94]:
# ATE estimate from the fitted model, to compare against dataset.ATE above.
model.estimate_ATE()

1.999999999999999

#### Benchmark

In [95]:
%%time

# Benchmark the linear-regression model against the concrete DGP instance
# created above, on the ATE estimand.
# NOTE(review): presumably num_samples_from_dgp is the number of datasets drawn
# from the DGP - confirm against the run_concrete_dgp_benchmark docs.
result = run_concrete_dgp_benchmark(
    dgp=concrete_dgp,
    model_class=LinearRegressionCausalModel,
    estimand=Constants.Model.ATE_ESTIMAND,
    num_samples_from_dgp=100,
    enable_ray_multiprocessing=False)

CPU times: user 1.37 s, sys: 10 ms, total: 1.38 s
Wall time: 1.37 s


In [99]:
# Look up the root-mean-squared-error entry of the concrete benchmark result.
result["root mean squared error"]

1.9625527686514803e-15