In [1]:
# Hidden Config Cell

#!python -m pip install -e ../../../../Maccabee > /dev/null

In [2]:
from maccabee.data_generation import ConcreteDataGeneratingProcess, data_generating_method
from maccabee.constants import Constants
from maccabee.data_generation.utils import evaluate_expression
import numpy as np
import sympy as sp
import pandas as pd

DGPVariables = Constants.DGPVariables

class CustomConcreteDataGeneratingProcess(ConcreteDataGeneratingProcess):
    """Concrete DGP over three covariates (A, B, C) with a constant treatment effect.

    * Covariates are drawn i.i.d. from a standard normal distribution.
    * The propensity score is a logistic function of ``5*A - 7*B``.
    * The untreated outcome is ``6*C``; outcome noise is normal with
      standard deviation 0.25.
    * The treatment effect is the constant 2 for every observation.
    """

    def __init__(self, n_observations):
        """Set up the covariate symbols and the symbolic DGP functions.

        Args:
            n_observations: number of rows generated per dataset; forwarded
                to the parent class, which is expected to expose it as
                ``self.n_observations``.
        """
        super().__init__(n_observations, data_analysis_mode=False)

        # The covariate names are used both as DataFrame column labels and
        # as the names of the sympy symbols in the functions below.
        self.covar_names = ["A", "B", "C"]
        self.n_vars = len(self.covar_names)
        self.A, self.B, self.C = sp.symbols(self.covar_names)

        # Treatment assignment: logistic (sigmoid) of a logit that is
        # linear in A and B.
        linear_logit = 5*self.A + -7*self.B
        self.treatment_assignment_function = 1/(1 + sp.exp(-1*linear_logit))

        # Untreated potential outcome: linear in C only.
        self.base_outcome_function = 6*self.C

    @data_generating_method(DGPVariables.COVARIATES_NAME, [])
    def _generate_observed_covars(self, input_vars):
        """Draw an (n_observations x n_vars) frame of standard-normal covariates."""
        sample_shape = (self.n_observations, self.n_vars)
        draws = np.random.normal(loc=0.0, scale=1.0, size=sample_shape)
        return pd.DataFrame(draws, columns=self.covar_names)

    @data_generating_method(
        DGPVariables.PROPENSITY_SCORE_NAME,
        [DGPVariables.COVARIATES_NAME])
    def _generate_true_propensity_scores(self, input_vars):
        """Evaluate the treatment assignment function on the covariate data."""
        covariates = input_vars[DGPVariables.COVARIATES_NAME]
        return evaluate_expression(
            self.treatment_assignment_function, covariates)

    @data_generating_method(
        DGPVariables.POTENTIAL_OUTCOME_WITHOUT_TREATMENT_NAME,
        [DGPVariables.COVARIATES_NAME])
    def _generate_outcomes_without_treatment(self, input_vars):
        """Evaluate the base (untreated) outcome function on the covariate data."""
        covariates = input_vars[DGPVariables.COVARIATES_NAME]
        return evaluate_expression(self.base_outcome_function, covariates)

    @data_generating_method(DGPVariables.OUTCOME_NOISE_NAME, [])
    def _generate_outcome_noise_samples(self, input_vars):
        """Draw one N(0, 0.25) noise value per observation."""
        return np.random.normal(loc=0, scale=0.25, size=self.n_observations)

    @data_generating_method(
        DGPVariables.TREATMENT_EFFECT_NAME,
        [DGPVariables.COVARIATES_NAME])
    def _generate_treatment_effects(self, input_vars):
        """Return the constant treatment effect; the covariates are not used."""
        return 2

In [11]:
# Instantiate the custom DGP with 100 observations per dataset, then
# generate a single dataset from it. Both names are reused by later cells.
concrete_dgp = CustomConcreteDataGeneratingProcess(n_observations=100)
dataset = concrete_dgp.generate_dataset()

In [12]:
# Preview the first rows of the observed data: covariates A, B, C plus the
# treatment indicator (T) and outcome (Y) columns shown in the output below.
dataset.observed_data.head()

Unnamed: 0,A,B,C,T,Y
0,-1.616131,-0.76465,-0.191657,0,-0.937014
1,-0.577143,-0.209127,-0.554967,1,-0.993029
2,-0.247293,1.054256,-0.307911,0,-1.812844
3,-0.645216,0.106769,-1.083359,0,-6.651235
4,0.915464,0.672721,0.082086,0,0.801945


In [15]:
# Ground-truth average treatment effect (ATE) exposed by the dataset.
# The DGP's treatment effect is the constant 2, so 2.0 is expected here.
dataset.ATE 

2.0

In [16]:
from maccabee.modeling.models import LinearRegressionCausalModel

# Build a linear-regression causal model on the generated dataset and fit it.
model = LinearRegressionCausalModel(dataset)
model.fit()

# The model's ATE estimate, to be compared against the ground-truth
# dataset.ATE shown in the previous cell.
model.estimate_ATE() 

1.9379849734528494

In [17]:
from maccabee.benchmarking import benchmark_model_using_concrete_dgp

# Benchmark the linear-regression model's ATE estimate against the custom DGP.
# The call returns four values; only the aggregated and raw results are kept.
# NOTE(review): presumably 10 sampling runs, each drawing 250 datasets from
# the DGP — confirm the exact meaning of both counts in the Maccabee docs.
aggregated_results, raw_results, _, _ = benchmark_model_using_concrete_dgp(
  dgp=concrete_dgp,
  model_class=LinearRegressionCausalModel,
  estimand=Constants.Model.ATE_ESTIMAND,
  num_sampling_runs_per_dgp=10,
  num_samples_from_dgp=250)

In [20]:
# Show the aggregated RMSE and AMBP metrics from the benchmark.
# NOTE(review): AMBP is presumably "absolute mean bias percentage" — confirm
# the definition and units against the Maccabee metrics documentation.
aggregated_results["RMSE"], aggregated_results["AMBP"] 

(0.083, 0.229)