In [1]:
!pip install -e ../CauseML

Obtaining file:///home/jovyan/work/CauseML
Installing collected packages: cause-ml
  Found existing installation: cause-ml 0.0.11
    Uninstalling cause-ml-0.0.11:
      Successfully uninstalled cause-ml-0.0.11
  Running setup.py develop for cause-ml
Successfully installed cause-ml


In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the editable cause_ml install are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from cause_ml.parameters import build_parameters_from_axis_levels
from cause_ml.constants import Constants
from cause_ml.data_generation import DataGeneratingProcessSampler
import cause_ml.data_sources as data_sources
from cause_ml.benchmarking import run_benchmark

In [5]:
import pandas as pd
from sklearn.model_selection import ParameterGrid
from collections import defaultdict
import ray



## Model Demo

In [8]:
# Covariates from the library's random-normal loader:
# 10 covariates, 1000 observations.
covar_data_source = data_sources.load_random_normal_covariates(
    n_covars=10,
    n_observations=1000,
)

# Request the LOW level on both the outcome and the treatment
# non-linearity axes.
dgp_params = build_parameters_from_axis_levels(
    {
        axis_name: Constants.AxisLevels.LOW
        for axis_name in (
            Constants.AxisNames.OUTCOME_NONLINEARITY,
            Constants.AxisNames.TREATMENT_NONLINEARITY,
        )
    }
)

dgp_sampler = DataGeneratingProcessSampler(
    data_source=covar_data_source,
    parameters=dgp_params,
)

# Draw one data generating process from the sampler, then one dataset from it.
dgp = dgp_sampler.sample_dgp()
dataset = dgp.generate_data()

In [9]:
# ATE attribute of the sampled dataset — presumably the ground-truth effect
# that the model estimate is compared against (TODO confirm).
dataset.ATE

-1.013

In [10]:
# NOTE(review): LinearRegressionCausalModel is not imported in any cell visible
# in this notebook, so this cell presumably relies on leftover kernel state —
# add the import to the imports cell so Restart & Run All works.
# Fit the model on the sampled dataset and display its ATE estimate.
model = LinearRegressionCausalModel(dataset)
model.fit()
model.estimate(estimand=Constants.Model.ATE_ESTIMAND)

-1.0170460400173909

## Benchmarking Demo

In [90]:
# metrics = {
#     "bias": lambda res, dataset: res - dataset.ATE,
#     "squared_error": lambda res, dataset: (res - dataset.ATE)**2
# }

# def run_experiment(data_source, param_grid, num_dgp_samples, num_data_samples_per_dgp):
#     results = defaultdict(list)
    
#     for param_spec in ParameterGrid(param_grid):
#         dgp_params = build_parameters_from_axis_levels(param_spec)
#         dgp_sampler = DataGeneratingProcessSampler(
#             parameters=dgp_params, data_source=covar_data_source)
        
#         n_samples_per_param_spec = num_dgp_samples*num_data_samples_per_dgp
#         param_spec_results = defaultdict(lambda: np.empty(n_samples_per_param_spec))
        
#         for dgp_sample_index in range(num_dgp_samples):
#             # Sample DGP
#             dgp = dgp_sampler.sample_dgp()
            
#             for data_index in range(num_data_samples_per_dgp):
#                 result_id = dgp_sample_index*num_data_samples_per_dgp + data_index
                
#                 # Sample data
#                 dataset = dgp.generate_data()
                
#                 # Apply model
#                 model = LinearRegressionCausalModel(dataset)
#                 model.fit()
#                 ATE_estimate = model.estimate(estimand=Constants.Model.ATE_ESTIMAND)
                
#                 # Score model - bias, squared error
#                 for metric_name, metric_func in metrics.items():
#                     param_spec_results[metric_name][result_id] = metric_func(ATE_estimate, dataset)
        
#         for param_name, param_value in param_spec.items():
#             results[f"param_{param_name.lower()}"].append(param_value)
            
#         for metric, metric_results in param_spec_results.items():
#             results[f"mean_{metric}"].append(np.mean(metric_results))
#             results[f"std_{metric}"].append(np.std(metric_results))
            
#     return results

In [91]:
# HIGH, MEDIUM, LOW = Constants.AxisLevels.HIGH, Constants.AxisLevels.MEDIUM, Constants.AxisLevels.LOW

# param_grid = dgp_params = {
#     Constants.AxisNames.TREATMENT_NONLINEARITY: [MEDIUM, LOW],
#     Constants.AxisNames.OUTCOME_NONLINEARITY: [MEDIUM, LOW]
# }
# covar_data_source = data_sources.load_random_normal_covariates(n_covars = 10, n_observations=1000)
# result = run_experiment(covar_data_source, param_grid, 2, 2)

In [92]:
# pd.DataFrame(result)

Unnamed: 0,param_outcome_nonlinearity,param_treatment_nonlinearity,mean_bias,std_bias,mean_squared_error,std_squared_error
0,HIGH,MEDIUM,0.138335,0.023464,0.019687,0.006492
1,HIGH,LOW,0.074791,0.004001,0.00561,0.000599
2,MEDIUM,MEDIUM,0.148252,0.036955,0.023344,0.010957
3,MEDIUM,LOW,0.036649,0.012624,0.001502,0.000925
4,LOW,MEDIUM,0.099378,0.092306,0.018396,0.018346
5,LOW,LOW,0.001269,0.00362,1.5e-05,9e-06


In [11]:
# Check whether a ray runtime has already been started in this kernel.
ray.is_initialized()

NameError: name 'ray' is not defined

In [9]:
# metrics = {
#     "absolute mean bias": lambda estimate_vals, true_vals: np.abs(
#         np.mean(estimate_vals - true_vals)),
#     "root mean squared error": lambda estimate_vals, true_vals: np.sqrt(
#         np.mean((estimate_vals - true_vals)**2))
# }

# @ray.remote
# def sample_dgp(dgp_sampler):
#     return dgp_sampler.sample_dgp()
        
# @ray.remote
# def sample_data(dgp):
#     # Sample data
#     dataset = dgp.generate_data()
#     return dataset

# # Constants.Model.ATE_ESTIMAND
# @ray.remote
# def fit_and_apply_model(model_class, estimand, dataset):
#     model = model_class(dataset)
#     model.fit()
#     estimate_val = model.estimate(estimand=estimand)
#     true_val = dataset.ground_truth(estimand=estimand)
    
#     return estimate_val, true_val

# def run_experiment(model_class, estimand,
#                    data_source, param_grid,
#                    num_dgp_samples=1,
#                    num_data_samples_per_dgp=1):
    
#     results = defaultdict(list)
    
#     for param_spec in ParameterGrid(param_grid):
#         dgp_params = build_parameters_from_axis_levels(param_spec)
#         dgp_sampler = DataGeneratingProcessSampler(
#             parameters=dgp_params, data_source=covar_data_source)
        
#         async_sample_effect_data = []
#         for _ in range(num_dgp_samples):
#             dgp = sample_dgp.remote(dgp_sampler) 
#             for _ in range(num_data_samples_per_dgp):
#                 dataset = sample_data.remote(dgp)
#                 effect_data = fit_and_apply_model.remote(
#                     model_class, estimand, dataset)
#                 async_sample_effect_data.append(effect_data)
        
#         sample_effect_data = np.array(ray.get(async_sample_effect_data))
#         estimate_vals = sample_effect_data[:, 0]
#         true_vals = sample_effect_data[:, 1]
                
#         for param_name, param_value in param_spec.items():
#             results[f"param_{param_name.lower()}"].append(param_value)
            
#         for metric_name, metric_func in metrics.items():
#             results[metric_name].append(
#                 metric_func(estimate_vals, true_vals))
            
#     return results

In [40]:
# metrics = {
#     "absolute mean bias": lambda estimate_vals, true_vals: np.abs(
#         np.mean(estimate_vals - true_vals)),
#     "root mean squared error": lambda estimate_vals, true_vals: np.sqrt(
#         np.mean((estimate_vals - true_vals)**2))
# }

# def _sample_dgp(dgp_sampler):
#     return dgp_sampler.sample_dgp()
        
# def _sample_data(dgp):
#     # Sample data
#     dataset = dgp.generate_data()
#     return dataset

# def _fit_and_apply_model(model_class, estimand, dataset):
#     model = model_class(dataset)
#     model.fit()
#     estimate_val = model.estimate(estimand=estimand)
#     true_val = dataset.ground_truth(estimand=estimand)
    
#     return estimate_val, true_val

In [47]:
# def run_benchmark(model_class, estimand,
#                    data_source, param_grid,
#                    num_dgp_samples=1,
#                    num_data_samples_per_dgp=1,
#                    enable_ray_multiprocessing=False):
    
#     if enable_ray_multiprocessing:
#         if not ray.is_initialized():
#             ray.init()
            
#         sample_dgp = ray.remote(_sample_dgp).remote
#         sample_data = ray.remote(_sample_data).remote
#         fit_and_apply_model = ray.remote(_fit_and_apply_model).remote
#     else:
#         sample_dgp = _sample_dgp
#         sample_data = _sample_data
#         fit_and_apply_model = _fit_and_apply_model
    
#     results = defaultdict(list)
    
#     for param_spec in ParameterGrid(param_grid):
#         dgp_params = build_parameters_from_axis_levels(param_spec)
#         dgp_sampler = DataGeneratingProcessSampler(
#             parameters=dgp_params, data_source=data_source)
        
#         async_sample_effect_data = []
#         for _ in range(num_dgp_samples):
#             dgp = sample_dgp(dgp_sampler) 
#             for _ in range(num_data_samples_per_dgp):
#                 dataset = sample_data(dgp)
#                 effect_data = fit_and_apply_model(
#                     model_class, estimand, dataset)
#                 async_sample_effect_data.append(effect_data)
        
#         if enable_ray_multiprocessing:
#             sample_effect_data = ray.get(async_sample_effect_data)
#         else:
#             sample_effect_data = async_sample_effect_data
        
#         sample_effect_data = np.array(sample_effect_data)
            
#         estimate_vals = sample_effect_data[:, 0]
#         true_vals = sample_effect_data[:, 1]
                
#         for param_name, param_value in param_spec.items():
#             results[f"param_{param_name.lower()}"].append(param_value)
            
#         for metric_name, metric_func in metrics.items():
#             results[metric_name].append(
#                 metric_func(estimate_vals, true_vals))
            
#     return results

In [13]:
%%time

HIGH, MEDIUM, LOW = Constants.AxisLevels.HIGH, Constants.AxisLevels.MEDIUM, Constants.AxisLevels.LOW
param_grid = dgp_params = {
    Constants.AxisNames.TREATMENT_NONLINEARITY: [HIGH, MEDIUM, LOW],
    Constants.AxisNames.OUTCOME_NONLINEARITY: [HIGH, MEDIUM, LOW]
}

covar_data_source = data_sources.load_random_normal_covariates(
    n_covars=10,
    n_observations=500)

result = run_benchmark(
    model_class=LinearRegressionCausalModel,
    estimand=Constants.Model.ATE_ESTIMAND,
    data_source=covar_data_source,
    param_grid=param_grid,
    num_dgp_samples=1,
    num_data_samples_per_dgp=1,
    enable_ray_multiprocessing=True)

NameError: name 'build_parameters_from_axis_levels' is not defined

In [52]:
# Display the benchmark results as a table.
pd.DataFrame(result)

Unnamed: 0,param_outcome_nonlinearity,param_treatment_nonlinearity,absolute mean bias,root mean squared error
0,HIGH,HIGH,0.275561,0.323282
1,HIGH,MEDIUM,0.232124,0.251298
2,HIGH,LOW,0.127784,0.173943
3,MEDIUM,HIGH,0.164795,0.192024
4,MEDIUM,MEDIUM,0.205397,0.260447
5,MEDIUM,LOW,0.075545,0.151337
6,LOW,HIGH,0.139639,0.176076
7,LOW,MEDIUM,0.111783,0.193916
8,LOW,LOW,0.001163,0.032452
