In [1]:
import numpy as np
import pandas as pd
import torch
# Make float32 the default floating-point dtype for new tensors.
# `torch.set_default_tensor_type` is deprecated and emits a UserWarning when called;
# `torch.set_default_dtype(torch.float32)` is the supported way to select the
# dtype that `torch.FloatTensor` stood for.
torch.set_default_dtype(torch.float32)
import copy
import sys
import os

# The current working directory is taken to be the directory holding this notebook.
notebook_dir = os.getcwd()
# Its parent directory is what gets added to the import search path.
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path
# (presumably so the project modules imported below resolve -- confirm the repo layout).
# NOTE: re-running this cell appends the same entry again; harmless but not idempotent.
sys.path.append(parent_dir)


from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *


  _C._set_default_tensor_type(t)


### 1. Simulating the data

In [2]:
# Seeded generator: every covariate draw below goes through `rng`, so the simulated
# covariate pool is identical after a kernel restart.
rng = np.random.RandomState(42)

n_rct_before_split = 10**5  # size of the simulated pool
n_host = 200
sigma_prior = 1
sigma_rand_error = 1
include_intercept = 1  # 0 if no intercept
power_x, power_x_t = 1, 0  # keep them 1 and 0 for BART

# Covariates, drawn from the seeded generator rather than from the global NumPy state.
X0 = rng.beta(12, 3, size=n_rct_before_split)
X1 = rng.normal(loc=4, scale=1, size=n_rct_before_split)
X2 = rng.beta(1, 7, size=n_rct_before_split)
x_distributions = {"X_0": X0, "X_1": X1, "X_2": X2}

# Number of regression columns:
# intercept + covariate terms + treatment indicator + covariate-by-treatment terms.
d = (
    include_intercept
    + len(x_distributions) * (power_x)
    + 1
    + len(x_distributions) * (power_x_t)
)

def p_assigned_to_host(X, T, eps):
    """Return sigmoid(1 + 2*X_0 - X_1 + 2*T + eps) for the host dataset.

    X is indexed by the column names "X_0" and "X_1"; T and eps enter the
    linear score additively.
    """
    score = 1 + 2 * X["X_0"] - X["X_1"] + 2 * T + eps
    return sigmoid(score)


def p_assigned_to_cand2(X, T, eps):
    """Return sigmoid(1 + 2*X_0 - X_1 + 2*T + eps) for the second candidate.

    Same expression as `p_assigned_to_host`; kept as its own function so the
    two mechanisms can be changed independently.
    """
    score = 1 + 2 * X["X_0"] - X["X_1"] + 2 * T + eps
    return sigmoid(score)


# Position of the first treatment-related coefficient:
# it comes after the optional intercept and the covariate main-effect terms.
causal_param_first_index = include_intercept + power_x * len(x_distributions)


def outcome_function(X, T, eps):
    """Simulated outcome.

    y = intercept + 1*X_0 - 1*X_1 + 1*X_2 + 4*T + 2*X_0*T + 2*X_1*T + 0*X_2*T + eps

    The intercept term is `include_intercept` (0 means no intercept). Terms
    without T are non-causal; terms involving T are causal.
    """
    x0, x1, x2 = X["X_0"], X["X_1"], X["X_2"]
    return (
        include_intercept  # intercept, non-causal
        + 1 * x0  # non-causal
        - 1 * x1  # non-causal
        + 1 * x2  # non-causal
        + 4 * T  # causal
        + 2 * x0 * T  # causal
        + 2 * x1 * T  # causal
        + 0 * x2 * T  # causal
        + eps
    )


Simulation parameters

In [3]:
# Prior for the Bayesian update on the host data: one mean entry per regression column.
prior_mean = np.array([0, 1, 0, 0, 0])
assert len(prior_mean) == d, "Shape error"

beta_0 = prior_mean
sigma_0_sq = sigma_rand_error**2
# NOTE(review): this scales the identity by 1 / sigma_prior, not 1 / sigma_prior**2.
# The two coincide for sigma_prior = 1; confirm which is intended before changing sigma_prior.
inv_cov_0 = 1 / sigma_prior * np.eye(prior_mean.shape[0])

# NOTE: the name `prior_hyperparameters` is bound again further down in this notebook.
prior_hyperparameters = dict(
    beta_0=beta_0,
    sigma_0_sq=sigma_0_sq,
    inv_cov_0=inv_cov_0,
)


In [4]:
# Candidate sample sizes to sweep over.
n_both_candidates_list = [200, 500, 1000]
proportion = 1  # n_cand2 = proportion * n_both_candidates_list
std_true_y = 1

# Everything the data generators need, bundled into one dict.
data_parameters = dict(
    n_both_candidates_list=n_both_candidates_list,
    proportion=proportion,
    n_rct_before_split=n_rct_before_split,
    x_distributions=x_distributions,
    p_assigned_to_cand2=p_assigned_to_cand2,
    p_assigned_to_host=p_assigned_to_host,
    n_host=n_host,
    power_x=power_x,
    power_x_t=power_x_t,
    outcome_function=outcome_function,
    std_true_y=std_true_y,
    causal_param_first_index=causal_param_first_index,
)

In [5]:
# Number of repetitions on freshly generated (non-exact) data.
n_seeds = 3

# Free-text annotations for the plots. `text_l1` and `text_l2` are reassigned
# before the plotting cells; `text_l3` is used as written here.
text_l1 = "p_assigned_to_host=p_assigned_to_cand2, n_samples_for_expectation = 30,"
text_l2 = "m_samples_for_expectation = 500, n_host = 200, sigma_prior = 1, sigma_rand_error = 1,"
# Coefficients match `outcome_function` above: 4 on T and 0 on X_2 * T.
text_l3 = "outcome_function = 1 + 1 * X[:,0] - 1 * X[:,1] + 1 * X[:,2] + 4 * T + 2* X[:,0]*T + 2* X[:,1]*T + 0* X[:,2]*T + eps"

In [6]:
# Sample counts for the outer / inner expectations of the nested EIG estimators
# ("obs" and "caus" variants).
n_samples_outer_expectation_obs = n_samples_outer_expectation_caus = 100
n_samples_inner_expectation_obs = n_samples_inner_expectation_caus = 200

sampling_parameters = dict(
    n_samples_inner_expectation_obs=n_samples_inner_expectation_obs,
    n_samples_outer_expectation_obs=n_samples_outer_expectation_obs,
    n_samples_inner_expectation_caus=n_samples_inner_expectation_caus,
    n_samples_outer_expectation_caus=n_samples_outer_expectation_caus,
)

# NOTE: this rebinds `prior_hyperparameters`, replacing the dict built earlier in the notebook.
prior_hyperparameters = dict(sigma_0_sq=1, p_categorical_pr=0, p_categorical_trt=0)
predictive_model_parameters = dict(num_trees_pr=200, num_trees_trt=100)
conditional_model_param = dict(num_trees_pr=200)

In [7]:
# Generate the "exact" datasets for every candidate sample size in `data_parameters`.
exact_data = generate_exact_data_varying_sample_size(
    data_parameters,
    include_intercept=(include_intercept != 0),
)
# Show the host table stored under key 200.
exact_data[200]["host"]

Unnamed: 0,intercept,X_0,X_1,X_2,T,Y
0,1.0,0.684468,3.731272,0.005716,1.0,11.576195
1,1.0,0.726912,2.990434,0.124904,1.0,12.197125
2,1.0,0.842183,2.393865,0.079854,1.0,9.126950
3,1.0,0.971674,4.241098,0.090034,1.0,12.164292
4,1.0,0.850703,2.857284,0.062603,1.0,11.643892
...,...,...,...,...,...,...
195,1.0,0.908877,3.147704,0.413192,0.0,-0.064853
196,1.0,0.878676,4.792592,0.000466,1.0,13.395566
197,1.0,0.752993,2.713725,0.078947,1.0,9.917758
198,1.0,0.815468,4.384909,0.163494,1.0,11.419038


In [None]:

# BART-based EIG estimates on the exact data. The call returns a pair; both
# halves are later passed to `plot_array` as `dict_additional_plots`.
(
    dict_additional_plots_obs_from_samples,
    dict_additional_plots_caus_from_samples,
) = bart_eig_from_samples_varying_sample_size(
    exact_data,
    data_parameters,
    prior_hyperparameters,
    predictive_model_parameters,
    conditional_model_param,
    sampling_parameters,
)

Sampling from Posterior
Getting conditional samples


  self.params["tau_trt"] = 0.1 * np.var(y) / self.params["num_trees_trt"]
100%|██████████| 100/100 [03:28<00:00,  2.09s/it]


Sampling from Posterior
Getting conditional samples


100%|██████████| 100/100 [03:29<00:00,  2.10s/it]


Sampling from Posterior
Getting conditional samples


100%|██████████| 100/100 [03:31<00:00,  2.12s/it]


Sampling from Posterior
Getting conditional samples


100%|██████████| 100/100 [03:28<00:00,  2.09s/it]


In [None]:
# One row per repetition, one column per candidate sample size.
EIG_obs_closed_form_across_seeds, EIG_caus_closed_form_across_seeds = [], []

for seed in range(n_seeds):
    # Seed the global NumPy and torch generators for this repetition so that
    # re-running the cell reproduces the same draws.
    # NOTE(review): this only helps if the helpers below draw from these global
    # generators -- confirm, or pass `seed` to them explicitly if they accept one.
    np.random.seed(seed)
    torch.manual_seed(seed)

    nonexact_data = generate_data_varying_sample_size(
        data_parameters, include_intercept=bool(include_intercept)
    )
    # EIGs[0]: "obs" estimates, EIGs[1]: "caus" estimates.
    EIGs = bart_eig_from_samples_varying_sample_size(
        nonexact_data,
        data_parameters,
        prior_hyperparameters,
        predictive_model_parameters,
        conditional_model_param,
        sampling_parameters,
    )
    EIG_obs_closed_form_across_seeds.append(list(EIGs[0].values()))
    EIG_caus_closed_form_across_seeds.append(list(EIGs[1].values()))


# Stack the per-repetition lists into 2-D arrays for plotting.
EIG_obs_closed_form_across_seeds = np.vstack(EIG_obs_closed_form_across_seeds)
EIG_caus_closed_form_across_seeds = np.vstack(EIG_caus_closed_form_across_seeds)


Sampling from Posterior


In [None]:
# Annotation lines printed on the plots, built from the live parameter values
# so they cannot drift from the configuration cells.
text_l1 = "p_assigned_to_host=p_assigned_to_cand2, approx 0.8 treated in host"
text_l2 = (
    f"n_samples_inner_expectation_obs = {n_samples_inner_expectation_obs}, "
    f"n_samples_outer_expectation_obs = {n_samples_outer_expectation_obs}, "
    f"n_host = {n_host}, "
    # A single value is shown for both sigmas; this assumes sigma_prior == sigma_rand_error.
    f"sigma_prior = sigma_rand_error = {sigma_rand_error}"
)
text_l4 = (
    f"n_host = {n_host}, sigma_prior = sigma_rand_error = {sigma_rand_error}, "
    f"n_samples_inner_expectation_caus = {n_samples_inner_expectation_caus}"
)

In [None]:
# Placeholder; not passed to `plot_array` below.
path_obs_closed_form = ""

# Predictive EIG against candidate sample size: the across-seed array plus the
# exact-data results as additional curves.
plot_array(
    x=n_both_candidates_list,  # sample sizes
    arr=EIG_obs_closed_form_across_seeds,
    axis_names=["Sample size of candidate datasets", "EIG predictive"],
    dict_additional_plots=dict_additional_plots_obs_from_samples,
    text="\n".join((text_l1, text_l2, text_l3)),
    title="EIG predictive complementary - EIG predictive twin",
)

In [None]:
# Placeholder; not passed to `plot_array` below.
path_caus_samples = 0

# Causal EIG against candidate sample size. The axis label and title say
# "causal" because the array plotted here is the causal EIG, not the predictive one.
plot_array(
    x=n_both_candidates_list,  # sample sizes
    arr=EIG_caus_closed_form_across_seeds,
    axis_names=["Sample size of candidate datasets", "EIG causal"],
    dict_additional_plots=dict_additional_plots_caus_from_samples,
    text="\n".join((text_l1, text_l2, text_l3, text_l4)),
    title="EIG causal complementary - EIG causal twin",
)