In [1]:
import copy
import os
import sys

import numpy as np
import pandas as pd
import torch

# torch.set_default_tensor_type() is deprecated and warns on every run;
# set_default_dtype(float32) is the replacement for the FloatTensor default.
torch.set_default_dtype(torch.float32)

# Make the notebook's parent directory importable.
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path (guarded so that re-running
# this cell does not append the same entry again).
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *
import uci_dataset as dataset

  _C._set_default_tensor_type(t)


### 1. Simulating the data

In [2]:
# Legacy NumPy random generator with a fixed seed.
rng = np.random.RandomState(seed=42)

# Dataset sizes.
n_rct_before_split = 10_000
n_host = 200

# Scale parameters.
sigma_prior = 1
sigma_rand_error = 1

# Keep power_x = 1 and power_x_t = 0 for BART.
power_x = 1
power_x_t = 0

include_intercept = 1  # 0 if no intercept
std_true_y = 1  # Standard deviation for the true Y


In [3]:
# Load the abalone table through the `dataset` module imported above.
abalone = dataset.load_abalone()

# Encode Sex as 0/1. Any value other than 'M' or 'F' maps to NaN, and rows
# containing NaN are removed on the next line.
abalone['Sex'] = abalone['Sex'].map({'M': 0, 'F': 1})
abalone = abalone.dropna()

# Depends only on the number of columns, so it is unaffected by resampling.
# presumably the position of the first causal coefficient — verify against callers
causal_param_first_index = power_x * np.shape(abalone)[1] + include_intercept

# Draw n_rct_before_split rows with replacement in a single call. The previous
# version concatenated 5000 full bootstrap copies and kept only the first
# n_rct_before_split rows, which is the same distribution at a far higher
# memory cost. Sampling through the seeded `rng` makes the draw reproducible.
abalone = abalone.sample(
    n=n_rct_before_split, replace=True, random_state=rng
).reset_index(drop=True)


In [4]:
initial_x_dim = np.shape(abalone)[1]
initial_n_entire_data = np.shape(abalone)[0]

# Column name -> array of the values observed for that column.
x_distributions = {column: abalone[column].values for column in abalone.columns}

# simulate T: one 0/1 draw per row, taken from the seeded `rng` (instead of
# the unseeded global NumPy state) so that re-runs give the same assignment.
T_rct = rng.randint(2, size=initial_n_entire_data)

# TODO change this
# Both assignment probabilities are currently the constant 0.5; the intended
# covariate-dependent form is kept in the trailing comment.
p_assigned_to_host = lambda X, T, eps: 0.5 #sigmoid(1 + 20*X['Sex'] - X['Weight.viscera'] + 12*np.sqrt(X['Diameter']) + 12*X['Weight.shell'] + 30*T + eps)
p_assigned_to_cand2 = lambda X, T, eps: 0.5 #sigmoid(1 + 20*X['Sex'] - X['Weight.viscera'] + 12*np.sqrt(X['Diameter']) + 12* X['Weight.shell'] + 30*T + eps)

# presumably the number of regression coefficients — TODO confirm
d = include_intercept + initial_x_dim*(power_x) + 1 + len(x_distributions)*(power_x_t)

# Outcome as a function of covariates X, treatment T and noise eps.
# NOTE(review): 'Weight.shucked'*T appears twice (coefficients 2 and 0); the
# second term contributes nothing — confirm whether another column was meant.
outcome_function = lambda X, T, eps: (
    include_intercept
    + 1 * X['Sex']
    - 1 * X['Weight.viscera']
    + np.log(X['Weight.whole'])
    - X['Height']
    + 4 * T
    + 2 * X['Weight.shucked'] * T
    + 24 * X['Weight.shell'] * T
    + 0 * X['Weight.shucked'] * T
    + eps
)

# TODO fix this
# if include_intercept:
#     true_params = np.array([1, 1, -1, 1, 4, 2, 2, 0])  # intercept, non-causal => 0
# else:
#     true_params = np.array([1, -1, 1, 4, 2, 2, 0])  # copied from above

Simulation parameters

In [5]:
# Prior parameters for Bayesian update on host
# (the same assignments used to be written out twice in this cell; one copy
# is kept, the resulting values are unchanged).
# NOTE(review): `d` already includes `include_intercept`, so the intercept
# looks counted twice in this length — confirm this is intended.
prior_mean = torch.zeros(d + include_intercept)

beta_0, sigma_0_sq, inv_cov_0 = (
    prior_mean,
    sigma_rand_error**2,
    1 / sigma_prior * np.eye(len(prior_mean)),
)
prior_hyperparameters = {
    "beta_0": beta_0,
    "sigma_0_sq": sigma_0_sq,
    "inv_cov_0": inv_cov_0,
}


In [6]:
# Candidate sample sizes to sweep over.
n_both_candidates_list = [200, 500, 1000]
proportion = 1  # n_cand2 = proportion * n_both_candidates_list
std_true_y = 1

# Collect the data-generation settings under one dict, keyed by the same
# names as the variables they come from.
data_parameters = dict(
    n_both_candidates_list=n_both_candidates_list,
    proportion=proportion,
    n_rct_before_split=n_rct_before_split,
    x_distributions=x_distributions,
    p_assigned_to_cand2=p_assigned_to_cand2,
    p_assigned_to_host=p_assigned_to_host,
    n_host=n_host,
    power_x=power_x,
    power_x_t=power_x_t,
    outcome_function=outcome_function,
    std_true_y=std_true_y,
    causal_param_first_index=causal_param_first_index,
)

### 2. EIG from samples for varying sample sizes

In [10]:
# Sample counts for the outer / inner expectations of the "obs" and "caus"
# EIG estimates.
n_samples_outer_expectation_obs, n_samples_inner_expectation_obs = 100, 200
n_samples_outer_expectation_caus, n_samples_inner_expectation_caus = 100, 200

sampling_parameters = {
    "n_samples_inner_expectation_obs": n_samples_inner_expectation_obs,
    "n_samples_outer_expectation_obs": n_samples_outer_expectation_obs,
    "n_samples_inner_expectation_caus": n_samples_inner_expectation_caus,
    "n_samples_outer_expectation_caus": n_samples_outer_expectation_caus,
}

# Rebinds `prior_hyperparameters` to the settings used from here on.
prior_hyperparameters = {
    "sigma_0_sq": 1,
    "p_categorical_pr": 0,
    "p_categorical_trt": 0,
}
predictive_model_parameters = {"num_trees_pr": 200, "num_trees_trt": 100}
conditional_model_param = {"num_trees_pr": 200}

In [7]:
n_seeds = 4
plot_additional = True

# Caption lines that are later joined and passed to the plots.
# NOTE(review): the "approx 0.75 of treated in host" figure could not be
# checked against the visible code (both assignment probabilities are the
# constant 0.5) — confirm or update.
text_l1 ='p_assigned_to_host = p_assigned_to_cand2, approx 0.75 of treated in host, '
text_l2 = 'n_host = '+str(n_host)+', sigma_prior = sigma_rand_error = '+str(sigma_rand_error)
# Mirror the `outcome_function` actually defined in this notebook; the previous
# caption described a different function written in terms of X[:,0], X[:,1], X[:,2].
text_l3 = (
    "outcome_function = " + str(include_intercept)
    + " + 1 * X['Sex'] - 1 * X['Weight.viscera'] + log(X['Weight.whole']) - X['Height']"
    + " + 4 * T + 2 * X['Weight.shucked']*T + 24 * X['Weight.shell']*T"
    + " + 0 * X['Weight.shucked']*T + eps"
)

In [8]:
# Generate the "exact" datasets from the settings in `data_parameters`;
# `include_intercept` is converted from its 0/1 flag to a bool for the call.
exact_data = generate_exact_data_varying_sample_size(
    data_parameters, include_intercept=bool(include_intercept)
)
# Display the "host" table stored under key 200.
# presumably 200 is the first entry of n_both_candidates_list — TODO confirm
exact_data[200]["host"]

Unnamed: 0,intercept,Sex,Length,Diameter,Height,Weight.whole,Weight.shucked,Weight.viscera,Weight.shell,Rings,...,T*Sex,T*Length,T*Diameter,T*Height,T*Weight.whole,T*Weight.shucked,T*Weight.viscera,T*Weight.shell,T*Rings,Y
0,1.0,0.0,0.700,0.565,0.175,1.8565,0.8445,0.3935,0.5400,10.0,...,0.0,0.000,0.000,0.000,0.0000,0.0000,0.0000,0.0000,0.0,-1.195129
1,1.0,0.0,0.620,0.495,0.180,1.2555,0.5765,0.2540,0.3550,12.0,...,0.0,0.620,0.495,0.180,1.2555,0.5765,0.2540,0.3550,12.0,14.228279
2,1.0,0.0,0.635,0.510,0.170,1.3555,0.6190,0.3050,0.3900,9.0,...,0.0,0.000,0.000,0.000,0.0000,0.0000,0.0000,0.0000,0.0,1.160679
3,1.0,0.0,0.525,0.405,0.160,0.7920,0.3160,0.1455,0.2800,13.0,...,0.0,0.525,0.405,0.160,0.7920,0.3160,0.1455,0.2800,13.0,10.950731
4,1.0,1.0,0.570,0.425,0.130,0.7820,0.3695,0.1745,0.1965,8.0,...,1.0,0.570,0.425,0.130,0.7820,0.3695,0.1745,0.1965,8.0,11.177455
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,1.0,0.0,0.535,0.440,0.165,0.8750,0.2790,0.1800,0.3000,10.0,...,0.0,0.535,0.440,0.165,0.8750,0.2790,0.1800,0.3000,10.0,12.697132
196,1.0,1.0,0.610,0.485,0.180,1.2795,0.5735,0.2855,0.3550,7.0,...,0.0,0.000,0.000,0.000,0.0000,0.0000,0.0000,0.0000,0.0,1.243534
197,1.0,0.0,0.445,0.370,0.125,0.5150,0.2495,0.0870,0.1590,9.0,...,0.0,0.445,0.370,0.125,0.5150,0.2495,0.0870,0.1590,9.0,8.751747
198,1.0,0.0,0.580,0.460,0.150,1.0490,0.5205,0.1935,0.3050,10.0,...,0.0,0.580,0.460,0.150,1.0490,0.5205,0.1935,0.3050,10.0,13.109111


In [11]:

# Run the BART sample-based EIG computation on the exact data. The call
# returns a pair, unpacked here into the "obs" and "caus" results that the
# plotting cells pass as `dict_additional_plots`.
dict_additional_plots_obs_from_samples, dict_additional_plots_caus_from_samples = (
    bart_eig_from_samples_varying_sample_size(
        exact_data,
        data_parameters,
        prior_hyperparameters,
        predictive_model_parameters,
        conditional_model_param,
        sampling_parameters
    )
)

In [None]:
# One list of EIG values is collected per repetition, separately for the
# "obs" (EIGs[0]) and "caus" (EIGs[1]) results.
EIG_obs_closed_form_across_seeds = []
EIG_caus_closed_form_across_seeds = []

# NOTE(review): the loop index is not used to seed anything in this cell;
# each repetition simply regenerates the data.
for repetition in range(n_seeds):
    nonexact_data = generate_data_varying_sample_size(
        data_parameters, include_intercept=bool(include_intercept)
    )
    EIGs = bart_eig_from_samples_varying_sample_size(
        nonexact_data,
        data_parameters,
        prior_hyperparameters,
        predictive_model_parameters,
        conditional_model_param,
        sampling_parameters
    )
    EIG_obs_closed_form_across_seeds.append(list(EIGs[0].values()))
    EIG_caus_closed_form_across_seeds.append(list(EIGs[1].values()))

# Stack the per-repetition lists row-wise into arrays.
EIG_obs_closed_form_across_seeds = np.vstack(EIG_obs_closed_form_across_seeds)
EIG_caus_closed_form_across_seeds = np.vstack(EIG_caus_closed_form_across_seeds)


In [None]:
# Caption lines for the plots below (text_l1 and text_l2 replace the earlier
# values of the same names).
text_l1 = "p_assigned_to_host=p_assigned_to_cand2, approx 0.8 treated in host"
text_l2 = (
    "n_samples_inner_expectation_obs = "
    + str(n_samples_inner_expectation_obs)
    # the ", " separator was missing here, gluing the two entries together
    + ", n_samples_outer_expectation_obs = "
    + str(n_samples_outer_expectation_obs)
    + ", n_host = "
    + str(n_host)
    + ", sigma_prior = sigma_rand_error = "
    + str(sigma_rand_error)
)
# Read n_host and sigma_rand_error from the variables instead of hard-coding
# 200 and 1, so the caption cannot drift from the configuration.
text_l4 = (
    "n_host = "
    + str(n_host)
    + ", sigma_prior = sigma_rand_error = "
    + str(sigma_rand_error)
    + ", n_samples_inner_expectation_caus = "
    + str(n_samples_inner_expectation_caus)
)

len(data_mirror) n=509 != n_mirror (1000)


TypeError: bart_eig_from_samples_varying_sample_size() missing 2 required positional arguments: 'conditional_model_param' and 'sampling_parameters'

In [None]:
path_obs_closed_form = ""

# Plot the "obs" EIG values across repetitions against candidate sample size,
# with the three caption lines joined by newlines.
plot_array(
    x=n_both_candidates_list,  # sample sizes
    arr=EIG_obs_closed_form_across_seeds,
    axis_names=["Sample size of candidate datasets", "EIG predictive"],
    dict_additional_plots=dict_additional_plots_obs_from_samples,
    text="\n".join((text_l1, text_l2, text_l3)),
    title="EIG predictive complementary - EIG predictive twin",
)

In [None]:
path_caus_samples = 0

# Plot the "caus" EIG values across repetitions against candidate sample size.
# The y-axis label and title previously said "predictive" (copied from the
# predictive plot) although this figure shows the causal EIG arrays.
plot_array(
    x=n_both_candidates_list,  # sample sizes
    arr=EIG_caus_closed_form_across_seeds,
    axis_names=["Sample size of candidate datasets", "EIG causal"],
    dict_additional_plots=dict_additional_plots_caus_from_samples,
    text=text_l1 + "\n" + text_l2 + "\n" + text_l3 + "\n" + text_l4,
    title="EIG causal complementary - EIG causal twin",
)