In [1]:
import numpy as np
import pandas as pd
import torch
torch.set_default_tensor_type(torch.FloatTensor) 
import copy

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *
import uci_dataset as dataset

  _C._set_default_tensor_type(t)


### 1. Simulating the data

In [2]:
rng = np.random.RandomState(42)

n_rct_before_split = 10**6
n_host = 2000
sigma_prior = 1
sigma_rand_error = 1

power_x, power_x_t = 1, 1
std_true_y = 1 # Standard deviation for the true Y


# X0 = np.random.beta(12, 3, size= n_rct_before_split)
# X1 = np.random.normal(loc=4, scale=1, size=n_rct_before_split)
# X2 = np.random.beta(1, 7, size=n_rct_before_split)
# x_distributions= {0: X0, 1: X1, 2:X2}
# d = 1 + len(x_distributions)*(power_x) + 1 + len(x_distributions)*(power_x_t)


#p_assigned_to_host = lambda X_0, X_1, T, eps: sigmoid(1 + 2*X_0 - X_1 + 3*T + eps)
# p_assigned_to_host = lambda X, T, eps: sigmoid(1 + 2*X['X_0'] - X['X_1'] + 3*T + eps)
# p_assigned_to_cand2 = lambda X_0, X_1, T, eps: sigmoid(1 + 2*X_0 - X_1 + 3*T + eps) #can't take X_2, harcoded still

In [3]:
abalone = dataset.load_abalone()
abalone['Sex'] = abalone['Sex'].map({'M': 0, 'F': 1})
abalone.dropna(inplace=True)
resampled_abalone = [abalone.sample(frac=1, replace=True) for _ in range(5*(10**3))]
# Concatenate resampled DataFrames
abalone = pd.concat(resampled_abalone, ignore_index=True)

causal_param_first_index = power_x * np.shape(abalone)[1] + 1

abalone.head()


Unnamed: 0,Sex,Length,Diameter,Height,Weight.whole,Weight.shucked,Weight.viscera,Weight.shell,Rings
0,1.0,0.52,0.41,0.17,0.8705,0.3735,0.219,0.25,14
1,0.0,0.635,0.48,0.235,1.064,0.413,0.228,0.36,16
2,1.0,0.58,0.46,0.175,1.165,0.65,0.2205,0.3055,9
3,1.0,0.645,0.52,0.175,1.3345,0.667,0.2665,0.355,10
4,0.0,0.685,0.51,0.18,1.4545,0.6315,0.3105,0.3725,9


In [4]:
initial_x_dim = np.shape(abalone)[1]
initial_n_entire_data = np.shape(abalone)[0]
x_distributions={}

for column in abalone.columns:
    x_distributions[column] = abalone[column].values

# simulate T
T_rct = np.random.randint(2, size=initial_n_entire_data)
p_assigned_to_host = lambda X, T, eps: 0.5 #sigmoid(1 + 2*X['Sex'] - X['Weight.viscera'] + 12*np.sqrt(X['Diameter']) + np.log(X['Weight.shell']) + 3*T + eps)
p_assigned_to_cand2 = lambda X, T, eps: 0.5 #sigmoid(1 + 2*X['Sex'] - X['Weight.viscera'] + 12*np.sqrt(X['Diameter']) + np.log(X['Weight.shell']) + 3*T + eps)

d = 1 + initial_x_dim*(power_x) + 1 + len(x_distributions)*(power_x_t)
outcome_function = lambda X, T, eps: 1 + 1 * X['Sex'] - 1 * X['Weight.viscera'] + np.log(X['Weight.whole']) - X['Height'] + 4 * T + 2* X['Weight.shucked']*T + 24* X['Weight.shell']*T + 0* X['Weight.shucked']*T + eps 
# outcome_function = lambda X, T, eps: 1 + 1 * X[:,0] - 1 * X[:,1] + 1 * X[:,2] + 4 * T + 2* X[:,0]*T + 24* X[:,1]*T + 0* X[:,2]*T + eps 

Simulation parameters

In [5]:
# Prior parameters for Bayesian update on host
prior_mean = torch.zeros(d)
beta_0, sigma_0_sq, inv_cov_0 = prior_mean, sigma_rand_error,torch.eye(d)
prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}

# Hyperparameters for Bayesian update on host
warmup_steps = 50
max_tree_depth = 5

# Number of samples used to estimate outer expectation
n_samples_for_expectation = 50
m_samples_for_expectation = 1000


# Incorporating sqrt constraint into MCMC samples
n_mc = (n_samples_for_expectation * (m_samples_for_expectation+1)) 

In [6]:
n_both_candidates_list = [200, 500, 1000]
proportion = 1 #n_cand2 = prorportion * n_both_candidates_list
std_true_y = 1


data_parameters = {'n_both_candidates_list': n_both_candidates_list, 'proportion':proportion, 'n_rct_before_split':n_rct_before_split, \
                   'x_distributions':x_distributions, 'p_assigned_to_cand2':p_assigned_to_cand2, 'n_host':n_host, 'power_x':power_x, \
                    'power_x_t':power_x_t, 'outcome_function':outcome_function, 'std_true_y':std_true_y, 'causal_param_first_index':causal_param_first_index}

### 2. EIG closed form for varying sample sizes

In [7]:
n_seeds=10
plot_additional = True

text_l1 ='p_assigned_to_host = p_assigned_to_cand2, approx 0.75 of treated in host, '
text_l2 = 'n_host = '+str(n_host)+', sigma_prior = sigma_rand_error = '+str(sigma_rand_error)
text_l3 = 'outcome_function = 1 + 1 * X[:,0] - 1 * X[:,1] + 1 * X[:,2] + 4 * T + 2* X[:,0]*T + 24* X[:,1]*T + 0* X[:,2]*T + eps'

In [11]:
EIG_obs_closed_form, EIG_caus_closed_form = eig_closed_form_varying_sample_size(abalone, T, data_parameters, sigma_rand_error, prior_hyperparameters, n_mc)

TypeError: eig_closed_form_varying_sample_size() missing 2 required positional arguments: 'prior_hyperparameters' and 'n_mc'

In [None]:
if not plot_additional:
    dict_additional_plots_obs = dict_additional_plots_caus = {'Exact complementary':0, 'Exact twin': 0, 'Exact twin treated': 0, 'Exact twin untreated': 0}

else:
    dict_additional_plots_obs, dict_additional_plots_caus = eig_closed_form_exact_datasets(data_parameters, sigma_rand_error, prior_hyperparameters, n_mc)

EIG_obs_closed_form_across_seeds, EIG_caus_closed_form_across_seeds = [], []

for i in range (n_seeds):
    EIG_obs_closed_form, EIG_caus_closed_form = eig_closed_form_varying_sample_size(X_rct, T_rct, data_parameters, sigma_rand_error, prior_hyperparameters, n_mc, synthetic=False)
    if len(EIG_obs_closed_form_across_seeds)==0:
        EIG_obs_closed_form_across_seeds= EIG_obs_closed_form
        EIG_caus_closed_form_across_seeds = EIG_caus_closed_form
    else:
        EIG_obs_closed_form_across_seeds = np.vstack((EIG_obs_closed_form_across_seeds, EIG_obs_closed_form))
        EIG_caus_closed_form_across_seeds = np.vstack((EIG_caus_closed_form_across_seeds, EIG_caus_closed_form))
    

In [None]:
path_obs_closed_form = '/Users/lucile/causal_info_gain/plots/eig_obs_closed_form'
# path_obs_closed_form = 0
alpha = 0.3

plot_array(dict_additional_plots_obs, n_both_candidates_list, EIG_obs_closed_form_across_seeds, axis_names= ['Sample size of candidate datasets', 'EIG predictive'], names=['complementary','twin'],
           text= text_l1+ '\n' + text_l2+ '\n' + text_l3, title= 'EIG predictive', save=path_obs_closed_form, alpha = alpha)

In [None]:
path_caus_closed_form = '/Users/lucile/causal_info_gain/plots/eig_caus_closed_form'
# path_caus_closed_form = 0
alpha = 0.3

plot_array(dict_additional_plots_caus, n_both_candidates_list, EIG_caus_closed_form_across_seeds, axis_names= ['Sample size of candidate datasets', 'EIG causal'], names=['complementary','twin'],
           text= text_l1+ '\n' + text_l2+ '\n' + text_l3, title= 'EIG causal', save=path_caus_closed_form, alpha = alpha)

### 2. EIG from samples for varying sample sizes

In [None]:
n_samples_outer_expectation = 800
n_samples_inner_expectation = 200
n_causal_outer_exp = 800
n_causal_inner_exp = 200

sampling_parameters = {'n_samples_inner_expectation':n_samples_inner_expectation, 'n_samples_outer_expectation':n_samples_outer_expectation, \
                       'n_causal_inner_exp':n_causal_inner_exp, 'n_causal_outer_exp':n_causal_outer_exp}

In [None]:
n_seeds = 2
EIG_obs_samples_across_seeds, EIG_caus_samples_across_seeds = [], []

for i in range (n_seeds):
    EIG_obs_samples, EIG_caus_samples = eig_from_samples_varying_sample_size(data_parameters, sigma_rand_error, prior_hyperparameters, sampling_parameters)
    if len(EIG_obs_samples_across_seeds)==0:
        EIG_obs_samples_across_seeds= EIG_obs_samples
        EIG_caus_samples_across_seeds = EIG_caus_samples
    else:
        EIG_obs_samples_across_seeds = np.vstack((EIG_obs_samples_across_seeds, EIG_obs_samples))
        EIG_caus_samples_across_seeds = np.vstack((EIG_caus_samples_across_seeds, EIG_caus_samples))

In [None]:
text_l1 ='p_assigned_to_host=p_assigned_to_cand2, approx 0.8 treated in host'
text_l2 = 'n_samples_inner_expectation = '+str(n_samples_inner_expectation)+ 'n_samples_outer_expectation = '+str(n_samples_outer_expectation)+', n_host = '+str(n_host)+', sigma_prior = sigma_rand_error = '+str(sigma_rand_error)
text_l4 = 'n_host = 200, sigma_prior = sigma_rand_error = 1, n_causal_inner_exp = '+str(n_causal_inner_exp) 


plot_additional = 0

In [None]:
plot_additional = True
if not plot_additional:
    dict_additional_plots_obs = dict_additional_plots_caus = {'Exact complementary':0, 'Exact twin': 0, 'Exact twin treated': 0, 'Exact twin untreated': 0}

else:
    dict_additional_plots_obs, dict_additional_plots_caus = eig_closed_form_exact_datasets(data_parameters, sigma_rand_error, prior_hyperparameters, n_mc)

In [None]:
# path_obs_samples = '/Users/lucile/causal_info_gain/plots/eig_obs_samples'
path_obs_samples = 0
alpha = 0.3

plot_array(dict_additional_plots_obs, n_both_candidates_list, EIG_obs_samples_across_seeds, axis_names= ['Sample size of candidate datasets', 'EIG predictive'], names=['complementary','twin'],
           text= text_l1+ '\n' + text_l2+ '\n' + text_l3+ '\n' + text_l4, title= 'EIG predictive', save=path_obs_samples, alpha=alpha)

In [None]:
# path_caus_samples = '/Users/lucile/causal_info_gain/plots/eig_caus_samples'
path_caus_samples = 0
alpha = 0.3

plot_array(dict_additional_plots_caus, n_both_candidates_list, EIG_caus_samples_across_seeds, axis_names= ['Sample size of candidate datasets', 'EIG causal'], names=['complementary','twin'],
           text= text_l1+ '\n' + text_l2+ '\n' + text_l3+ '\n' + text_l4, title= 'EIG causal', save=path_caus_samples, alpha=alpha)

## 2. Extreme examples

In [None]:
# n_host_sample = 500 
# sigma_error = 1 

# X_host = (torch.randn((n_host_sample,d)) @ A ) 
# T_host = torch.bernoulli(torch.sigmoid(X_host@ T_allocation_host))
# X_host_times_T = (T_host.unsqueeze(dim=0).T * X_host)
# X_host = torch.concat([X_host,X_host_times_T],dim=1)

# Y_host = X_host @ mu
# Y_host = (1/Y_host.norm()) * Y_host + sigma_error * torch.randn_like(Y_host)

In [None]:
# prior_mean = torch.zeros(2 * d)
# beta_0, sigma_0_sq,inv_cov_0 = prior_mean, sigma_error,torch.eye(2*d)
# prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}
# bayesian_regression = BayesianLinearRegression(prior_hyperparameters)

In [None]:
# bayesian_regression.fit(X_host,Y_host)