In [1]:
import numpy as np
import pandas as pd
import torch
torch.set_default_tensor_type(torch.FloatTensor) 
import copy
import sys
import os
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path
sys.path.append(parent_dir)
sys.path.append('/Users/lucile/causal_info_gain/causal_prospective_merge/data')

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *
import uci_dataset as dataset

#maybe econml or causalml. So far can't get them to work

  _C._set_default_tensor_type(t)


In [2]:
path = '/Users/lucile/causal_info_gain/causal_prospective_merge/'
data, x, t, y = get_data('twins', path)

In [3]:
initial_data = dataset.load_abalone()
initial_data['Sex'] = initial_data['Sex'].map({'M': 0, 'F': 1})
initial_data.dropna(inplace=True)

In [4]:
number_of_candidate_sites = 10
min_sample_size_cand = 150
max_sample_size_cand = 250
host_sample_size = 200
desired_initial_sample_size = 10**6
initial_data = initial_data.sample(n=desired_initial_sample_size, replace=True, random_state=42)


outcome_function = lambda X, T, eps: 1 + 1 * X['Sex'] - 1 * X['Weight.viscera'] + np.log(X['Weight.whole']) - \
    X['Height'] + 4 * T + 2* X['Weight.shucked']*T + 24* X['Weight.shell']*T + 0* X['Weight.shucked']*T + eps 
std_true_y = 1
power_x = 1
power_x_t = 1
sigma_rand_error = 1

exp_parameters = {'number_of_candidate_sites': number_of_candidate_sites, 'min_sample_size_cand': min_sample_size_cand, \
                'max_sample_size_cand': max_sample_size_cand, 'host_sample_size': host_sample_size, 'outcome_function': outcome_function, \
                'std_true_y': std_true_y, 'power_x': power_x, 'power_x_t': power_x_t}

causal_param_first_index = causal_param_first_index = power_x * np.shape(initial_data)[1] + 1   

In [9]:
def generating_random_sites_from(data, exp_parameters):
    sites = {}
    sample_size, number_covariates = np.shape(data)[0], np.shape(data)[1]
    function_indices = {0: lambda X: np.log(X+1), 1: lambda X: X**3, 2: lambda X: X, 3: lambda X: X**2}
    T = new_df = pd.DataFrame({'T': np.random.randint(2, size=sample_size)}, index=data.index) 
    XandT = pd.concat([data, T], axis= 1)
    number_of_candidate_sites = exp_parameters['number_of_candidate_sites']
    min_sample_size_cand = exp_parameters['min_sample_size_cand']
    max_sample_size_cand = exp_parameters['max_sample_size_cand']
    outcome_function = exp_parameters['outcome_function']
    std_true_y = exp_parameters['std_true_y']
    power_x = exp_parameters['power_x']
    power_x_t = exp_parameters['power_x_t']
    number_features = number_covariates + 1
    created_sites = 0
    
    while created_sites < number_of_candidate_sites:
        
        selected_features_for_subsampling = np.random.randint(2, size = number_features) 
        # binary bool vector representing selection for being an input of the sampling function
        random_coefs = [np.random.uniform(-10, 10) for _ in range(number_features)] 
        random_fct_idx = [np.random.randint(0, 4) for _ in range(number_features)] 
        
        def p_assigned_to_site(X, T, eps):
            result = 0
            for j in range(number_features-1):
                result += selected_features_for_subsampling[j] * random_coefs[j] * function_indices[random_fct_idx[j]](X[j])
            result += selected_features_for_subsampling[-1] * random_coefs[-1] *  function_indices[random_fct_idx[-1]](T)
            return sigmoid(result + eps)
        
        sample_size = np.random.randint(min_sample_size_cand, max_sample_size_cand + 1)  # Add 1 to include max_sample_size_cand
        design_data_cand = subsample_one_dataset(XandT, p_assigned_to_site, sample_size, power_x, power_x_t, outcome_function, std_true_y)
        if not design_data_cand.empty:
            sites[created_sites] = design_data_cand
            created_sites += 1

    return sites

In [10]:
#dictionnary of random sites
candidate_sites = generating_random_sites_from(initial_data, exp_parameters)
host = candidate_sites.popitem()[1]

# Prior parameters for Bayesian update on host
d = np.shape(host)[1]-1
prior_mean = torch.zeros(d)
sigma_prior = 1
beta_0, sigma_0_sq, inv_cov_0 = prior_mean, sigma_rand_error,torch.eye(d)
prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}


      Sex  Length  Diameter  Height  Weight.whole  Weight.shucked  \
1170  1.0   0.625     0.485   0.175        1.3745          0.7335   
1930  0.0   0.620     0.490   0.160        1.0350          0.4400   
1687  1.0   0.620     0.480   0.175        1.0405          0.4640   
1641  0.0   0.575     0.445   0.160        0.8390          0.4005   
2375  0.0   0.340     0.275   0.090        0.2065          0.0725   

      Weight.viscera  Weight.shell  Rings  T  
1170          0.2715         0.332      9  0  
1930          0.2525         0.285     11  1  
1687          0.2225         0.300      9  0  
1641          0.1980         0.239      9  1  
2375          0.0430         0.070     10  1  


  return 1.0 / (1.0 + np.exp(-x))


In [11]:
XandT_host, Y_host = torch.from_numpy(host.drop(columns=["Y"]).values), torch.from_numpy(host["Y"].values)
n_samples_outer_expectation = 20
n_samples_inner_expectation = 30
results = {"EIG_obs_from_samples": [], 'EIG_caus_from_samples':[], "EIG_obs_closed_form":[], "EIG_caus_closed_form":[], "EIG_obs_bart":[], "EIG_caus_bart":[]}

In [15]:
for _,candidate in candidate_sites.items():
    print(f"For a sample size of {np.shape(candidate)[0]}")
    print(f" % treated in host: {int(100 * candidate['T'].mean())}%")

For a sample size of 209
 % treated in host: 77%
For a sample size of 241
 % treated in host: 48%
For a sample size of 230
 % treated in host: 50%
For a sample size of 247
 % treated in host: 0%
For a sample size of 176
 % treated in host: 51%
For a sample size of 235
 % treated in host: 50%
For a sample size of 208
 % treated in host: 51%
For a sample size of 222
 % treated in host: 42%
For a sample size of 178


ValueError: cannot convert float NaN to integer

In [13]:
for _,candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)
    n_samples = n_samples_outer_expectation * (n_samples_inner_expectation + 1)

    results["EIG_obs_from_samples"].append(
            bayes_reg.samples_obs_EIG(
                X_cand, n_samples_outer_expectation, n_samples_inner_expectation
            )
        )
    results["EIG_caus_from_samples"].append(
            bayes_reg.samples_causal_EIG(
                X_cand, n_samples_outer_expectation, n_samples_inner_expectation
            )
        )
    

In [14]:
for _, candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)
    n_samples = n_samples_outer_expectation * (n_samples_inner_expectation + 1)

    results["EIG_obs_closed_form"].append(
            bayes_reg.closed_form_obs_EIG(X_cand)
            )
    results["EIG_caus_closed_form"].append(
            bayes_reg.closed_form_causal_EIG(X_cand)
            )

  sign, logdet = _umath_linalg.slogdet(a, signature=signature)


In [None]:
X_host, T_host, Y_host = host.drop(columns=['T','Y']).values, host['T'].values, host['Y'].values

beta_0, sigma_0_sq, inv_cov_0 = (
    prior_mean,
    sigma_rand_error**2,
    1 / sigma_prior * np.eye(len(prior_mean)),
)
prior_hyperparameters = {
    "beta_0": beta_0,
    "sigma_0_sq": sigma_0_sq,
    "inv_cov_0": inv_cov_0,
}

prior_hyperparameters = {'sigma_0_sq':1, 'p_categorical_pr':0, 'p_categorical_trt':0 }
predictive_model_parameters={"num_trees_pr":200,"num_trees_trt":100}
conditional_model_param={"num_trees_pr":200}

for _, candidate in candidate_sites.items():

    X_cand, T_cand = candidate.drop(columns=['Y','T']).values, candidate['T'].values

    bcf = BayesianCausalForest(
        prior_hyperparameters,
        predictive_model_parameters=predictive_model_parameters,
        conditional_model_param=conditional_model_param)
    bcf.store_train_data(X=X_host, T=T_host, Y=Y_host)
    
    joint_eig = bcf.joint_EIG_calc(X_cand, T_cand, sampling_parameters)

    EIG_obs[cand].append(joint_eig["Obs EIG"])
    EIG_caus[cand].append(joint_eig["Causal EIG"])

    joint_eig

    results["EIG_obs_closed_form"].append(
            bayes_reg.closed_form_obs_EIG(X_cand)
            )
    results["EIG_caus_closed_form"].append(
            bayes_reg.closed_form_causal_EIG(X_cand)
            )


In [None]:
# now merge and compute some CATE error
merged_datasets = {}

for i, candidate in candidate_sites.items():
    merged_datasets[i]= pd.concat([host, candidate], axis=0)

In [None]:
CATE_errors = {}
merged_datasets = {}

from econml.metalearners import TLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
model_y = GradientBoostingRegressor()
model_t = GradientBoostingClassifier()

for i, candidate in merged_datasets.items():
    merged_datasets[i]= pd.concat([host, candidate], axis=0)
    X_merged = merged_datasets[i].filter(regex='^(?!T)').copy()
    X_merged = X_merged.drop(columns=["Y"])
    T_merged = merged_datasets[i]['T']
    Y_merged = merged_datasets[i]['Y']

    learner = TLearner(model_y=model_y, model_t=model_t)
    learner.fit(Y=Y_merged, T=T_merged, X=X_merged)
    cate = learner.effect(X_merged)
    ### need dataset with ground truth to compute some kind of errors here
