In [1]:
import numpy as np
import pandas as pd
import torch
torch.set_default_tensor_type(torch.FloatTensor) 
import copy
import sys
import os
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path
sys.path.append(parent_dir)
sys.path.append('/Users/lucile/causal_info_gain/causal_prospective_merge/data')

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *

from econml.metalearners import TLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier


  _C._set_default_tensor_type(t)


In [2]:
from scipy.stats import kendalltau, spearmanr

def average_precision_at_k(true_rankings, predicted_rankings, k):
    num_hits = 0
    sum_precision = 0
    for i, pred in enumerate(predicted_rankings[:k], 1):
        if pred in true_rankings:
            num_hits += 1
            sum_precision += num_hits / i
    if not true_rankings:
        return 0
    return sum_precision / min(len(true_rankings), k)

def mean_average_precision(true_rankings, predicted_rankings, k=None):
    if k is None:
        k = len(true_rankings)
    avg_precision = np.mean([average_precision_at_k(true_rankings, predicted_rankings, k_) for k_ in range(1, k + 1)])
    return avg_precision

def precision_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / k

def recall_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / len(true_rankings)

def mrr(true_rankings, predicted_rankings):
    for i, pred in enumerate(predicted_rankings, 1):
        if pred in true_rankings:
            return 1 / i
    return 0

def ndcg(true_rankings, predicted_rankings, k=None):
    if k is None:
        k = len(true_rankings)
    dcg = sum(2 ** true_rankings[i] - 1 / np.log2(i + 2) for i in range(k))
    ideal_rankings = sorted(true_rankings, reverse=True)
    ideal_dcg = sum(2 ** ideal_rankings[i] - 1 / np.log2(i + 2) for i in range(k))
    return dcg / ideal_dcg

def compare_to_ground_truth(results_dict, true_cate_ranking, eig_ranking, top_n = None, k = None):
    
    if top_n is not None:
        topn_eig_ranking = eig_ranking[:top_n]
        topn_true_cate_ranking = true_cate_ranking[:top_n]
    else: 
        topn_eig_ranking, topn_true_cate_ranking = eig_ranking, true_cate_ranking

    if k is None:
        k = len(true_cate_ranking)
    
    results_dict['tau'].append(kendalltau(topn_eig_ranking, topn_true_cate_ranking)[0])      
    results_dict['rho'].append(spearmanr(topn_true_cate_ranking, topn_eig_ranking)[0])
    results_dict['precision_at_k'].append(precision_at_k(true_cate_ranking, topn_eig_ranking, k=k))
    results_dict['recall_at_k'].append(recall_at_k(true_cate_ranking, topn_eig_ranking, k=k))
    results_dict['mean average precision'].append(mean_average_precision(topn_true_cate_ranking, topn_eig_ranking, k=k))
    results_dict['ndcg'].append(ndcg(topn_true_cate_ranking, topn_eig_ranking, k))
    results_dict['rank corr eig'].append(np.corrcoef(topn_true_cate_ranking, topn_eig_ranking)[0, 1])
    results_dict['mean reciprocal rank'].append(mrr(topn_true_cate_ranking, topn_eig_ranking))

    return results_dict

In [3]:
path = '/Users/lucile/causal_info_gain/causal_prospective_merge/'
data_with_groundtruth, x, t, y = get_data('twins', path)
data_with_groundtruth.dropna(inplace=True)
data_with_groundtruth = data_with_groundtruth.rename(columns={'t': 'T', 'y': 'Y'})
XandT = data_with_groundtruth.drop(columns=['Y','y0','y1','ite'])
XandT.head()


Unnamed: 0,eclamp,gestatcat1,gestatcat2,gestatcat3,gestatcat4,gestatcat5,gestatcat6,gestatcat7,gestatcat8,gestatcat9,...,brstate_reg,feduc6,dfageq,nprevistq,data_year,crace,birmon,dtotord_min,dlivord_min,T
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,5.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,5.0,5.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,5.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,5.0,4.0,6.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,4.0,7.0,0.0,0.0,1.0,0.0,3.0,3.0,0.0


In [4]:
number_of_candidate_sites = 40

min_sample_size_cand = 50
max_sample_size_cand = 150
host_sample_size = 100 
desired_initial_sample_size = 10**4
XandT = XandT.sample(n=desired_initial_sample_size, replace=True, random_state=42)
added_T_coef = 50 # to increase importance of T

outcome_function = None
std_true_y = 1
power_x = 1
power_x_t = 1
sigma_rand_error = 1

exp_parameters = {'number_of_candidate_sites': number_of_candidate_sites+1, 'min_sample_size_cand': min_sample_size_cand, \
                'max_sample_size_cand': max_sample_size_cand, 'host_sample_size': host_sample_size, 'outcome_function': outcome_function, \
                'std_true_y': std_true_y, 'power_x': power_x, 'power_x_t': power_x_t}

causal_param_first_index = power_x*np.shape(XandT)[1]

In [5]:
def generating_random_sites_from(data, exp_parameters, added_T_coef=1):
    
    candidates = {}
    sample_size, number_covariates = np.shape(data)[0], np.shape(data)[1]
    function_indices = {0: lambda X: np.log(X+1), 1: lambda X: X**3, 2: lambda X: X, 3: lambda X: X**2 }
    number_of_candidate_sites = exp_parameters['number_of_candidate_sites']
    min_sample_size_cand = exp_parameters['min_sample_size_cand']
    max_sample_size_cand = exp_parameters['max_sample_size_cand']
    outcome_function = None
    std_true_y = exp_parameters['std_true_y']
    power_x = exp_parameters['power_x']
    power_x_t = exp_parameters['power_x_t']
    number_features = number_covariates
    created_sites = 0
    
    while created_sites < number_of_candidate_sites+1:

        np.random.seed(created_sites)
        
        selected_features_for_subsampling = np.random.randint(2, size = number_features) 
        # binary bool vector representing selection for being an input of the sampling function
        random_coefs = [np.random.uniform(-10, 10) for _ in range(number_features)] 
        random_fct_idx = [np.random.randint(0, len(function_indices.keys())) for _ in range(number_features)] 
        
        def p_assigned_to_site(X, T, eps):
            result = 0
            for j in range(number_features-1):
                result += selected_features_for_subsampling[j] * random_coefs[j] * function_indices[random_fct_idx[j]](X[j])
            # here i use added_T_coef * random_coefs to increase importance of T
            result +=  added_T_coef * random_coefs[-1] *  function_indices[random_fct_idx[-1]](T) #selected_features_for_subsampling[-1]
            return sigmoid(result + eps)
        
        sample_size = np.random.randint(min_sample_size_cand, max_sample_size_cand + 1)  # Add 1 to include max_sample_size_cand

        if created_sites==0:
            sample_size = exp_parameters['host_sample_size']
        design_data_cand = subsample_one_dataset(XandT, p_assigned_to_site, sample_size, power_x, power_x_t, outcome_function, std_true_y, seed=created_sites)
        any_nan = design_data_cand.isna().any().any()
        if not design_data_cand.empty and not any_nan: # we're appending
            candidates[created_sites] = design_data_cand
        else:
            number_of_candidate_sites+=1 # not appending
        created_sites += 1

    return candidates

In [6]:
#dictionnary of random sites
candidate_sites = generating_random_sites_from(XandT, exp_parameters, added_T_coef=50)
for i, cand in candidate_sites.items():
    candidate_sites[i] = pd.concat([cand, data_with_groundtruth.loc[cand.index, 'Y']], axis=1)
    
host = candidate_sites[0]
candidate_sites = {key: value for key, value in candidate_sites.items() if key != 0}
XandT_host, Y_host = torch.from_numpy(host.drop(columns=["Y"]).values), torch.from_numpy(host["Y"].values)

# Prior parameters for Bayesian update on host
d = np.shape(host)[1]-1
prior_mean = torch.zeros(d)
sigma_prior = 1
beta_0, sigma_0_sq, inv_cov_0 = prior_mean, sigma_rand_error,torch.eye(d)
prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}


overflow encountered in exp


In [7]:
n_samples_outer_expectation_obs = 400
n_samples_inner_expectation_obs = 800
n_samples_outer_expectation_caus = 400
n_samples_inner_expectation_caus = 800

sampling_parameters = {'n_samples_inner_expectation_obs':n_samples_inner_expectation_obs, 'n_samples_outer_expectation_obs':n_samples_outer_expectation_obs, \
                       'n_samples_inner_expectation_caus':n_samples_inner_expectation_caus, 'n_samples_outer_expectation_caus':n_samples_outer_expectation_caus}

eig_results = {"EIG_obs_from_samples": [], 'EIG_caus_from_samples':[], "EIG_obs_closed_form":[], "EIG_caus_closed_form":[], "EIG_obs_bart":[], "EIG_caus_bart":[]}

In [8]:
print(f" % treated in host: {round(100 * host['T'].mean(),2)}%")

 % treated in host: 33.0%


In [9]:
for _,candidate in candidate_sites.items():
    print(f"For a sample size of {np.shape(candidate)[0]}")
    print(f" % treated in candidate: {round(100 * candidate['T'].mean(),2)}%")

For a sample size of 94
 % treated in candidate: 72.34%
For a sample size of 132
 % treated in candidate: 66.67%
For a sample size of 125
 % treated in candidate: 70.4%
For a sample size of 145
 % treated in candidate: 57.93%
For a sample size of 53
 % treated in candidate: 5.66%
For a sample size of 74
 % treated in candidate: 86.49%
For a sample size of 106
 % treated in candidate: 70.75%
For a sample size of 134
 % treated in candidate: 0.0%
For a sample size of 98
 % treated in candidate: 94.9%
For a sample size of 54
 % treated in candidate: 25.93%
For a sample size of 95
 % treated in candidate: 89.47%
For a sample size of 72
 % treated in candidate: 58.33%
For a sample size of 83
 % treated in candidate: 53.01%
For a sample size of 148
 % treated in candidate: 68.92%
For a sample size of 89
 % treated in candidate: 77.53%
For a sample size of 87
 % treated in candidate: 42.53%
For a sample size of 127
 % treated in candidate: 72.44%
For a sample size of 103
 % treated in candida

In [10]:
for _, candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)
    n_samples = n_samples_outer_expectation_obs * (n_samples_inner_expectation_obs + 1)

    eig_results["EIG_obs_closed_form"].append(
            bayes_reg.closed_form_obs_EIG(X_cand)
            )
    eig_results["EIG_caus_closed_form"].append(
            bayes_reg.closed_form_causal_EIG(X_cand)
            )

In [11]:
# eig_results["EIG_obs_from_samples"]=[]
# eig_results["EIG_caus_from_samples"]=[]

# for i, candidate in candidate_sites.items():
#     print("from samples "+str(i))
#     X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
#     bayes_reg = BayesianLinearRegression(prior_hyperparameters)
#     bayes_reg.set_causal_index(causal_param_first_index)
#     post_host_parameters = bayes_reg.fit(XandT_host, Y_host)

#     eig_results["EIG_obs_from_samples"].append(
#             bayes_reg.samples_obs_EIG(
#                 X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
#             )
#         )
#     eig_results["EIG_caus_from_samples"].append(
#             bayes_reg.samples_causal_EIG(
#                 X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
#             )
#         )

In [12]:
# now merge and compute some CATE error
merged_datasets = {}

for i, candidate in candidate_sites.items():
    merged_datasets[i]= pd.concat([host, candidate], axis=0)

In [13]:
cate_diff = {}
merged_mse = []
XandT_host=host.drop(columns=["Y"])

X_zero = XandT_host.copy() # we predict on host with T=0 and T=1
X_zero.iloc[:,causal_param_first_index:] = 0

X_one = XandT_host.copy()
X_one.iloc[:,causal_param_first_index:] = XandT_host.iloc[:,:causal_param_first_index]

### Merging and computing ground truth

In [14]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.metrics import log_loss
merged_mse = []

for i, candidate in merged_datasets.items():

    XandT_merged = candidate.drop(columns=["Y"])
    Y_merged = candidate['Y']

    learner = LinearRegression(fit_intercept=False)
    learner.fit(y=Y_merged, X=XandT_merged) # we fit on merged datasets

    true_cate = data_with_groundtruth.loc[host.index, 'ite']

    pred_cate = learner.predict(X_one)-learner.predict(X_zero)

    merged_mse.append(mean_squared_error(true_cate, pred_cate))


### Comparing our EIGs with ground truth

In [15]:
obs_eig_ranking_closed_form = sorted(range(len(eig_results["EIG_obs_closed_form"])), key=lambda i: eig_results["EIG_obs_closed_form"][i], reverse=True)
print(obs_eig_ranking_closed_form)

caus_eig_ranking_closed_form = sorted(range(len(eig_results["EIG_caus_closed_form"])), key=lambda i: eig_results["EIG_caus_closed_form"][i], reverse=True)
print(caus_eig_ranking_closed_form)

# obs_eig_ranking_from_samples = sorted(range(len(eig_results["EIG_obs_from_samples"])), key=lambda i: eig_results["EIG_obs_from_samples"][i], reverse=True)
# print(obs_eig_ranking_from_samples)

# caus_eig_ranking_from_samples = sorted(range(len(eig_results["EIG_caus_from_samples"])), key=lambda i: eig_results["EIG_caus_from_samples"][i], reverse=True)
# print(caus_eig_ranking_from_samples)

true_cate_ranking = sorted(range(len(merged_mse)), key=lambda i: merged_mse[i], reverse=False) # reverse is False because its error terms
print(true_cate_ranking)

[2, 27, 13, 42, 25, 3, 1, 16, 50, 48, 21, 41, 6, 17, 28, 8, 45, 10, 38, 34, 0, 32, 14, 31, 37, 20, 39, 15, 30, 35, 11, 29, 12, 33, 5, 49, 24, 36, 19, 46, 40, 23, 7, 22, 47, 18, 9, 26, 43, 44, 4]
[2, 27, 13, 25, 42, 3, 1, 16, 48, 21, 6, 41, 50, 17, 28, 8, 38, 45, 10, 0, 34, 14, 32, 31, 20, 37, 39, 30, 35, 11, 29, 33, 15, 5, 49, 12, 24, 36, 19, 46, 40, 18, 47, 9, 23, 7, 22, 26, 4, 44, 43]
[3, 27, 26, 2, 1, 22, 44, 42, 7, 28, 21, 13, 16, 48, 25, 8, 6, 9, 39, 15, 31, 17, 10, 32, 34, 41, 35, 37, 11, 38, 14, 49, 40, 19, 47, 20, 0, 18, 12, 36, 30, 33, 45, 24, 5, 50, 46, 4, 29, 23, 43]


In [16]:
k = 20
top_n = 20

In [17]:
correlation_with_true_rankings={'tau':[],'rho':[], \
      'precision_at_k': [], 'recall_at_k':[], 'mean average precision': [], \
      'ndcg': [], 'rank corr eig': [], 'mean reciprocal rank': []}

compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, obs_eig_ranking_closed_form, top_n = top_n, k = k)
compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, caus_eig_ranking_closed_form, top_n = top_n, k = k)

# compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, obs_eig_ranking_from_samples, top_n = top_n, k = k)
# compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, caus_eig_ranking_from_samples, top_n = top_n, k = k)


{'tau': [-0.11578947368421053, -0.2105263157894737],
 'rho': [-0.1819548872180451, -0.3218045112781954],
 'precision_at_k': [1.0, 1.0],
 'recall_at_k': [0.39215686274509803, 0.39215686274509803],
 'mean average precision': [0.8630710128245707, 0.8930264856912737],
 'ndcg': [0.9999999999999998, 0.9999999999999998],
 'rank corr eig': [-0.1946170824432616, -0.4077302695680013],
 'mean reciprocal rank': [1.0, 1.0]}

### Baselines

In [18]:
import copy

### random ranking
random_ranking = np.random.choice(np.arange(1, number_of_candidate_sites+1), size=number_of_candidate_sites, replace=False)


### ranking by sample size
sample_size_order = sorted(candidate_sites.keys(), key=lambda key: -candidate_sites[key].shape[0])


### ranking by similarity of covariate distribution
mean_vector_host = XandT_host.iloc[:,:causal_param_first_index].mean()
cov_matrix_host = XandT_host.iloc[:,:causal_param_first_index].cov()
mvn = multivariate_normal(mean=mean_vector_host, cov=cov_matrix_host, allow_singular=1)
# get log likelihood of candidate sites
log_likelihood_list=[]
for i, candidate in candidate_sites.items():
    log_likelihoods=mvn.logpdf(candidate.iloc[:,:causal_param_first_index].values)
    log_likelihood_list.append(np.mean(log_likelihoods))

similarity_cov_distrib_ranking= sorted(range(len(log_likelihood_list)), key=lambda i: log_likelihood_list[i], reverse=True)

### ranking by similarity of propensity scores
# we fit a propensity score model at target site and store logloss
# for each site: we fit the model further on the cand site and compute log
# nd assess the loss. Sites associated with loss values with higher discrepancy from the host should have distinct 
#treatment allocation scheme, and thus be a better fit. 

ps_model = LinearRegression(fit_intercept=False)
ps_model.fit(XandT_host.iloc[:,:causal_param_first_index], XandT_host['T'])
t_host_pred = ps_model.predict(XandT_host.iloc[:,:causal_param_first_index])
mse_host = mean_squared_error(t_host_pred, XandT_host['T'])
mse_diff_list = []

for i, candidate in candidate_sites.items():
    ps_model_copy= copy.deepcopy(ps_model)
    ps_model_copy.fit(candidate.iloc[:,:causal_param_first_index], candidate['T'])
    t_cand_pred = ps_model_copy.predict(XandT_host.iloc[:,:causal_param_first_index]) # predict on host!
    mse_cand = abs(mean_squared_error(t_cand_pred, XandT_host['T']) - mse_host)
    mse_diff_list.append(mse_cand)

similarity_pscore_ranking = sorted(range(len(mse_diff_list)), key=lambda i: mse_diff_list[i], reverse=True) 
# the more diff in pscore the better so reverse=True


print(random_ranking)
print(sample_size_order)
print(similarity_cov_distrib_ranking)
print(similarity_pscore_ranking)

[50 13 40 17 34  9 19 24 27 26 15 36 12 39 47 31  7 25 14 38  5 33 20  6
 43  4  3 45 32 48  8  1 46 41 29 49 22 23 28 37 42 44 16 18  2 11 21 30
 10 35]
[14, 4, 29, 44, 27, 8, 2, 52, 25, 24, 17, 3, 47, 40, 50, 23, 43, 7, 19, 9, 30, 11, 1, 36, 15, 16, 34, 33, 13, 39, 45, 22, 46, 28, 32, 6, 51, 26, 12, 41, 37, 31, 21, 35, 49, 38, 10, 20, 5, 48, 42]
[47, 18, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 48, 49, 50]
[35, 19, 11, 14, 39, 47, 36, 49, 40, 5, 33, 31, 20, 24, 37, 30, 12, 10, 32, 34, 41, 18, 45, 6, 46, 8, 28, 29, 17, 50, 48, 15, 21, 38, 16, 25, 42, 2, 0, 27, 9, 1, 13, 7, 22, 23, 26, 43, 44, 3, 4]


In [19]:
compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, random_ranking, top_n = top_n, k = k)
compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, sample_size_order, top_n = top_n, k = k)
compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, similarity_cov_distrib_ranking, top_n = top_n, k = k)
compare_to_ground_truth(correlation_with_true_rankings, true_cate_ranking, similarity_pscore_ranking, top_n = top_n, k = k)

{'tau': [-0.11578947368421053,
  -0.2105263157894737,
  -0.031578947368421054,
  -0.052631578947368425,
  -0.052631578947368425,
  0.010526315789473684],
 'rho': [-0.1819548872180451,
  -0.3218045112781954,
  -0.0706766917293233,
  -0.04661654135338346,
  -0.030075187969924807,
  0.019548872180451125],
 'precision_at_k': [1.0, 1.0, 1.0, 0.95, 1.0, 1.0],
 'recall_at_k': [0.39215686274509803,
  0.39215686274509803,
  0.39215686274509803,
  0.37254901960784315,
  0.39215686274509803,
  0.39215686274509803],
 'mean average precision': [0.8630710128245707,
  0.8930264856912737,
  0.14902158436505905,
  0.1609595249595019,
  0.17271240697347895,
  0.015144063238103486],
 'ndcg': [0.9999999999999998,
  0.9999999999999998,
  0.9999999999999998,
  0.9999999999999998,
  0.9999999999999998,
  0.9999999999999998],
 'rank corr eig': [-0.1946170824432616,
  -0.4077302695680013,
  -0.08152995480447862,
  0.04820902594204496,
  -0.2081776770736918,
  0.14167823496897794],
 'mean reciprocal rank': [1.0

### Show results

In [21]:
correlation_with_true_rankings= pd.DataFrame.from_dict(correlation_with_true_rankings)
correlation_with_true_rankings.index = ['obs_closed_form', 'caus_closed_form', 'random', 'sample size', 'similarity_cov_distrib_ranking', 'similarity_pscore_ranking size'] #, 'obs_from_samples', 'caus_from_samples']
correlation_with_true_rankings

Unnamed: 0,tau,rho,precision_at_k,recall_at_k,mean average precision,ndcg,rank corr eig,mean reciprocal rank
obs_closed_form,-0.115789,-0.181955,1.0,0.392157,0.863071,1.0,-0.194617,1.0
caus_closed_form,-0.210526,-0.321805,1.0,0.392157,0.893026,1.0,-0.40773,1.0
random,-0.031579,-0.070677,1.0,0.392157,0.149022,1.0,-0.08153,0.5
sample size,-0.052632,-0.046617,0.95,0.372549,0.16096,1.0,0.048209,0.25
similarity_cov_distrib_ranking,-0.052632,-0.030075,1.0,0.392157,0.172712,1.0,-0.208178,0.25
similarity_pscore_ranking size,0.010526,0.019549,1.0,0.392157,0.015144,1.0,0.141678,0.2


In [None]:
###### below is version where ground truth is wrt merged dataset

# for i, candidate in merged_datasets.items():

#     XandT_merged = candidate.drop(columns=["Y"])
#     Y_merged = candidate['Y']

#     learner = LinearRegression(fit_intercept=False)
#     learner.fit(y=Y_merged, X=XandT_merged) # we fit on merged datasets

#     true_cate = data_with_groundtruth.loc[XandT_merged.index, 'ite']

#     X_zero = XandT_merged.copy() # we predict on host with T=0 and T=1
#     X_zero.iloc[:,causal_param_first_index:] = 0

#     X_one = XandT_merged.copy()
#     X_one.iloc[:,causal_param_first_index:] = XandT_merged.iloc[:,:causal_param_first_index]

#     pred_cate = learner.predict(X_one)-learner.predict(X_zero)

#     merged_mse.append(mean_squared_error(true_cate, pred_cate))

## bart stuff

In [None]:
# X_host, T_host, Y_host = host.drop(columns=['T','Y']).values, host['T'].values.astype(np.int32), host['Y'].values

# prior_hyperparameters = {'sigma_0_sq':1, 'p_categorical_pr':0, 'p_categorical_trt':0 }
# predictive_model_parameters={"num_trees_pr":200,"num_trees_trt":100}
# conditional_model_param={"num_trees_pr":200}


# for i, candidate in candidate_sites.items():

#     print("from samples "+str(i))
#     X_cand, T_cand = candidate.drop(columns=['Y','T']).values, candidate['T'].values.astype(np.int32)

#     bcf = BayesianCausalForest(
#         prior_hyperparameters,
#         predictive_model_parameters=predictive_model_parameters,
#         conditional_model_param=conditional_model_param)
#     bcf.store_train_data(X=X_host, T=T_host, Y=Y_host)
    
#     joint_eig = bcf.joint_EIG_calc(X_cand, T_cand, sampling_parameters)

#     results["EIG_obs_bart"].append(joint_eig["Obs EIG"])
#     results["EIG_caus_bart"].append(joint_eig["Causal EIG"])