In [1]:
import numpy as np
import pandas as pd
import torch
torch.set_default_tensor_type(torch.FloatTensor) 
import copy
import sys
import os
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path
sys.path.append(parent_dir)
sys.path.append('/Users/lucile/causal_info_gain/causal_prospective_merge/data')

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *

from econml.metalearners import TLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier


  _C._set_default_tensor_type(t)


In [2]:
path = '/Users/lucile/causal_info_gain/causal_prospective_merge/'
data_with_groundtruth, x, t, y = get_data('twins', path)
data_with_groundtruth.dropna(inplace=True)
data_with_groundtruth = data_with_groundtruth.rename(columns={'t': 'T', 'y': 'Y'})
XandT = data_with_groundtruth.drop(columns=['Y','y0','y1','ite'])
XandT.head()


Unnamed: 0,eclamp,gestatcat1,gestatcat2,gestatcat3,gestatcat4,gestatcat5,gestatcat6,gestatcat7,gestatcat8,gestatcat9,...,brstate_reg,feduc6,dfageq,nprevistq,data_year,crace,birmon,dtotord_min,dlivord_min,T
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,5.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,5.0,5.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,5.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,5.0,4.0,6.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,4.0,7.0,0.0,0.0,1.0,0.0,3.0,3.0,0.0


In [3]:
number_of_candidate_sites = 50
min_sample_size_cand = 100
max_sample_size_cand = 300
host_sample_size = 200 #TODO fix (or not)
desired_initial_sample_size = 10**4
XandT = XandT.sample(n=desired_initial_sample_size, replace=True, random_state=42)

outcome_function = None
std_true_y = 1
power_x = 1
power_x_t = 1
sigma_rand_error = 1

exp_parameters = {'number_of_candidate_sites': number_of_candidate_sites+1, 'min_sample_size_cand': min_sample_size_cand, \
                'max_sample_size_cand': max_sample_size_cand, 'host_sample_size': host_sample_size, 'outcome_function': outcome_function, \
                'std_true_y': std_true_y, 'power_x': power_x, 'power_x_t': power_x_t}

causal_param_first_index = causal_param_first_index = power_x * np.shape(XandT)[1] + 1   

In [4]:
def generating_random_sites_from(data, exp_parameters, seed=None):
    if seed is not None:
        np.random.seed(seed)
    sites = {}
    sample_size, number_covariates = np.shape(data)[0], np.shape(data)[1]
    function_indices = {0: lambda X: np.log(X+1), 1: lambda X: X**3, 2: lambda X: X, 3: lambda X: X**2}
    number_of_candidate_sites = exp_parameters['number_of_candidate_sites']
    min_sample_size_cand = exp_parameters['min_sample_size_cand']
    max_sample_size_cand = exp_parameters['max_sample_size_cand']
    outcome_function = None
    std_true_y = exp_parameters['std_true_y']
    power_x = exp_parameters['power_x']
    power_x_t = exp_parameters['power_x_t']
    number_features = number_covariates
    created_sites = 0
    
    while created_sites < number_of_candidate_sites:
        
        selected_features_for_subsampling = np.random.randint(2, size = number_features) 
        # binary bool vector representing selection for being an input of the sampling function
        random_coefs = [np.random.uniform(-10, 10) for _ in range(number_features)] 
        random_fct_idx = [np.random.randint(0, 4) for _ in range(number_features)] 
        
        def p_assigned_to_site(X, T, eps):
            result = 0
            for j in range(number_features-1):
                result += selected_features_for_subsampling[j] * random_coefs[j] * function_indices[random_fct_idx[j]](X[j])
            result += selected_features_for_subsampling[-1] * random_coefs[-1] *  function_indices[random_fct_idx[-1]](T)
            return sigmoid(result + eps)
        
        sample_size = np.random.randint(min_sample_size_cand, max_sample_size_cand + 1)  # Add 1 to include max_sample_size_cand
        design_data_cand = subsample_one_dataset(XandT, p_assigned_to_site, sample_size, power_x, power_x_t, outcome_function, std_true_y, seed=seed)
        any_nan = design_data_cand.isna().any().any()
        if not design_data_cand.empty and not any_nan:
            sites[created_sites] = design_data_cand
            created_sites += 1

    return sites

In [5]:
#dictionnary of random sites
candidate_sites = generating_random_sites_from(XandT, exp_parameters, seed=0)
for i, cand in candidate_sites.items():
    candidate_sites[i] = pd.concat([cand, data_with_groundtruth.loc[cand.index, 'Y']], axis=1)
    
host = candidate_sites.popitem()[1]

# Prior parameters for Bayesian update on host
d = np.shape(host)[1]-1
prior_mean = torch.zeros(d)
sigma_prior = 1
beta_0, sigma_0_sq, inv_cov_0 = prior_mean, sigma_rand_error,torch.eye(d)
prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}


overflow encountered in exp


In [6]:
XandT_host, Y_host = torch.from_numpy(host.drop(columns=["Y"]).values), torch.from_numpy(host["Y"].values)

n_samples_outer_expectation_obs = 200
n_samples_inner_expectation_obs = 400
n_samples_outer_expectation_caus = 200
n_samples_inner_expectation_caus = 400

sampling_parameters = {'n_samples_inner_expectation_obs':n_samples_inner_expectation_obs, 'n_samples_outer_expectation_obs':n_samples_outer_expectation_obs, \
                       'n_samples_inner_expectation_caus':n_samples_inner_expectation_caus, 'n_samples_outer_expectation_caus':n_samples_outer_expectation_caus}

results = {"EIG_obs_from_samples": [], 'EIG_caus_from_samples':[], "EIG_obs_closed_form":[], "EIG_caus_closed_form":[], "EIG_obs_bart":[], "EIG_caus_bart":[]}

In [7]:
for _,candidate in candidate_sites.items():
    print(f"For a sample size of {np.shape(candidate)[0]}")
    print(f" % treated in host: {round(100 * candidate['T'].mean(),2)}%")

For a sample size of 102
 % treated in host: 66.67%
For a sample size of 251
 % treated in host: 68.53%
For a sample size of 183
 % treated in host: 67.21%
For a sample size of 107
 % treated in host: 74.77%
For a sample size of 125
 % treated in host: 70.4%
For a sample size of 214
 % treated in host: 73.83%
For a sample size of 201
 % treated in host: 71.64%
For a sample size of 203
 % treated in host: 69.95%
For a sample size of 112
 % treated in host: 75.0%
For a sample size of 214
 % treated in host: 67.29%
For a sample size of 275
 % treated in host: 77.82%
For a sample size of 160
 % treated in host: 72.5%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample size of 113
 % treated in host: 73.45%
For a sample si

In [8]:
for _, candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)
    n_samples = n_samples_outer_expectation_obs * (n_samples_inner_expectation_obs + 1)

    results["EIG_obs_closed_form"].append(
            bayes_reg.closed_form_obs_EIG(X_cand)
            )
    results["EIG_caus_closed_form"].append(
            bayes_reg.closed_form_causal_EIG(X_cand)
            )

In [9]:
for _, candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)

    results["EIG_obs_from_samples"].append(
            bayes_reg.samples_obs_EIG(
                X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
            )
        )
    results["EIG_caus_from_samples"].append(
            bayes_reg.samples_causal_EIG(
                X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
            )
        )
    

In [10]:
# X_host, T_host, Y_host = host.drop(columns=['T','Y']).values, host['T'].values.astype(np.int32), host['Y'].values

# prior_hyperparameters = {'sigma_0_sq':1, 'p_categorical_pr':0, 'p_categorical_trt':0 }
# predictive_model_parameters={"num_trees_pr":200,"num_trees_trt":100}
# conditional_model_param={"num_trees_pr":200}


# for _, candidate in candidate_sites.items():

#     X_cand, T_cand = candidate.drop(columns=['Y','T']).values, candidate['T'].values.astype(np.int32)

#     bcf = BayesianCausalForest(
#         prior_hyperparameters,
#         predictive_model_parameters=predictive_model_parameters,
#         conditional_model_param=conditional_model_param)
#     bcf.store_train_data(X=X_host, T=T_host, Y=Y_host)
    
#     joint_eig = bcf.joint_EIG_calc(X_cand, T_cand, sampling_parameters)

#     results["EIG_obs_bart"].append(joint_eig["Obs EIG"])
#     results["EIG_caus_bart"].append(joint_eig["Causal EIG"])

In [11]:
# now merge and compute some CATE error
merged_datasets = {}

for i, candidate in candidate_sites.items():
    merged_datasets[i]= pd.concat([host, candidate], axis=0)

In [12]:
cate_diff = {}

model_y = GradientBoostingRegressor()
model_t = GradientBoostingClassifier()

for i, candidate in merged_datasets.items():
    
    X_merged = merged_datasets[i].filter(regex='^(?!T)').copy()
    X_merged = X_merged.drop(columns=["Y"])
    T_merged = merged_datasets[i]['T']
    Y_merged = merged_datasets[i]['Y']

    learner = TLearner(models= GradientBoostingRegressor())
    learner.fit(Y=Y_merged, T=T_merged, X=X_merged)
    cate = learner.effect(X_merged)
    ### need dataset with ground truth to compute some kind of errors here
    true_ite = data_with_groundtruth.loc[merged_datasets[i].index, 'ite']
    cate_diff[i]=np.mean(abs(cate - true_ite.values))


In [71]:
top_n = 20
eig_ranking_closed_form = sorted(range(len(results["EIG_caus_closed_form"])), key=lambda i: results["EIG_caus_closed_form"][i], reverse=True)[:top_n]
print(eig_ranking_closed_form)
eig_ranking_from_samples = sorted(range(len(results["EIG_caus_from_samples"])), key=lambda i: results["EIG_caus_from_samples"][i], reverse=True)[:top_n]
print(eig_ranking_from_samples)
true_cate_ranking = sorted(cate_diff, key=cate_diff.get, reverse=True)[:top_n]
print(true_cate_ranking)

[10, 1, 9, 6, 7, 2, 5, 11, 8, 0, 4, 3, 12, 13, 14, 15, 16, 17, 18, 19]
[10, 9, 2, 6, 1, 7, 8, 0, 3, 4, 5, 11, 23, 49, 33, 13, 28, 32, 12, 31]
[10, 4, 11, 1, 8, 2, 7, 5, 3, 9, 0, 6, 16, 41, 37, 13, 31, 47, 42, 45]


In [69]:
from scipy.stats import kendalltau, spearmanr

def average_precision_at_k(true_rankings, predicted_rankings, k):
    num_hits = 0
    sum_precision = 0
    for i, pred in enumerate(predicted_rankings[:k], 1):
        if pred in true_rankings:
            num_hits += 1
            sum_precision += num_hits / i
    if not true_rankings:
        return 0
    return sum_precision / min(len(true_rankings), k)

def mean_average_precision(true_rankings, predicted_rankings, k=None):
    if k is None:
        k = len(true_rankings)
    avg_precision = np.mean([average_precision_at_k(true_rankings, predicted_rankings, k_) for k_ in range(1, k + 1)])
    return avg_precision

def precision_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / k

def recall_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / len(true_rankings)

# Mean Reciprocal Rank (MRR)
def mrr(true_rankings, predicted_rankings):
    for i, pred in enumerate(predicted_rankings, 1):
        if pred in true_rankings:
            return 1 / i
    return 0


In [73]:
k = 20

tau_closed_form, p_value_closed_form = kendalltau(true_cate_ranking, eig_ranking_closed_form)
print('tau_closed_form '+str(tau_closed_form))
print('p_value_closed_form '+str(p_value_closed_form))

tau_from_samples, p_value_from_samples = kendalltau(true_cate_ranking, eig_ranking_from_samples)
print('tau_from_samples '+str(tau_from_samples))
print('p_value_from_samples '+str(p_value_from_samples))

rho_closed_form, _ = spearmanr(true_cate_ranking, eig_ranking_closed_form)
print('rho_closed_form '+str(rho_closed_form))

rho_from_samples, _ = spearmanr(true_cate_ranking, eig_ranking_from_samples)
print('rho_from_samples '+str(rho_from_samples))

print("Precision at K Closed Form for k=", k, 'is', precision_at_k(true_cate_ranking, eig_ranking_closed_form, k=k))
print("Precision at K From Samples for k=", k, 'is', precision_at_k(true_cate_ranking, eig_ranking_from_samples, k=k))

print("Recall at K Closed Form for k=", k, 'is', recall_at_k(true_cate_ranking, eig_ranking_closed_form, k=k))
print("Recall at K From Samples for k=", k, 'is', recall_at_k(true_cate_ranking, eig_ranking_from_samples, k=k))

print("Mean Average Precision Closed Form (MAP):", mean_average_precision(true_cate_ranking, eig_ranking_closed_form, k=k))
print("Mean Average Precision From Samples (MAP):", mean_average_precision(true_cate_ranking, eig_ranking_from_samples, k=k))

print('ndcg_closed_form '+str(ndcg_score([true_cate_ranking], [eig_ranking_closed_form])))
print('ndcg_from_samples '+str(ndcg_score([true_cate_ranking], [eig_ranking_from_samples])))

print('rank corr eig closed form '+ str(np.corrcoef(true_cate_ranking, eig_ranking_closed_form)[0, 1]))
print('rank corr eig from samples '+ str(np.corrcoef(true_cate_ranking, eig_ranking_from_samples)[0, 1]))


tau_closed_form 0.631578947368421
p_value_closed_form 3.6222609144151624e-05
tau_from_samples 0.5578947368421052
p_value_from_samples 0.00035899561704050146
rho_closed_form 0.819548872180451
rho_from_samples 0.7293233082706766
Precision at K Closed Form for k= 20 is 0.7
Precision at K From Samples for k= 20 is 0.7
Recall at K Closed Form for k= 20 is 0.7
Recall at K From Samples for k= 20 is 0.7
Mean Average Precision Closed Form (MAP): 0.9250419660302797
Mean Average Precision From Samples (MAP): 0.9098225080813365
ndcg_closed_form 0.974812858996945
ndcg_from_samples 0.956496732397278
rank corr eig closed form 0.8227256827698197
rank corr eig from samples 0.8291004918868893
