In [1]:
import numpy as np
import pandas as pd
import torch
torch.set_default_tensor_type(torch.FloatTensor) 
import copy
import sys
import os
notebook_dir = os.getcwd()
parent_dir = os.path.dirname(notebook_dir)
# Add the parent directory to the Python path
sys.path.append(parent_dir)
sys.path.append('/Users/lucile/causal_info_gain/causal_prospective_merge/data')

from rct_data_generator import *
from outcome_models import *
from plotting_functions import *
from mcmc_bayes_update import *
from eig_comp_utils import *
from research_exp_utils import *

from econml.metalearners import TLearner
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier


  _C._set_default_tensor_type(t)


In [2]:
from scipy.stats import kendalltau, spearmanr

def average_precision_at_k(true_rankings, predicted_rankings, k):
    num_hits = 0
    sum_precision = 0
    for i, pred in enumerate(predicted_rankings[:k], 1):
        if pred in true_rankings:
            num_hits += 1
            sum_precision += num_hits / i
    if not true_rankings:
        return 0
    return sum_precision / min(len(true_rankings), k)

def mean_average_precision(true_rankings, predicted_rankings, k=None):
    if k is None:
        k = len(true_rankings)
    avg_precision = np.mean([average_precision_at_k(true_rankings, predicted_rankings, k_) for k_ in range(1, k + 1)])
    return avg_precision

def precision_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / k

def recall_at_k(true_rankings, predicted_rankings, k):
    intersection = set(predicted_rankings[:k]) & set(true_rankings)
    return len(intersection) / len(true_rankings)

def mrr(true_rankings, predicted_rankings):
    for i, pred in enumerate(predicted_rankings, 1):
        if pred in true_rankings:
            return 1 / i
    return 0

def ndcg(true_rankings, predicted_rankings, k=None):
    if k is None:
        k = len(true_rankings)
    dcg = sum(2 ** true_rankings[i] - 1 / np.log2(i + 2) for i in range(k))
    ideal_rankings = sorted(true_rankings, reverse=True)
    ideal_dcg = sum(2 ** ideal_rankings[i] - 1 / np.log2(i + 2) for i in range(k))
    return dcg / ideal_dcg

In [3]:
path = '/Users/lucile/causal_info_gain/causal_prospective_merge/'
data_with_groundtruth, x, t, y = get_data('twins', path)
data_with_groundtruth.dropna(inplace=True)
data_with_groundtruth = data_with_groundtruth.rename(columns={'t': 'T', 'y': 'Y'})
XandT = data_with_groundtruth.drop(columns=['Y','y0','y1','ite'])
XandT.head()


Unnamed: 0,eclamp,gestatcat1,gestatcat2,gestatcat3,gestatcat4,gestatcat5,gestatcat6,gestatcat7,gestatcat8,gestatcat9,...,brstate_reg,feduc6,dfageq,nprevistq,data_year,crace,birmon,dtotord_min,dlivord_min,T
0,0.0,0.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,...,5.0,2.0,1.0,0.0,0.0,0.0,0.0,3.0,3.0,1.0
1,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,...,5.0,5.0,8.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0
2,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,...,5.0,2.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,...,5.0,4.0,6.0,0.0,0.0,0.0,0.0,2.0,2.0,1.0
4,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,5.0,4.0,7.0,0.0,0.0,1.0,0.0,3.0,3.0,0.0


In [4]:
number_of_candidate_sites = 50
min_sample_size_cand = 100
max_sample_size_cand = 200
host_sample_size = 200 
desired_initial_sample_size = 10**4
XandT = XandT.sample(n=desired_initial_sample_size, replace=True, random_state=42)

outcome_function = None
std_true_y = 1
power_x = 1
power_x_t = 1
sigma_rand_error = 1

exp_parameters = {'number_of_candidate_sites': number_of_candidate_sites+1, 'min_sample_size_cand': min_sample_size_cand, \
                'max_sample_size_cand': max_sample_size_cand, 'host_sample_size': host_sample_size, 'outcome_function': outcome_function, \
                'std_true_y': std_true_y, 'power_x': power_x, 'power_x_t': power_x_t}

causal_param_first_index = causal_param_first_index = power_x * np.shape(XandT)[1] + 1   

In [32]:
def generating_random_sites_from(data, exp_parameters):
    
    candidates = {}
    sample_size, number_covariates = np.shape(data)[0], np.shape(data)[1]
    function_indices = {0: lambda X: np.log(X+1), 1: lambda X: X**3, 2: lambda X: X, 3: lambda X: X**2}
    number_of_candidate_sites = exp_parameters['number_of_candidate_sites']
    min_sample_size_cand = exp_parameters['min_sample_size_cand']
    max_sample_size_cand = exp_parameters['max_sample_size_cand']
    outcome_function = None
    std_true_y = exp_parameters['std_true_y']
    power_x = exp_parameters['power_x']
    power_x_t = exp_parameters['power_x_t']
    number_features = number_covariates
    created_sites = 0
    
    while created_sites < number_of_candidate_sites+1:

        np.random.seed(created_sites)
        
        selected_features_for_subsampling = np.random.randint(2, size = number_features) 
        # binary bool vector representing selection for being an input of the sampling function
        random_coefs = [np.random.uniform(-10, 10) for _ in range(number_features)] 
        random_fct_idx = [np.random.randint(0, 4) for _ in range(number_features)] 
        
        def p_assigned_to_site(X, T, eps):
            result = 0
            for j in range(number_features-1):
                result += selected_features_for_subsampling[j] * random_coefs[j] * function_indices[random_fct_idx[j]](X[j])
            result += selected_features_for_subsampling[-1] * random_coefs[-1] *  function_indices[random_fct_idx[-1]](T)
            return sigmoid(result + eps)
        
        sample_size = np.random.randint(min_sample_size_cand, max_sample_size_cand + 1)  # Add 1 to include max_sample_size_cand
        if created_sites==0:
            sample_size = exp_parameters['host_sample_size']
        design_data_cand = subsample_one_dataset(XandT, p_assigned_to_site, sample_size, power_x, power_x_t, outcome_function, std_true_y, seed=created_sites)
        any_nan = design_data_cand.isna().any().any()
        if not design_data_cand.empty and not any_nan: # we're appending
            candidates[created_sites] = design_data_cand
        else:
            number_of_candidate_sites+=1 # not appending
        created_sites += 1

    return candidates

In [33]:
#dictionnary of random sites
candidate_sites = generating_random_sites_from(XandT, exp_parameters)
for i, cand in candidate_sites.items():
    candidate_sites[i] = pd.concat([cand, data_with_groundtruth.loc[cand.index, 'Y']], axis=1)
    
host = candidate_sites[0]
candidate_sites = {key: value for key, value in candidate_sites.items() if key != 0}
XandT_host, Y_host = torch.from_numpy(host.drop(columns=["Y"]).values), torch.from_numpy(host["Y"].values)

# Prior parameters for Bayesian update on host
d = np.shape(host)[1]-1
prior_mean = torch.zeros(d)
sigma_prior = 1
beta_0, sigma_0_sq, inv_cov_0 = prior_mean, sigma_rand_error,torch.eye(d)
prior_hyperparameters = {'beta_0': beta_0, 'sigma_0_sq': sigma_0_sq,"inv_cov_0":inv_cov_0}


overflow encountered in exp


In [35]:
n_samples_outer_expectation_obs = 400
n_samples_inner_expectation_obs = 800
n_samples_outer_expectation_caus = 400
n_samples_inner_expectation_caus = 800

sampling_parameters = {'n_samples_inner_expectation_obs':n_samples_inner_expectation_obs, 'n_samples_outer_expectation_obs':n_samples_outer_expectation_obs, \
                       'n_samples_inner_expectation_caus':n_samples_inner_expectation_caus, 'n_samples_outer_expectation_caus':n_samples_outer_expectation_caus}

eig_results = {"EIG_obs_from_samples": [], 'EIG_caus_from_samples':[], "EIG_obs_closed_form":[], "EIG_caus_closed_form":[], "EIG_obs_bart":[], "EIG_caus_bart":[]}

In [36]:
for _,candidate in candidate_sites.items():
    print(f"For a sample size of {np.shape(candidate)[0]}")
    print(f" % treated in host: {round(100 * candidate['T'].mean(),2)}%")

For a sample size of 200
 % treated in host: 71.0%
For a sample size of 144
 % treated in host: 72.22%
For a sample size of 182
 % treated in host: 68.13%
For a sample size of 175
 % treated in host: 73.71%
For a sample size of 195
 % treated in host: 71.79%
For a sample size of 103
 % treated in host: 72.82%
For a sample size of 124
 % treated in host: 76.61%
For a sample size of 156
 % treated in host: 69.23%
For a sample size of 184
 % treated in host: 70.11%
For a sample size of 148
 % treated in host: 78.38%
For a sample size of 104
 % treated in host: 68.27%
For a sample size of 145
 % treated in host: 65.52%
For a sample size of 122
 % treated in host: 68.03%
For a sample size of 133
 % treated in host: 67.67%
For a sample size of 198
 % treated in host: 69.7%
For a sample size of 139
 % treated in host: 76.26%
For a sample size of 137
 % treated in host: 70.8%
For a sample size of 177
 % treated in host: 68.93%
For a sample size of 137
 % treated in host: 72.99%
For a sample si

In [37]:
eig_results["EIG_obs_closed_form"]=[]
eig_results["EIG_caus_closed_form"]=[]

for _, candidate in candidate_sites.items():
    X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
    bayes_reg = BayesianLinearRegression(prior_hyperparameters)
    bayes_reg.set_causal_index(causal_param_first_index)
    post_host_parameters = bayes_reg.fit(XandT_host, Y_host)
    n_samples = n_samples_outer_expectation_obs * (n_samples_inner_expectation_obs + 1)

    eig_results["EIG_obs_closed_form"].append(
            bayes_reg.closed_form_obs_EIG(X_cand)
            )
    eig_results["EIG_caus_closed_form"].append(
            bayes_reg.closed_form_causal_EIG(X_cand)
            )

In [10]:
# eig_results["EIG_obs_from_samples"]=[]
# eig_results["EIG_caus_from_samples"]=[]

# for _, candidate in candidate_sites.items():
#     X_cand = torch.from_numpy(candidate.drop(columns=["Y"]).values)
#     bayes_reg = BayesianLinearRegression(prior_hyperparameters)
#     bayes_reg.set_causal_index(causal_param_first_index)
#     post_host_parameters = bayes_reg.fit(XandT_host, Y_host)

#     eig_results["EIG_obs_from_samples"].append(
#             bayes_reg.samples_obs_EIG(
#                 X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
#             )
#         )
#     eig_results["EIG_caus_from_samples"].append(
#             bayes_reg.samples_causal_EIG(
#                 X_cand, n_samples_outer_expectation_obs, n_samples_inner_expectation_obs
#             )
#         )

In [11]:
# X_host, T_host, Y_host = host.drop(columns=['T','Y']).values, host['T'].values.astype(np.int32), host['Y'].values

# prior_hyperparameters = {'sigma_0_sq':1, 'p_categorical_pr':0, 'p_categorical_trt':0 }
# predictive_model_parameters={"num_trees_pr":200,"num_trees_trt":100}
# conditional_model_param={"num_trees_pr":200}


# for _, candidate in candidate_sites.items():

#     X_cand, T_cand = candidate.drop(columns=['Y','T']).values, candidate['T'].values.astype(np.int32)

#     bcf = BayesianCausalForest(
#         prior_hyperparameters,
#         predictive_model_parameters=predictive_model_parameters,
#         conditional_model_param=conditional_model_param)
#     bcf.store_train_data(X=X_host, T=T_host, Y=Y_host)
    
#     joint_eig = bcf.joint_EIG_calc(X_cand, T_cand, sampling_parameters)

#     results["EIG_obs_bart"].append(joint_eig["Obs EIG"])
#     results["EIG_caus_bart"].append(joint_eig["Causal EIG"])

In [50]:
# now merge and compute some CATE error
merged_datasets = {}

for i, candidate in candidate_sites.items():
    merged_datasets[i]= pd.concat([host, candidate], axis=0)

In [55]:
cate_diff = {}

from sklearn.metrics import mean_squared_error

model_y = GradientBoostingRegressor()
model_t = GradientBoostingClassifier()

for i, candidate in merged_datasets.items():
    
    X_merged = merged_datasets[i].filter(regex='^(?!T)').copy()
    X_merged = X_merged.drop(columns=["Y"])
    T_merged = merged_datasets[i]['T']
    Y_merged = merged_datasets[i]['Y']

    learner = TLearner(models= GradientBoostingRegressor())
    learner.fit(Y=Y_merged, T=T_merged, X=X_merged)
    pred_cate = learner.effect(X_merged)
    ### need dataset with ground truth to compute some kind of errors here
    true_cate = data_with_groundtruth.loc[merged_datasets[i].index, 'ite']
    pred_cate
    cate_diff[i]= - mean_squared_error(true_cate, pred_cate)


In [57]:
top_n = 30
obs_eig_ranking_closed_form = sorted(range(len(eig_results["EIG_obs_closed_form"])), key=lambda i: eig_results["EIG_obs_closed_form"][i], reverse=True)[:top_n]
print(obs_eig_ranking_closed_form)
caus_eig_ranking_closed_form = sorted(range(len(eig_results["EIG_caus_closed_form"])), key=lambda i: eig_results["EIG_caus_closed_form"][i], reverse=True)[:top_n]
print(caus_eig_ranking_closed_form)
# eig_ranking_from_samples = sorted(range(len(eig_results["EIG_caus_from_samples"])), key=lambda i: eig_results["EIG_caus_from_samples"][i], reverse=True)[:top_n]
# print(eig_ranking_from_samples)
true_cate_ranking = sorted(cate_diff, key=cate_diff.get, reverse=True)[:top_n]
print(true_cate_ranking)

[3, 51, 14, 27, 44, 29, 25, 4, 2, 17, 11, 49, 7, 23, 43, 36, 19, 8, 24, 18, 9, 1, 30, 15, 28, 16, 34, 39, 22, 33]
[3, 11, 25, 14, 27, 44, 51, 2, 17, 29, 4, 49, 7, 23, 36, 24, 43, 28, 19, 18, 8, 9, 30, 1, 46, 15, 16, 34, 39, 22]
[36, 41, 51, 9, 24, 46, 25, 6, 21, 19, 39, 47, 34, 40, 20, 33, 22, 28, 18, 52, 13, 29, 43, 23, 16, 50, 42, 1, 12, 8]


In [58]:
k = 10

tau_closed_form, p_value_tau_closed_form = kendalltau(true_cate_ranking, caus_eig_ranking_closed_form)
rho_closed_form, p_value_rho_closed_form = spearmanr(true_cate_ranking, caus_eig_ranking_closed_form)
precision_at_k_closed_form = precision_at_k(true_cate_ranking, caus_eig_ranking_closed_form, k=k)
recall_at_k_closed_form = recall_at_k(true_cate_ranking, caus_eig_ranking_closed_form, k=k)
map_closed_form = mean_average_precision(true_cate_ranking, caus_eig_ranking_closed_form, k=k)
ndcg_closed_form = ndcg(true_cate_ranking, caus_eig_ranking_closed_form, k)
rank_corr_closed_form = np.corrcoef(true_cate_ranking, caus_eig_ranking_closed_form)[0, 1]
mrr_closed_form = mrr(true_cate_ranking, caus_eig_ranking_closed_form)


# tau_from_samples, p_value_tau_from_samples = kendalltau(true_cate_ranking, eig_ranking_from_samples)
# rho_from_samples, p_value_rho_from_samples = spearmanr(true_cate_ranking, eig_ranking_from_samples)
# precision_at_k_from_samples = precision_at_k(true_cate_ranking, eig_ranking_from_samples, k=k)
# recall_at_k_from_samples = recall_at_k(true_cate_ranking, eig_ranking_from_samples, k=k)
# map_from_samples = mean_average_precision(true_cate_ranking, eig_ranking_from_samples, k=k)
# ndcg_from_samples = ndcg(true_cate_ranking, eig_ranking_from_samples, k)
# rank_corr_from_samples = np.corrcoef(true_cate_ranking, eig_ranking_from_samples)[0, 1]
# mrr_from_samples = mrr(true_cate_ranking, eig_ranking_from_samples)



correlation_with_true_rankings={'tau':[tau_closed_form],'rho':[rho_closed_form], \
      'precision_at_k': [precision_at_k_closed_form], \
      'recall_at_k':[recall_at_k_closed_form], \
      'mean average precision': [map_closed_form], 
      'ndcg': [ndcg_closed_form], \
      'rank corr eig': [rank_corr_closed_form], \
      "mean reciprocal rank": [mrr_closed_form]}

correlation_with_true_rankings= pd.DataFrame.from_dict(correlation_with_true_rankings)
correlation_with_true_rankings.index = ['caus_closed_form']
correlation_with_true_rankings


Unnamed: 0,tau,rho,precision_at_k,recall_at_k,mean average precision,ndcg,rank corr eig,mean reciprocal rank
caus_closed_form,0.002299,-0.007786,0.3,0.1,0.064317,0.286633,-0.02222,0.333333
