In [2]:
import toml
import sys
import numpy as np
import pandas as pd

from scipy.stats import norm
from scipy.special import logsumexp

import os

# Switch the working directory into ./bartpy/ exactly once per kernel session.
# `chdir` doubles as an "already done" flag: evaluating the bare name raises
# NameError the first time the cell runs, and succeeds on every re-run so the
# directory is not changed a second time.
# NOTE(review): presumably needed so the `bartpy` import below resolves — confirm.
try:
    chdir
    print("Changed")
except NameError:
    # Only the missing-flag case is handled; any other error (including a
    # keyboard interrupt) propagates instead of being silently swallowed.
    os.chdir("./bartpy/")
    chdir = True

from bartpy.sklearnmodel import SklearnModel

In [3]:
def logmeanexp(x):
    """Return ``log(mean(exp(x)))`` taken over all elements of ``x``.

    The sum is evaluated with ``scipy.special.logsumexp`` so large-magnitude
    entries do not overflow or underflow.

    Parameters
    ----------
    x : array_like
        Log-domain values. Lists and tuples are accepted as well as arrays.

    Returns
    -------
    float
        ``logsumexp(x) - log(N)`` where ``N`` is the number of elements.
    """
    # Coerce first: `.size` only exists on ndarrays, while logsumexp itself
    # already accepts any array-like.
    x = np.asarray(x)
    return logsumexp(x) - np.log(x.size)

In [93]:
# Index of the synthetic experiment to run; it is passed to Julia and selects
# the config file ../../data/synthetic/<experiment>.toml.
experiment = 12

In [94]:
# Start an embedded Julia runtime and enable the %julia line magic.
from julia import Julia
jl = Julia(compiled_modules=False)
%load_ext julia.magic

# Julia-side packages plus the project's synthetic-data module.
%julia using JLD
%julia using TOML
%julia using Random
%julia include("../../data/synthetic.jl")
%julia using .Synthetic

# Seed Julia's global RNG before generating data, then hand the Python
# `experiment` value to Julia and build the path of its TOML config.
%julia Random.seed!(1234)
%julia experiment = $experiment
%julia config_path = "../../data/synthetic/$(experiment).toml"
# Read data.obj_size from the experiment config into Python.
obj_size = %julia TOML.parsefile(config_path)["data"]["obj_size"]

# Generate the dataset. All seven return values stay in the Julia session
# (ftxu, U_ and epsY are used again when counterfactuals are computed);
# only T, X and Y are copied into Python.
# NOTE(review): U_ is presumably the unobserved confounder — confirm in synthetic.jl.
%julia SigmaU, U_, T_, X_, Y_, epsY, ftxu = generate_synthetic_confounder(config_path)
T, X, Y = %julia T_, X_, Y_
# Number of samples and of complete objects (runs of `obj_size` consecutive rows).
n = len(T)
nObjects = int(n / obj_size)

# One-hot object membership: row r is flagged in column r // obj_size.
# Rows beyond the last complete object keep an all-zero indicator.
object_ids = np.zeros((n, nObjects))
rows = np.arange(nObjects * obj_size)
object_ids[rows, rows // obj_size] = 1

# Design matrix: treatment as column 0, then the covariates, then the
# object indicators.
Z = np.hstack([T.reshape(-1, 1), X, object_ids])

The julia.magic extension is already loaded. To reload it, use:
  %reload_ext julia.magic


In [95]:
# Scratch check: the mean of T moved 95% of the way towards its minimum
# (the same expression is used as the lower end of the intervention grid).
np.mean(T) - (np.mean(T) - min(T)) * 0.95

0.027750000000000052

In [96]:
# Choose the intervention values do(T = t) that will be evaluated.
if T.max() != 1.:
    # Non-binary treatment: an evenly spaced grid whose end points sit 95% of
    # the way from the mean of T towards its observed minimum and maximum.
    binary = False
    doTnSteps = 20
    lower, upper = (np.mean(T) - (np.mean(T) - extreme) * 0.95
                    for extreme in (min(T), max(T)))
    doTs = np.linspace(lower, upper, num=doTnSteps)
else:
    # A maximum of exactly 1 is taken to mean a binary treatment:
    # evaluate both arms.
    binary = True
    doTs = [1., 0.]

In [97]:
# Sampler settings gathered in one place before the model is built.
bart_settings = dict(
    n_samples=10,
    n_chains=10,
    n_burn=50,
    n_trees=10,
    thin=1,
    store_in_sample_predictions=False,
)
model = SklearnModel(**bart_settings)
# Fit the outcome Y on the design matrix Z; left as the last expression so
# the fitted estimator's repr is shown as the cell output.
model.fit(Z, Y)

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)


Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
  4%|▍         | 2/50 [00:00<00:04, 10.10it/s]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
 10%|█         | 5/50 [00:00<00:04,  9.38it/s]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
  2%|▏         | 1/50 [00:00<00:08,  5.88it/s]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
  2%|▏         | 1/50 [00:00<00:05,  9.51it/s]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
  6%|▌         | 3/50 [00:00<00:05,  8.88it/s]]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
 22%|██▏       | 11/50 [00:01<00:03, 10.29it/s]

Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
100%|██████████| 50/50 [00:04<00:00, 11.83it/s]
 80%|████████  | 40/50 [00:03<00:00, 12.15it/s]

Starting sampling


100%|██████████| 50/50 [00:04<00:00, 11.59it/s]


Starting sampling


100%|██████████| 50/50 [00:04<00:00, 11.57it/s]


Starting sampling


100%|██████████| 10/10 [00:00<00:00, 13.58it/s]
 86%|████████▌ | 43/50 [00:03<00:00, 12.81it/s]


Starting sampling


100%|██████████| 50/50 [00:04<00:00, 11.53it/s]


Starting sampling


 70%|███████   | 7/10 [00:00<00:00, 11.23it/s]]
100%|██████████| 50/50 [00:04<00:00, 11.30it/s]


Starting sampling


100%|██████████| 10/10 [00:00<00:00, 11.17it/s]
100%|██████████| 50/50 [00:04<00:00, 11.11it/s]


Starting sampling


 60%|██████    | 6/10 [00:00<00:00, 10.18it/s]]


Starting sampling


 40%|████      | 4/10 [00:00<00:00, 15.02it/s]]
100%|██████████| 10/10 [00:00<00:00, 12.65it/s]
100%|██████████| 10/10 [00:00<00:00, 13.86it/s]
100%|██████████| 10/10 [00:00<00:00, 17.08it/s]
100%|██████████| 10/10 [00:00<00:00, 17.97it/s]


Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)


Starting burn


  prob_value_selected_within_variable = np.log(mutation.existing_node.data.proportion_of_value_in_variable(splitting_variable, splitting_value))
  return self.log_transition_ratio(tree, mutation) + self.log_likihood_ratio(model, tree, mutation) + self.log_tree_ratio(model, tree, mutation)
100%|██████████| 50/50 [00:01<00:00, 32.66it/s]


Starting sampling


100%|██████████| 50/50 [00:01<00:00, 29.71it/s]


Starting sampling


100%|██████████| 10/10 [00:00<00:00, 32.20it/s]
100%|██████████| 10/10 [00:00<00:00, 32.26it/s]


SklearnModel(alpha=0.95, beta=2.0,
       initializer=<bartpy.initializers.sklearntreeinitializer.SklearnTreeInitializer object at 0x11fec7ac8>,
       n_burn=50, n_chains=10, n_jobs=-1, n_samples=10, n_trees=10,
       sigma_a=0.001, sigma_b=0.001, store_acceptance_trace=True,
       store_in_sample_predictions=False, thin=1,
       tree_sampler=<bartpy.samplers.unconstrainedtree.treemutation.UnconstrainedTreeMutationSampler object at 0x11fec7a90>)

In [98]:
# Separate copy of the design matrix for counterfactual queries, so that
# overwriting its treatment column (column 0) leaves Z itself untouched.
Zcf = Z.copy()

def PEHE(effect, effect_pred):
    """Root-mean-square difference between true and predicted effects.

    Parameters
    ----------
    effect : numpy.ndarray
        Reference effect values.
    effect_pred : numpy.ndarray
        Predicted effect values, broadcastable against ``effect``.

    Returns
    -------
    float
        ``sqrt(mean((effect - effect_pred) ** 2))`` over all elements.
    """
    residual = effect - effect_pred
    return np.sqrt(np.mean(np.square(residual)))

# One PEHE score per intervention value in doTs.
PEHEs = np.zeros_like(doTs)


for i, doT in enumerate(doTs):
    # Julia side: select the units whose observed treatment differs from doT
    # and evaluate ftxu for them with the treatment set to doT, giving the
    # reference counterfactual outcomes.
    mask = %julia mask = T_ .!= $doT
    Ycf = %julia ftxu(fill($doT, sum(mask)), X_[mask, :], U_[mask, :], epsY[mask])
    # Recompute the same selection on the Python array (replacing the value
    # returned from Julia) so it can index Z, Zcf and Y.
    mask = T != doT
    
    # Set the treatment column of the counterfactual design matrix to doT.
    Zcf[:, 0] = doT
    # Model predictions under the observed and the intervened treatment.
    Y_pred = model.predict(Z[mask])
    Ycf_pred = model.predict(Zcf[mask])
    # Reference effect uses the observed outcome and the Julia counterfactual;
    # predicted effect uses the model for both terms.
    effect = Y[mask] - Ycf
    effect_pred = Y_pred - Ycf_pred
    PEHEs[i] = PEHE(effect, effect_pred)

# Average PEHE across all intervention values.
print(PEHEs.mean())



6.132312536994409
