In [4]:
import numpy as np
import rpy2
import rpy2.robjects as robjects
import pickle
from time import time
import os

# RPY2 is used an interconnect between Python and R. It allows
# my to run R code from python which makes this experimentation
# process smoother.
from rpy2.robjects import IntVector, FloatVector, Formula
from rpy2.robjects.packages import importr
from rpy2.robjects import numpy2ri
numpy2ri.activate()

stats = importr('stats') # standard regression package
matching = importr('Matching') # GenMatch package
snow = importr('snow') # cluster manager

## 1. Define an Experimental Framework

### A. Data Generation

#### Raw Data

Generate raw data in line with the descriptions in Section 1 of the paper. 

In [5]:
# GLOBAL CONFIG

# Var count
n_vars = 10

# Data types (default is standard normal)
binary_indeces = [1, 3, 6, 8, 9]
binarize = True

# Associations between vars an treat/outcome
treat_vars = [0,1,2,3,4,5,6,7]
outcome_vars = [0,1,2,3,4,8,9,10]

# Treat/outcome generation weights
assignment_weights = np.array([0, 0.8, -0.25, 0.6, -0.4, -0.8, -0.5, 0.7])
outcome_weights = np.array([-3.85, 0.3, -0.36, -0.73, -0.2, 0.71, -0.19, 0.26])
true_treat_effect = -0.4

def generate_data(n_samples=1000):
    # Generate 10 Random Vars
    # 1-4 are confounders: associated with outcome + treatment
    # 5-7 are exposure predictors
    # 8-10 are outcome predictors
    X = np.random.normal(loc=0.0, scale=1.0, size=(n_samples, n_vars))

    # Binarize specified vars if requested.
    if binarize:
        for var in binary_indeces:
            X[:, var-1] = (X[:, var -1] > 0).astype(int)

    # Add dummy for bias param     
    X = np.hstack([np.ones((n_samples, 1)), X])
    return X

In [6]:
# DEBUG
# X = generate_data(2000)
# X.shape

#### Assignment Models

The code below implements the various assigment models described in Section 1 of the paper. 

In [7]:
# Create the models

assignment_models={}

def nonlinear_transform(X, B, quad_indeces):
    for quad_index in quad_indeces:
        quad = X[:, quad_index]**2
        X = np.hstack([X, quad.reshape(-1, 1)])
        B = np.append(B, B[quad_index])
    
    return X, B

def nonadditive_transform(X, B, interaction_indeces, interaction_weights=None):
    for interaction_index, var_indeces in enumerate(interaction_indeces):
        int_1, int_2 = var_indeces
        interaction_val = X[:, int_1]*X[:, int_2]
        
        if not interaction_weights:
            interaction_val = interaction_val*0.5
        else:
            interaction_val = interaction_val*interaction_weights[interaction_index]
            
        X = np.hstack([X, interaction_val.reshape(-1, 1)])
        B = np.append(B, B[int_1])
    
    return X, B

# Scenario 1
assignment_models["A_add_lin"] = lambda B, X: np.dot(X, B)

# Scenario 2:     
assignment_models["B_add_mild_nlin"] = lambda B, X: np.dot(*nonlinear_transform(X, B,
                                                       quad_indeces=[2]))
# Scenario 3:
assignment_models["C_add_mod_nlin"] = lambda B, X: np.dot(*nonlinear_transform(X, B,
                                                       quad_indeces=[2, 4, 7]))
# Scenario 4:
assignment_models["D_mild_nadd_lin"] = lambda B, X: np.dot(*nonadditive_transform(X, B,
                                                       interaction_indeces=[(1,3), (2, 4), (4,5), (5,6)]))

# Scenario 5:
assignment_models["E_mild_nadd_mild_nlin"] = lambda B, X: np.dot(*nonlinear_transform(*nonadditive_transform(X, B,
                                                       interaction_indeces=[(1,3), (2, 4), (4,5), (5,6)]), quad_indeces=[2]))
# Scenario 6
assignment_models["F_mod_nadd_lin"] = lambda B, X: np.dot(*nonadditive_transform(X, B,
                                                       interaction_indeces=[(1,3), (2, 4), (3,5), (4,6), (5,7), (1,6), (2,3),
                                                                            (3,4), (4,5), (5,6)],
                                                       interaction_weights=[0.5, 0.7, 0.5, 0.7, 0.5, 0.5, 0.7, 0.5, 0.5, 0.5]))
# Scenario 7
assignment_models["G_mod_nadd_mod_nlin"] = lambda B, X: np.dot(*nonlinear_transform(*nonadditive_transform(X, B,
                                                       interaction_indeces=[(1,3), (2, 4), (3,5), (4,6), (5,7), (1,6), (2,3),
                                                                            (3,4), (4,5), (5,6)],
                                                       interaction_weights=[0.5, 0.7, 0.5, 0.7, 0.5, 0.5, 0.7, 0.5, 0.5, 0.5]), 
                                                                            quad_indeces=[2, 4, 7]))

assignment_model_names = ['A_add_lin', 'B_add_mild_nlin', 'C_add_mod_nlin', 'D_mild_nadd_lin',
                     'E_mild_nadd_mild_nlin', 'F_mod_nadd_lin', 'G_mod_nadd_mod_nlin']

In [8]:
# Tests 
assert(set(assignment_models["A_add_lin"](np.array([2, 0.5, 1.5]),
                                                np.array([[1, 2,4], [1, 10, 20]]))) == set([9, 37]))

assert(set(assignment_models["B_add_mild_nlin"](np.array([2, 0.5, 1.5]),
                                                np.array([[1, 2,4], [1, 10, 20]]))) == set([33, 637]))

assert(set(assignment_models["C_add_mod_nlin"](np.array([2, 0.5, 1.5, 1, 1, 1, 2, 3]),
                                                np.array([[1, 2,4,5,6,7,8,9], [1, 10, 20, 30, 40, 50, 60, 60]]))) == set([373, 13457]))

assert(set(assignment_models["D_mild_nadd_lin"](np.array([2, 0.5, 1.5, 1, 1, 1, 2, 3]),
                                                np.array([[1, 2,4,5,6,7,8,9], [1, 10, 20, 30, 40, 50, 60, 60]]))) == set([139.5, 3632]))

assert(set(assignment_models["E_mild_nadd_mild_nlin"](np.array([2, 0.5, 1.5, 1, 1, 1, 2, 3]),
                                                np.array([[1, 2,4,5,6,7,8,9], [1, 10, 20, 30, 40, 50, 60, 60]]))) == set([163.5, 4232]))


In [9]:
# Return assignments for a set of data
# by applying the specified assignment model
# and then assigning treatment status probabilistically.
def get_assignments(B, X, n_samples, scenario="A_add_lin"):
    X_usable = X[:, treat_vars]
    
    # Calculate the probabilities of assignment
    linear_assignment_data = assignment_models[scenario](B, X_usable)
    p_treat = 1.0/(1+np.exp(-1*linear_assignment_data))

    # Assign
    rand = np.random.random(n_samples)
    assignments = (rand < p_treat).astype(int)
    
    return assignments

#### Outcome Data

Get outcomes for given data based on the model defined in Section 1

In [10]:
def get_outcomes(B, X, assignments, effect=true_treat_effect):
    X_usable = X[:, outcome_vars]
    return effect*assignments + np.dot(X_usable, B)

In [11]:
# DEBUG
# assignments = get_assignments(assignment_weights, X, "mild_nonaddititive_mild_nonlinear")
# outcomes = get_outcomes(outcome_weights, X, assignments)

Helper function to return a dataset of desired size based on a given assignment model. 

In [12]:
def get_data(n_samples, assignment_model):
    X = generate_data(n_samples)
    assignments = get_assignments(assignment_weights, X,
                                  n_samples, assignment_model)

    outcomes = get_outcomes(outcome_weights, X, assignments)
    
    return assignments, outcomes, X

### B. Cluster Compute

Given that some of the trials were going to >48 hours to run, I needed a way to speed things up. GenMatch can be run on a cluster because mutation and evaluation of a generation is highly parrallelizable. The code below wraps the snow library in R to manage local and remote clusters. 

The simple first option to split computation across CPU cores locally. This produces a 5-10x speedup depending on your machine. The second option is to go remote and explote 32 cores for $0.9 an hour on multiple AWS machines. The remote option relies on the ability of the master machine to ssh into the slave nodes without a password. On AWS this was configured based on the static master IP address. 

Be careful about bandwith! On trial of 1000 runs may use up to 10TB of data transfer in the cluster. This resulted in some unfortunate AWS spending. 

In [13]:
AWS_MASTER_DNS="ip-172-31-42-147.ec2.internal"
AWS_SLAVE_1 = "ubuntu@ip-172-31-43-193.ec2.internal"
AWS_SLAVE_2 = "ubuntu@ip-172-31-81-244.ec2.internal"
AWS_MASTER_PORT_RANGE = list(range(11305, 11340))

class ClusterProvider(object):
    def __init__(self, n_nodes=8, remote_hosts=None, ports=None):
        if remote_hosts is None:
            self.cl = snow.makeSOCKcluster(["localhost"]*n_nodes)
        else:
            # Set the acceptable ports for connection
            # from the slaves
            if not ports:
                ports = AWS_MASTER_PORT_RANGE
            
            # Construct the connection string
            addresses = []
            for remote_host, n_nodes in remote_hosts:
                addresses+=[remote_host]*n_nodes
                
            self.cl = snow.makeSOCKcluster(addresses, rscript="Rscript", manual=False, snowlib="/usr/local/lib/R/site-library",
                                           port=IntVector(ports), master=AWS_MASTER_DNS, outfile="/dev/stdout", timeout=10)
    
    def get_cluster(self):
        return self.cl
    
    def kill_cluster(self):
        snow.stopCluster(self.cl)

In [10]:
# Local cluster
cluster_provider = ClusterProvider(n_nodes=8)

In [11]:
# Remote cluster
# cluster_provider = ClusterProvider(remote_hosts=[(AWS_SLAVE_1, 8)],
#                                     ports = list(range(11305, 11314)))

In [12]:
# Run this with True to kill the cluster
kill = False # termination protection
if kill:
    cluster_provider.kill_cluster()

### C. Flexible Causal Inference Methods

The code below implements the matching methods described in the paper: propensity score matching and GenMatch. The idea behind this code was too allow multiple different configurations of each method to be run with simple parameter flags. The goal being to reduce code repetition and prevent inconsistencies arising from differeing implementations. I am largely happy with the result because, as is visible later, these functions do expose a very clean experimental API. However, allowing for many different configurations requires some ugly code at the beginning of each function. Excuse the mess!

#### Estimators

Define methods which can process outcomes, assignments and covariate data into a treatment effect estimate. 

1. Logistic Regression
2. GenMatch
3. Distance matching based on Manual Mahalanobis metric (for use with the latent distributions produced by the VAE). 

In [14]:
# Helper function which runs logistic regression in R
# to determine propensity scores for a dataset. This is used
# in the propensity score matching method and in GenMatch
def get_propensity_scores(assignments, covariate_data):
    # Setup
    y = IntVector(assignments)
    fmla = Formula('y ~ X')
    env = fmla.environment
    
    # Run propensiy regression
    env['X'] = covariate_data
    env['y'] = y
    fit = stats.glm(fmla, family="binomial")
    
    # DEBUG: fit.rx("coefficients")
    return fit.rx2("fitted.values")

In [15]:
# 1. Logisic Regression Propensity Matching
def logistic_prop_matching_est(outcomes, assignments, covariate_data, *args, **kwargs):
    global gm_warnings
    logistic_propensity_scores = get_propensity_scores(assignments, covariate_data)
    
    # Use prop scores from the neural network regression
    # if supplied
    nn_p_scores = kwargs.get("nn_p_scores", None)
    if nn_p_scores is not None:
        if gm_warnings:
            print("Using p-scores from neural net")
        propensity_scores = nn_p_scores
    else:
        propensity_scores = logistic_propensity_scores
    
    # Run matching on prop scores
    match_out = matching.Match(
        Y=FloatVector(outcomes),
        Tr=IntVector(assignments),
        X=propensity_scores,
        replace=True)
    
    gm_warnings = False # only warn once
    return np.array(match_out.rx2("est").rx(1,1))[0]

In [16]:
# 2. GenMatch Matching
def genmatch_est(outcomes, assignments, covariate_data, *args, **kwargs):
    global gm_warnings
    
    # Get the singleton cluster
    cl = cluster_provider.get_cluster()
    
    # Flag on whethert or not to use propensity scores
    # in GenMatch
    if kwargs.get("genmatch_with_prop_scores", True):
        if gm_warnings:
            print("Genmatch running with p scores")
        
        # Parameter to allow prop scores to
        # be derived from custom data
        propensity_vars = kwargs.get("propensity_vars", None)
        if propensity_vars is None:
            propensity_vars = covariate_data
        else:
            if gm_warnings:
                print("Finding propensity scores with custom vars")

        logistic_p_scores = np.array(get_propensity_scores(assignments, propensity_vars))
            
        nn_p_scores = kwargs.get("nn_p_scores", None)
        
        # Use either the prop scores from the neural or the logistic regression
        # or both!
        if (nn_p_scores is not None) and kwargs.get("nn_p_scores_with_logistic", False):
            if gm_warnings:
                print("Using neural net and logistic p scores")
            matching_data = np.hstack([covariate_data, nn_p_scores.reshape(-1, 1),
                                       logistic_p_scores.reshape(-1, 1)])
            
        elif (nn_p_scores is not None):
            if gm_warnings:
                print("Using neural net  p scores")
            matching_data = np.hstack([covariate_data, nn_p_scores.reshape(-1, 1)])
        else:
            if gm_warnings:
                print("Using logistic p scores")
            matching_data = np.hstack([covariate_data, logistic_p_scores.reshape(-1, 1)])
             
    else:
        if gm_warnings:
            print("Not using prop scores")
        matching_data = covariate_data
    
    # Alllow evaluation of balance on custom vars
    balance_vars = kwargs.get("balance_vars", None)
    if balance_vars is None:
        balance_vars = covariate_data
    else:
        if gm_warnings:
            print("Evaluating balance on custom vars")
    
    # Run GenMatch
    start = time()
    gen_out = matching.GenMatch(
        Tr=IntVector(assignments),
        X=matching_data,
        BalanceMatrix=balance_vars,
        print_level=0,
        cluster=cl)
    
    if gm_warnings:
        print("GenMatch Time: ", time() - start)
    
    match_out = matching.Match(
        Y=FloatVector(outcomes),
        Tr=IntVector(assignments),
        X=matching_data,
        replace=True,
        Weight_matrix=gen_out)
    
    gm_warnings = False # only warn once
    return np.array(match_out.rx2("est").rx(1,1))[0]

In [17]:
# DEBUG
# est = logistic_prop_matching_est(assignments, X[:, 1:]) # exclude the bias term
# np.array(est)

In [18]:
# DEBUG
# est = genmatch_est(assignments, X[:, 1:]) # exclude the bias term
# np.array(est)

In [19]:
# Find the distance between two distributions
# using the Mahalanobis distance or the Bhattacharyya distance
def distributional_distance(mu1, var1, mu2, var2, metric="md"):
    # Formulas from:
    # https://en.wikipedia.org/wiki/Bhattacharyya_distance
    # https://en.wikipedia.org/wiki/Mahalanobis_distance
    
    if metric not in ["md", "bhat"]:
        raise Exception("Invalid Metric")
        
    var1 = np.exp(var1)
    var2 = np.exp(var2)
    
    V1 = np.diag(var1)
    V2 = np.diag(var2)
    
    if metric =="bhat":
        V = (V1 + V2)/2.0
    else:
        V = V1
        
    VI = np.linalg.inv(V)
    
    md = np.sqrt(np.dot(np.dot((mu1-mu2),VI),(mu1-mu2).T))
    
    if metric =="md":
        return md
    
    bhat_additive = 0.5*np.log(float(np.linalg.det(V))/np.sqrt(np.linalg.det(V1) + np.linalg.det(V2)))
    
    return ((1/8.0)*md) + bhat_additive

def mahalanobis_matching(outcomes, assignments, covariate_data, covariate_covariance, *args, **kwargs):
    global gm_warnings #warn once mechanism
    
    # Include propensity scores?
    if kwargs.get("md_with_prop_scores", True):
        
        propensity_vars = kwargs.get("propensity_vars", None)
        if propensity_vars is None:
            propensity_vars = covariate_data
        else:
            if gm_warnings:
                print("Finding propensity scores with custom vars")
            
        propensity_scores = np.array(get_propensity_scores(assignments, propensity_vars))
        prop_var = np.var(propensity_scores)
        propensity_variance = np.full((propensity_scores.shape[0], 1), prop_var)
        
        # Add prop scores to covar data
        covariate_data = np.hstack([covariate_data, propensity_scores.reshape(-1, 1)])
        covariate_covariance = np.hstack([covariate_covariance, propensity_variance])
    else:
        if gm_warnings:
            print("Not using prop scores")
    
    # Prepare data
    treated = covariate_data[assignments==1]
    treated_var = covariate_covariance[assignments==1]
    
    control = covariate_data[assignments==0]
    control_var = covariate_covariance[assignments==0]
    
    num_treated = treated.shape[0]
    num_control = control.shape[0]
    
    m_distances = np.zeros((num_treated, num_control))
    
    # Find the distances and match on them
    start = time()
    for treated_index, treat_mu in enumerate(treated):
        treat_variance = treated_var[treated_index]
        
        for control_index, control_mu in enumerate(control): 
            metric = kwargs.get("distance_metric", "md")
            control_variance = control_var[control_index]
                
            m_distances[treated_index, control_index] = \
                distributional_distance(treat_mu, treat_variance, control_mu,
                                        control_variance, metric=metric)
                
    if gm_warnings:
        elapsed = np.round(time() - start, 2)
        print("Mahalanobis D. time: ", elapsed, "seconds")
    
    # Match
    md_minimum_matches = np.argmin(m_distances, axis=1) 
    
    # Find treatment effects for the treated
    effects = outcomes[assignments==1] - outcomes[assignments==0][md_minimum_matches]
    
    gm_warnings = False # only warn once
    return np.mean(effects) #ATT

### D. Data Interconnect

CSV files are used to pass information between this file and the Neural Network files. The code below defines helper functions which save data generated in this file to CSVs for neural net training and  functions which read in the processed data which results from the training. 

In [32]:
RAW_DATA_DIR = "../Data/Raw"
PROCESSED_DATA_DIR = "../Data/Processed"
VAE_Z4 = "VAE/"
VAE_Z2 = "VAE/Z2/"
REG = "Regression/"

In [33]:
# Enforce file name regularity with a function
# which returns a namem based on the running params. 
def get_data_file_name(n_samples, model, file_num, data_suffix, data_folder="", processed=False):
    file_name = "/{}n_{}_model_{}_v_{}_{}.csv".format(
        data_folder,
        n_samples,
        model,
        file_num, 
        data_suffix)
    if not processed:
        return RAW_DATA_DIR + file_name
    
    return PROCESSED_DATA_DIR + file_name

# Create data files based on Monte Carlo generated data.
def write_data_files(n_files, n_samples, model="A_add_lin"):

    for file_num in range(n_files): 
        assignments, outcomes, covariates = get_data(n_samples, model)
        file_prefix = get_data_file_name(n_samples, model, file_num)
        
        np.savetxt(file_prefix + "covar_data.csv", covariates, delimiter=",")
        np.savetxt(file_prefix + "outcome_data.csv", outcomes, delimiter=",")
        np.savetxt(file_prefix + "assignment_data.csv", assignments, delimiter=",")

# Retrieve processed and unprocessed data from files in order to run experiments. 
def get_data_from_file(n_samples, model, file_num, loss_type=None, nn_p_regression=False):
    
    original_covariate_suffix = "covar_data"
    original_covariate_file = get_data_file_name(n_samples, model, file_num,
                                                 original_covariate_suffix, processed=False)

    outcome_file = get_data_file_name(n_samples, model, file_num, "outcome_data",processed=False)
    assignment_file = get_data_file_name(n_samples, model, file_num, "assignment_data",processed=False)

    
    original_covariates = np.loadtxt(original_covariate_file, delimiter=",")
    outcomes = np.loadtxt(outcome_file, delimiter=",")
    assignments = np.loadtxt(assignment_file, delimiter=",")
    
    extra_data = {}
    if not (loss_type or nn_p_regression):
        raise Exception("Invalid config. Need loss type or p regression option")
        
    if loss_type:
        if loss_type in ["reconstruction", "sparsity"]:
            covariate_suffix = "covar_data_{}".format(loss_type)
            covariate_file = get_data_file_name(n_samples, model, file_num, covariate_suffix, processed=True)
            covariates = np.loadtxt(covariate_file, delimiter=",")
        elif loss_type in ["vae"]:
            covariate_suffix = "covar_data"
            covariate_file = get_data_file_name(n_samples, model, file_num, 
                                                covariate_suffix, data_folder=VAE_Z2, processed=True)
            covariates_with_std = np.loadtxt(covariate_file, delimiter=",")

            # Split means and covariance
            column_count = covariates_with_std.shape[1]
            covar_marker = int(column_count/2)

            covariates = covariates_with_std[:, :covar_marker]
            extra_data["covariate_covariance"] = covariates_with_std[:, covar_marker:]
        
        if not nn_p_regression:
            return assignments, outcomes, covariates, original_covariates, extra_data
    
    if nn_p_regression:
        reg_suffix = "covar_data"
        regression_file = get_data_file_name(n_samples, model, file_num,
                                                     reg_suffix, data_folder=REG, processed=True)
        regression_prop_scores = np.loadtxt(regression_file, delimiter=",")

        extra_data["nn_p_scores"] = regression_prop_scores
    
        if not loss_type:
            return assignments, outcomes, original_covariates, extra_data
    
    return assignments, outcomes, covariates, original_covariates, extra_data

In [22]:
# Write data files for 1000 runs all models
# Careful with this, it writes ~3GB of data. 
write_files = False
if write_files:
    for model in assignment_model_names:
        write_data_files(n_files=1000, n_samples=1000, model=model)

Utilities to store and retrive pickled results dictionaries. This allows us to persist and sync results across different machines/processes.

In [34]:
def store_results_dict(results, name):
    pickle.dump(results, open("../Results/{}.p".format(name), "wb" ))
    
def retrieve_results_dict(name):
    try:
        return pickle.load(open( "../Results/{}.p".format(name), "rb" ))
    except Exception as e:
        return None

### E. Experiment Runner Code

The functions below are the core of the experimental process. They provide a clean interface to allow various combinations of input to the matching methods. The first funciton runs a Monte Carlo simulation for a given sample size and number of samples. It calls retrieves/generates any data required, passes this data to the matching functions and processes the results to create the bias/RMSE metrics. The second function wraps the first in order to run a Monte Carlo battery - applying the same settings across many different assignment models and managing the storage and retrieval of results. 

#### Single Simulation

Run a single model for n runs

In [24]:
# Wrapper function for all three of the matching methods above. This allows the matching function
# to be passed into the experiment running code without concern over the method API.
def get_estimate(outcomes, assignments, covar_data, method, *args, **kwargs):
    return method(outcomes, assignments, covar_data, *args, **kwargs)

In [25]:
def run_simulation(runs=1000, n_samples=1000,
                   assignment_model="additive_linear",
                   estimator=logistic_prop_matching_est,
                   from_files=False,
                   file_numbers=None,
                   verbose=True,
                   *args, **kwargs):
    
    global gm_warnings
    gm_warnings = True
    
    progress_tick = max(1, int(runs/10))
    results = np.zeros(runs)

    print("Simulation running. Config:")
    print("n_samples:", n_samples)
    print("assignment_model:", assignment_model)
    print("from_files:", from_files)
    
    if file_numbers is None:
        file_numbers = range(runs)
    else:
        if runs != len(file_numbers):
            raise exception("Invalid number of file numbers supplied")
    
    # Run multiple Monte Carlo trials
    for i, file_number in enumerate(file_numbers):
        balance_vars = None
        propensity_vars = None
        extra_data = {}
        
        # Prepare data for matching
        if from_files:
            loss_type = kwargs.get("loss_type", None)
            nn_p_regression = kwargs.get("nn_p_regression", None)
            
            if not (loss_type or nn_p_regression):
                raise Exception("Must supply loss type or p regression option to read from files")
            
            if loss_type:
                assignments, outcomes, covar_data, original_covars, extra_data = get_data_from_file(n_samples,
                                                                       model=assignment_model,
                                                                       file_num=file_number,
                                                                       loss_type=loss_type,
                                                                       nn_p_regression=nn_p_regression)
                
                if kwargs.get("evaluate_on_original_covars", False):
                    balance_vars=original_covars

                if kwargs.get("propensity_on_original_covars", False):
                    propensity_vars=original_covars
            
            elif nn_p_regression:
                assignments, outcomes, covar_data, extra_data = get_data_from_file(n_samples,
                                                                       model=assignment_model,
                                                                       file_num=file_number,
                                                                       loss_type=loss_type,
                                                                       nn_p_regression=nn_p_regression)
                
                
        else:
            assignments, outcomes, covar_data = get_data(n_samples, assignment_model)
            covar_data = covar_data[:, 1:] #exclude bias term
        
        # Run matching
        results[i] = get_estimate(outcomes,
                                  assignments,
                                  covar_data,
                                  estimator,
                                  balance_vars=balance_vars,
                                  propensity_vars=propensity_vars,
                                  *args,
                                  **extra_data,
                                  **kwargs)
        
        if i%progress_tick == progress_tick-1 and verbose:
            print("Done {} of {}".format(i+1, runs))
    
    # Process the returned treatment effects into bias/RMSE. 
    biases = (true_treat_effect-results)/true_treat_effect * 100
    errors = (true_treat_effect-results)**2
    
    bias = np.abs(np.mean(biases))
    rmse = np.mean(errors)**0.5
    
    if verbose:
        print("\nRMSE", rmse)
        print("Bias", bias)
        print("===============\n\n")
    
    return {"RMSE": rmse, "Bias": bias, "biases": biases, "errors": errors}

In [26]:
# run_simulation(runs=1, n_samples=1000, assignment_model="A_add_lin",
#               estimator=mahalanobis_matching, verbose=True, from_files=True, loss_type="vae")

In [27]:
# sim_results["biases"]

#### Experiment Battery

In [28]:
def get_store_name(subfolder, models_being_run, est, runs, n_samples):
    # Get standardized name of file to store results
    
    if set(models_being_run) == set(assignment_model_names):
        store_name = "{}/est_{}_runs_{}_n_{}".format(
            subfolder,
            est.__name__,
            runs,
            n_samples)
    else:
        store_name = "{}/est_{}_runs_{}_n_{}_models_{}".format(
            subfolder,
            est.__name__,
            runs,
            n_samples,
            "_".join(models_being_run))
    
    return store_name

def run_test_battery(est,
                     store_name=None, 
                     runs=1000,
                     n_samples=1000,
                     models=assignment_models,
                     overwrite=False, verbosity=1,
                     *args, **kwargs):
    # Logging
    def printer(level, *args):
        if level <= verbosity:
            print(*args)
    
    # Storage config
    if store_name is None:
        if "results_subfolder" in kwargs:
            subfolder = kwargs["results_subfolder"]
        else:
            subfolder = "Original"
        store_name = get_store_name(subfolder, models, est, runs, n_samples)
        print("Results File:", store_name)
            
    results = retrieve_results_dict(store_name)

    if overwrite or (not results):
        printer(1, "No valid, existant results found. Beggining battery.\n")
        results = {}
        for model in models:
            printer(1, "Running: ", model)
            results[model] = run_simulation(
                                runs=runs,
                                n_samples=n_samples,
                                assignment_model=model,
                                estimator=est,
                                verbose=(verbosity==2),
                                *args, **kwargs)
            store_results_dict(results[model], store_name+"_checkpoint_"+model)
            printer(1, "Done.\n")

        store_results_dict(results, store_name)
    else:
        printer(1, "Displaying cached results.\n")
    
    printer(1, "Results")
    for model, results in results.items():
        printer(1, "Model: ", model)
        print(1, "Bias: ", results["Bias"])
        print(1, "RMSE: ", results["RMSE"], "\n")

## Run the Experiments

### A. Run the Logistic Regression Battery

In [45]:
run_test_battery(
    est=logistic_prop_matching_est,
    runs=1000,
    n_samples=1000)

Results File: Original/est_logistic_prop_matching_est_runs_1000_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.045874914703647685
1 RMSE:  0.07310500057973227 

Model:  B_add_mild_nlin
1 Bias:  3.1844355433209786
1 RMSE:  0.06588422028138122 

Model:  C_add_mod_nlin
1 Bias:  10.094350684204597
1 RMSE:  0.07650839711310455 

Model:  D_mild_nadd_lin
1 Bias:  6.720731771408928
1 RMSE:  0.08531717119502563 

Model:  E_mild_nadd_mild_nlin
1 Bias:  10.36168716658826
1 RMSE:  0.09094245826533698 

Model:  F_mod_nadd_lin
1 Bias:  3.1228082403965436
1 RMSE:  0.07605107262377982 

Model:  G_mod_nadd_mod_nlin
1 Bias:  11.830178367664905
1 RMSE:  0.07798212919046259 



### B. Run the GenMatch Battery

This battery would require over 60 hours of raw compute. So I split this across three machines using remote clusters. This notebook was duplicated and each of the cells below run on a different kernel with its own cluster. Did 60 hours in approx 4.5 hours for around $20

In [29]:
gm_est = genmatch_est
gm_runs = 1000
gm_n_samples = 1000
gm_models_sets = [assignment_model_names[:3], assignment_model_names[3:5], assignment_model_names[5:]]
gm_files_to_be_produced = []

for model_set in gm_models_sets:
    gm_files_to_be_produced.append(get_store_name("Original", model_set, gm_est, gm_runs, gm_n_samples))

gm_files_to_be_produced

['Original/est_genmatch_est_runs_1000_n_1000_models_A_add_lin_B_add_mild_nlin_C_add_mod_nlin',
 'Original/est_genmatch_est_runs_1000_n_1000_models_D_mild_nadd_lin_E_mild_nadd_mild_nlin',
 'Original/est_genmatch_est_runs_1000_n_1000_models_F_mod_nadd_lin_G_mod_nadd_mod_nlin']

In [30]:
run_test_battery(
    est=gm_est,
    runs=gm_runs,
    n_samples=gm_n_samples,
    models=gm_models_sets[0],
    verbosity=2)

Results File: Original/est_genmatch_est_runs_1000_n_1000_models_A_add_lin_B_add_mild_nlin_C_add_mod_nlin
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  5.5585826624331105
1 RMSE:  0.041571354846349606 

Model:  B_add_mild_nlin
1 Bias:  4.309919000663494
1 RMSE:  0.03799524129548324 

Model:  C_add_mod_nlin
1 Bias:  3.715796982487495
1 RMSE:  0.043206587873791204 



In [31]:
run_test_battery(
    est=gm_est,
    runs=gm_runs,
    n_samples=gm_n_samples,
    models=gm_models_sets[1],
    verbosity=2)

Results File: Original/est_genmatch_est_runs_1000_n_1000_models_D_mild_nadd_lin_E_mild_nadd_mild_nlin
Displaying cached results.

Results
Model:  D_mild_nadd_lin
1 Bias:  2.30672600038697
1 RMSE:  0.040751955269481575 

Model:  E_mild_nadd_mild_nlin
1 Bias:  1.6356097465092616
1 RMSE:  0.0388547493767899 



In [32]:
run_test_battery(
    est=gm_est,
    runs=gm_runs,
    n_samples=gm_n_samples,
    models=gm_models_sets[2],
    verbosity=2)

Results File: Original/est_genmatch_est_runs_1000_n_1000_models_F_mod_nadd_lin_G_mod_nadd_mod_nlin
Displaying cached results.

Results
Model:  F_mod_nadd_lin
1 Bias:  5.058677376450292
1 RMSE:  0.04404046330729526 

Model:  G_mod_nadd_mod_nlin
1 Bias:  3.185538423779952
1 RMSE:  0.04439998296725508 



#### Combine results

In [34]:
gm_combined_name = get_store_name("Original", assignment_model_names, gm_est, gm_runs, gm_n_samples)
linear_combined_name = get_store_name("Original", assignment_model_names, logistic_prop_matching_est, 1000, 1000)

In [35]:
results = {}
for file in gm_files_to_be_produced:
    results.update(retrieve_results_dict(file))

store_results_dict(results, gm_combined_name)
results

{'A_add_lin': {'RMSE': 0.041571354846349606, 'Bias': 5.5585826624331105},
 'B_add_mild_nlin': {'RMSE': 0.03799524129548324, 'Bias': 4.309919000663494},
 'C_add_mod_nlin': {'RMSE': 0.043206587873791204, 'Bias': 3.715796982487495},
 'D_mild_nadd_lin': {'RMSE': 0.040751955269481575, 'Bias': 2.30672600038697},
 'E_mild_nadd_mild_nlin': {'RMSE': 0.0388547493767899,
  'Bias': 1.6356097465092616},
 'F_mod_nadd_lin': {'RMSE': 0.04404046330729526, 'Bias': 5.058677376450292},
 'G_mod_nadd_mod_nlin': {'RMSE': 0.04439998296725508,
  'Bias': 3.185538423779952}}

### C. GenMatch Vs Logistic Propensity Score Matching

In [36]:
gm_results = retrieve_results_dict(gm_combined_name)
lin_results = retrieve_results_dict(linear_combined_name)

results = {
    "Linear": lin_results,
    "GenMatch": gm_results
}

for model in assignment_model_names:
    print(model, "\n")
    for matching in results.keys():
        print(matching)
        print("RMSE:", results[matching][model]["RMSE"], "Bias:", results[matching][model]["Bias"])
        
    print("==============")
    print()
    

A_add_lin 

Linear
RMSE: 0.07310500057973227 Bias: 0.045874914703647685
GenMatch
RMSE: 0.041571354846349606 Bias: 5.5585826624331105

B_add_mild_nlin 

Linear
RMSE: 0.06588422028138122 Bias: 3.1844355433209786
GenMatch
RMSE: 0.03799524129548324 Bias: 4.309919000663494

C_add_mod_nlin 

Linear
RMSE: 0.07650839711310455 Bias: 10.094350684204597
GenMatch
RMSE: 0.043206587873791204 Bias: 3.715796982487495

D_mild_nadd_lin 

Linear
RMSE: 0.08531717119502563 Bias: 6.720731771408928
GenMatch
RMSE: 0.040751955269481575 Bias: 2.30672600038697

E_mild_nadd_mild_nlin 

Linear
RMSE: 0.09094245826533698 Bias: 10.36168716658826
GenMatch
RMSE: 0.0388547493767899 Bias: 1.6356097465092616

F_mod_nadd_lin 

Linear
RMSE: 0.07605107262377982 Bias: 3.1228082403965436
GenMatch
RMSE: 0.04404046330729526 Bias: 5.058677376450292

G_mod_nadd_mod_nlin 

Linear
RMSE: 0.07798212919046259 Bias: 11.830178367664905
GenMatch
RMSE: 0.04439998296725508 Bias: 3.185538423779952



### D. Autoencoder Test Battery

In [37]:
ae_runs = 200

#### AE with only Reconstruction Loss


#### Config 1
Pure reconstruction with propensity score estimates

In [38]:
run_test_battery(
    est=genmatch_est,
    runs=ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="AE/Reconstruction",
    verbosity=2)

Results File: AE/Reconstruction/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.4982566498005895
1 RMSE:  0.07940586362085975 

Model:  B_add_mild_nlin
1 Bias:  5.619851855780028
1 RMSE:  0.08253689905638634 

Model:  C_add_mod_nlin
1 Bias:  1.823000644966564
1 RMSE:  0.07960620818304176 

Model:  D_mild_nadd_lin
1 Bias:  6.264071047188581
1 RMSE:  0.08682767400208805 

Model:  E_mild_nadd_mild_nlin
1 Bias:  8.581211509093361
1 RMSE:  0.0870754365718883 

Model:  F_mod_nadd_lin
1 Bias:  0.009021286717901873
1 RMSE:  0.07551900452145399 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.015501963704979574
1 RMSE:  0.07505690217179598 



#### Config 2
Pure reconstruction *without* propensity score estimates

In [39]:
run_test_battery(
    est=genmatch_est,
    runs=ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="AE/Reconstruction/nopropscores",
    genmatch_with_prop_scores=False,
    verbosity=2)

Results File: AE/Reconstruction/nopropscores/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.5388832593158581
1 RMSE:  0.07368715901276338 

Model:  B_add_mild_nlin
1 Bias:  3.276210486566693
1 RMSE:  0.0816027177764639 

Model:  C_add_mod_nlin
1 Bias:  0.6999435126924826
1 RMSE:  0.07554576764697919 

Model:  D_mild_nadd_lin
1 Bias:  4.937316461303445
1 RMSE:  0.08274988726545493 

Model:  E_mild_nadd_mild_nlin
1 Bias:  7.481250991590314
1 RMSE:  0.08478344012915207 

Model:  F_mod_nadd_lin
1 Bias:  0.9411372663771971
1 RMSE:  0.07537800943456885 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.8048814215327594
1 RMSE:  0.07788164426593222 



#### Config 3
Pure reconstruction, evaluating balance on uncompressed data, *without* propensity score.

In [40]:
run_test_battery(
    est=genmatch_est,
    runs=ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="AE/Reconstruction/evalonoriginal",
    evaluate_on_original_covars=True,
    genmatch_with_prop_scores=False,
    verbosity=2)

Results File: AE/Reconstruction/evalonoriginal/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  1.552233121749752
1 RMSE:  0.06527597429814515 

Model:  B_add_mild_nlin
1 Bias:  3.333034582793069
1 RMSE:  0.06967542349832183 

Model:  C_add_mod_nlin
1 Bias:  0.6790984831418302
1 RMSE:  0.06579060225122386 

Model:  D_mild_nadd_lin
1 Bias:  4.3816613069158254
1 RMSE:  0.07068863465516317 

Model:  E_mild_nadd_mild_nlin
1 Bias:  6.876422257502002
1 RMSE:  0.07278407746587175 

Model:  F_mod_nadd_lin
1 Bias:  2.58611002166168
1 RMSE:  0.06981215022358879 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.502204523397222
1 RMSE:  0.0691460613259185 



#### Config 4
Pure reconstruction *with* propensity score derived from uncompressed data. Evaluating on uncompressed.

In [41]:
run_test_battery(
    est=genmatch_est,
    runs=ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="AE/Reconstruction/evalonoriginal_withp",
    evaluate_on_original_covars=True,
    propensity_on_original_covars=True,
    verbosity=2)

Results File: AE/Reconstruction/evalonoriginal_withp/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.7067484388360161
1 RMSE:  0.03786592930152072 

Model:  B_add_mild_nlin
1 Bias:  0.13339724242402798
1 RMSE:  0.043485078136632285 

Model:  C_add_mod_nlin
1 Bias:  0.11494109318830585
1 RMSE:  0.050248376512268364 

Model:  D_mild_nadd_lin
1 Bias:  1.9150254839541625
1 RMSE:  0.04117176894231486 

Model:  E_mild_nadd_mild_nlin
1 Bias:  3.1961874905224175
1 RMSE:  0.04268150513623945 

Model:  F_mod_nadd_lin
1 Bias:  1.8485395850150468
1 RMSE:  0.04785803044605611 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.18265461358981974
1 RMSE:  0.053080370063081465 



#### Config 5
Pure reconstruction *with* propensity score derived from uncompressed data. Evaluating on compressed.

In [42]:
run_test_battery(
    est=genmatch_est,
    runs=ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="AE/Reconstruction/withp",
    evaluate_on_original_covars=False,
    propensity_on_original_covars=True,
    verbosity=2)

Results File: AE/Reconstruction/withp/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  1.373322079710853
1 RMSE:  0.048648363590301766 

Model:  B_add_mild_nlin
1 Bias:  0.18520053724370186
1 RMSE:  0.053997099576098474 

Model:  C_add_mod_nlin
1 Bias:  0.6021376702766212
1 RMSE:  0.05363450514544961 

Model:  D_mild_nadd_lin
1 Bias:  1.9725293478895758
1 RMSE:  0.05146490692396631 

Model:  E_mild_nadd_mild_nlin
1 Bias:  4.154412436745519
1 RMSE:  0.05394638272616707 

Model:  F_mod_nadd_lin
1 Bias:  1.9623181573686646
1 RMSE:  0.055835868418105714 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.13880680510402485
1 RMSE:  0.060491851132907115 



#### AE with Reconstruction and Sparsity

#### Config 1

Sparse reconstruction *without* propensity score.

In [46]:
run_test_battery(
    est=genmatch_est,
    runs=50,
    n_samples=1000,
    from_files=True,
    loss_type="sparsity",
    results_subfolder="AE/Sparsity",
    genmatch_with_prop_scores=False,
    verbosity=2)

Results File: AE/Sparsity/est_genmatch_est_runs_50_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  8.526097159023248
1 RMSE:  0.1134592853375768 

Model:  B_add_mild_nlin
1 Bias:  4.131268935319159
1 RMSE:  0.1160868056178415 

Model:  C_add_mod_nlin
1 Bias:  8.430802732391273
1 RMSE:  0.1108407840160775 

Model:  D_mild_nadd_lin
1 Bias:  5.322330536939029
1 RMSE:  0.10460071276975336 

Model:  E_mild_nadd_mild_nlin
1 Bias:  0.8211996810856915
1 RMSE:  0.12143098844611416 

Model:  F_mod_nadd_lin
1 Bias:  13.889165268095837
1 RMSE:  0.13591469046446247 

Model:  G_mod_nadd_mod_nlin
1 Bias:  6.744962783709057
1 RMSE:  0.10264505172208324 



#### Config 2

Sparse with prop scores

In [47]:
run_test_battery(
    est=genmatch_est,
    runs=50,
    n_samples=1000,
    from_files=True,
    loss_type="sparsity",
    results_subfolder="AE/Sparsity/withp",
    genmatch_with_prop_scores=True,
    propensity_on_original_covars=True,
    verbosity=2)

Results File: AE/Sparsity/withp/est_genmatch_est_runs_50_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  1.6136292441155462
1 RMSE:  0.05547243408003388 

Model:  B_add_mild_nlin
1 Bias:  0.44513296886365444
1 RMSE:  0.0532783960077565 

Model:  C_add_mod_nlin
1 Bias:  0.4257899593481197
1 RMSE:  0.05914334516222583 

Model:  D_mild_nadd_lin
1 Bias:  0.4726712662232528
1 RMSE:  0.052783396792566356 

Model:  E_mild_nadd_mild_nlin
1 Bias:  2.583291458156751
1 RMSE:  0.06302520184923029 

Model:  F_mod_nadd_lin
1 Bias:  3.274417247927261
1 RMSE:  0.06293152300833735 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.695770947223328
1 RMSE:  0.06237666457453329 



#### Config 3
Sparse with prop scores, eval on original

In [48]:
run_test_battery(
    est=genmatch_est,
    runs=50,
    n_samples=1000,
    from_files=True,
    loss_type="sparsity",
    results_subfolder="AE/Sparsity/evalonoriginal_withp",
    genmatch_with_prop_scores=True,
    propensity_on_original_covars=True,
    evaluate_on_original_covars=True,
    verbosity=2)

Results File: AE/Sparsity/evalonoriginal_withp/est_genmatch_est_runs_50_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  3.0052785609630153
1 RMSE:  0.05135956480780116 

Model:  B_add_mild_nlin
1 Bias:  0.514080145976968
1 RMSE:  0.043153932448751264 

Model:  C_add_mod_nlin
1 Bias:  0.9633292625895341
1 RMSE:  0.04668919693650328 

Model:  D_mild_nadd_lin
1 Bias:  0.30727483094548486
1 RMSE:  0.05212186169768896 

Model:  E_mild_nadd_mild_nlin
1 Bias:  2.7111127095297336
1 RMSE:  0.05075685049205623 

Model:  F_mod_nadd_lin
1 Bias:  0.43743334386364874
1 RMSE:  0.04714358098168887 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.9975102646443165
1 RMSE:  0.05399429284681247 



### Results Printout

In [50]:
gm_results = retrieve_results_dict(gm_combined_name)
lin_results = retrieve_results_dict(linear_combined_name)

gm_ae_recon_results = retrieve_results_dict(
    get_store_name("AE/Reconstruction", assignment_model_names, genmatch_est, ae_runs, 1000))

gm_ae_recon_no_prop_score_results = retrieve_results_dict(
    get_store_name("AE/Reconstruction/nopropscores", assignment_model_names, genmatch_est, ae_runs, 1000))

gm_ae_recon_original_fitness_results = retrieve_results_dict(
    get_store_name("AE/Reconstruction/evalonoriginal", assignment_model_names, genmatch_est, ae_runs, 1000))

gm_ae_recon_original_fitness_with_prop_score_results = retrieve_results_dict(
    get_store_name("AE/Reconstruction/evalonoriginal_withp", assignment_model_names, genmatch_est, ae_runs, 1000))

gm_ae_recon_with_prop_score_results = retrieve_results_dict(
    get_store_name("AE/Reconstruction/withp", assignment_model_names, genmatch_est, ae_runs, 1000))


###

gm_ae_sparse_results = retrieve_results_dict(
    get_store_name("AE/Sparsity", assignment_model_names, genmatch_est, 50, 1000))

gm_ae_sparse_original_fitness_with_prop_score_results = retrieve_results_dict(
    get_store_name("AE/Sparsity/evalonoriginal_withp", assignment_model_names, genmatch_est, 50, 1000))

gm_ae_sparse_with_prop_score_results = retrieve_results_dict(
    get_store_name("AE/Sparsity/withp", assignment_model_names, genmatch_est, 50, 1000))



results = {
    "Linear": lin_results,
    "GenMatch": gm_results,
    "GenMatch AE Recon": gm_ae_recon_results,
    "GenMatch AE Recon, No P Score": gm_ae_recon_no_prop_score_results,
    "GenMatch AE Recon, Org. Fitness": gm_ae_recon_original_fitness_results,
    "GenMatch AE Recon, Org. Fitness, With P": gm_ae_recon_original_fitness_with_prop_score_results,
    "GenMatch AE Recon, With P": gm_ae_recon_with_prop_score_results,
    ###
    "GenMatch AE Sparse": gm_ae_sparse_results,
    "GenMatch AE Sparse, Org. Fitness, With P": gm_ae_sparse_original_fitness_with_prop_score_results,
    "GenMatch AE Sparse, With P": gm_ae_sparse_with_prop_score_results,
}

for model in assignment_model_names:
    print(model, "\n")
    for matching in results.keys():
        print(matching)
        print("RMSE:", np.round(results[matching][model]["RMSE"], 4), "Bias:",
              np.round(results[matching][model]["Bias"], 4))
        
    print("==============")
    print()
    

A_add_lin 

Linear
RMSE: 0.0731 Bias: 0.0459
GenMatch
RMSE: 0.0416 Bias: 5.5586
GenMatch AE Recon
RMSE: 0.0794 Bias: 0.4983
GenMatch AE Recon, No P Score
RMSE: 0.0737 Bias: 0.5389
GenMatch AE Recon, Org. Fitness
RMSE: 0.0653 Bias: 1.5522
GenMatch AE Recon, Org. Fitness, With P
RMSE: 0.0379 Bias: 0.7067
GenMatch AE Recon, With P
RMSE: 0.0486 Bias: 1.3733
GenMatch AE Sparse
RMSE: 0.1135 Bias: 8.5261
GenMatch AE Sparse, Org. Fitness, With P
RMSE: 0.0514 Bias: 3.0053
GenMatch AE Sparse, With P
RMSE: 0.0555 Bias: 1.6136

B_add_mild_nlin 

Linear
RMSE: 0.0659 Bias: 3.1844
GenMatch
RMSE: 0.038 Bias: 4.3099
GenMatch AE Recon
RMSE: 0.0825 Bias: 5.6199
GenMatch AE Recon, No P Score
RMSE: 0.0816 Bias: 3.2762
GenMatch AE Recon, Org. Fitness
RMSE: 0.0697 Bias: 3.333
GenMatch AE Recon, Org. Fitness, With P
RMSE: 0.0435 Bias: 0.1334
GenMatch AE Recon, With P
RMSE: 0.054 Bias: 0.1852
GenMatch AE Sparse
RMSE: 0.1161 Bias: 4.1313
GenMatch AE Sparse, Org. Fitness, With P
RMSE: 0.0432 Bias: 0.5141
GenMatc

### E. VAE

In [51]:
vae_runs = 200

#### 1. GenMatch in latent space

#### Config 1
Genmatch without propensity score

In [52]:
run_test_battery(
    est=genmatch_est,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/",
    genmatch_with_prop_scores=False,
    verbosity=2)

Results File: VAE//est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  7.312201130997685
1 RMSE:  0.05295933266800956 

Model:  B_add_mild_nlin
1 Bias:  7.214011335197719
1 RMSE:  0.052447212539800586 

Model:  C_add_mod_nlin
1 Bias:  5.105324395918432
1 RMSE:  0.04918834245145284 

Model:  D_mild_nadd_lin
1 Bias:  5.1043386948034275
1 RMSE:  0.04855458362958724 

Model:  E_mild_nadd_mild_nlin
1 Bias:  3.875474594602394
1 RMSE:  0.04781873913431988 

Model:  F_mod_nadd_lin
1 Bias:  8.549881888333742
1 RMSE:  0.0572041703159975 

Model:  G_mod_nadd_mod_nlin
1 Bias:  4.392239501605347
1 RMSE:  0.04799559601072314 



#### Config 2

GenMatch with propensity on original covars, evaluating on latent. 

In [53]:
run_test_battery(
    est=genmatch_est,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/withp",
    genmatch_with_prop_scores=True,
    propensity_on_original_covars=True,
    verbosity=2)

Results File: VAE/withp/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  2.5395433059520007
1 RMSE:  0.04808528784042503 

Model:  B_add_mild_nlin
1 Bias:  2.04490054506496
1 RMSE:  0.04660732524183032 

Model:  C_add_mod_nlin
1 Bias:  0.15073221258237324
1 RMSE:  0.0507555360190453 

Model:  D_mild_nadd_lin
1 Bias:  0.3606620636051126
1 RMSE:  0.04962363133543826 

Model:  E_mild_nadd_mild_nlin
1 Bias:  1.5379434666280805
1 RMSE:  0.0499287081523966 

Model:  F_mod_nadd_lin
1 Bias:  3.1169056061108686
1 RMSE:  0.05503239603006674 

Model:  G_mod_nadd_mod_nlin
1 Bias:  1.3989182232673525
1 RMSE:  0.05336476463717196 



#### Config 3

GenMatch with propensity on original covars, evaluating balance on original covars.

In [54]:
run_test_battery(
    est=genmatch_est,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/evalonoriginal_withp",
    genmatch_with_prop_scores=True,
    propensity_on_original_covars=True,
    evaluate_on_original_covars=True,
    verbosity=2)

Results File: VAE/evalonoriginal_withp/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.8517170004947486
1 RMSE:  0.04701381917058319 

Model:  B_add_mild_nlin
1 Bias:  0.3672756443601539
1 RMSE:  0.048237894709229706 

Model:  C_add_mod_nlin
1 Bias:  1.3369943583773656
1 RMSE:  0.053340455624368247 

Model:  D_mild_nadd_lin
1 Bias:  1.1463459806259935
1 RMSE:  0.04852571125820148 

Model:  E_mild_nadd_mild_nlin
1 Bias:  3.383983114942237
1 RMSE:  0.05382206092404716 

Model:  F_mod_nadd_lin
1 Bias:  1.2160734246097318
1 RMSE:  0.055325910561995914 

Model:  G_mod_nadd_mod_nlin
1 Bias:  2.619943937710873
1 RMSE:  0.052742346254803 



#### 2. Mahalanobis Distance

Use the latent space means and standard deviations to try direct matching. 

#### Config 1

MD distance plain

In [55]:
run_test_battery(
    est=mahalanobis_matching,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/md",
    verbosity=2)

Results File: VAE/md/est_mahalanobis_matching_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  7.544441012550429
1 RMSE:  0.05138754558193052 

Model:  B_add_mild_nlin
1 Bias:  7.298749820510931
1 RMSE:  0.050962216786474376 

Model:  C_add_mod_nlin
1 Bias:  3.9777874804774442
1 RMSE:  0.04374998484853666 

Model:  D_mild_nadd_lin
1 Bias:  5.180281858062211
1 RMSE:  0.0462263853072062 

Model:  E_mild_nadd_mild_nlin
1 Bias:  4.192882095179809
1 RMSE:  0.04551575417194539 

Model:  F_mod_nadd_lin
1 Bias:  7.983947334265511
1 RMSE:  0.05533750312001229 

Model:  G_mod_nadd_mod_nlin
1 Bias:  4.230594789867654
1 RMSE:  0.047919920965010125 



#### Config 2
MD distance with propensity scores

In [56]:
run_test_battery(
    est=mahalanobis_matching,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/md_withp",
    md_with_prop_scores=True,
    propensity_on_original_covars=True,
    verbosity=2)

Results File: VAE/md_withp/est_mahalanobis_matching_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  7.57902532597313
1 RMSE:  0.05149514462878567 

Model:  B_add_mild_nlin
1 Bias:  7.299220691076224
1 RMSE:  0.050977018369883775 

Model:  C_add_mod_nlin
1 Bias:  3.9412419310526183
1 RMSE:  0.043646522766075474 

Model:  D_mild_nadd_lin
1 Bias:  5.149583862026001
1 RMSE:  0.04606738315619503 

Model:  E_mild_nadd_mild_nlin
1 Bias:  4.1849816542073155
1 RMSE:  0.04538368034996612 

Model:  F_mod_nadd_lin
1 Bias:  7.9760975724759895
1 RMSE:  0.05528390984921817 

Model:  G_mod_nadd_mod_nlin
1 Bias:  4.215663518355213
1 RMSE:  0.04782197261829298 



#### Config 3

Use the Bhattacharyya Distance distance which finds distributional overlap for normal distributions.

In [57]:
run_test_battery(
    est=mahalanobis_matching,
    runs=vae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="VAE/Z2/bhat",
    distance_metric="bhat",
    verbosity=2)

Results File: VAE/Z2/bhat/est_mahalanobis_matching_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  7.556229902669169
1 RMSE:  0.050628029614782195 

Model:  B_add_mild_nlin
1 Bias:  7.318992673380274
1 RMSE:  0.050024202084690994 

Model:  C_add_mod_nlin
1 Bias:  4.181261410314277
1 RMSE:  0.04285994072768112 

Model:  D_mild_nadd_lin
1 Bias:  5.085225497432111
1 RMSE:  0.0452803013165698 

Model:  E_mild_nadd_mild_nlin
1 Bias:  4.142427330949168
1 RMSE:  0.045729912752522466 

Model:  F_mod_nadd_lin
1 Bias:  8.097072410798255
1 RMSE:  0.0545518692276087 

Model:  G_mod_nadd_mod_nlin
1 Bias:  4.523303488528463
1 RMSE:  0.04781288827208273 



### F. Neural Propensity Score Regression

In [30]:
reg_ae_runs = 200

In [72]:
run_test_battery(
    est=logistic_prop_matching_est,
    runs=200,
    n_samples=1000,
    from_files=True,
    results_subfolder="NN",
    nn_p_regression=True
)

Results File: NN/est_logistic_prop_matching_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  2.516970645424258
1 RMSE:  0.08208114416711368 

Model:  B_add_mild_nlin
1 Bias:  3.483748622303001
1 RMSE:  0.08543882017480643 

Model:  C_add_mod_nlin
1 Bias:  5.00918193487844
1 RMSE:  0.10057195625252761 

Model:  D_mild_nadd_lin
1 Bias:  4.505665739350313
1 RMSE:  0.09322603726388229 

Model:  E_mild_nadd_mild_nlin
1 Bias:  5.654346511047072
1 RMSE:  0.08809153862302047 

Model:  F_mod_nadd_lin
1 Bias:  1.3475296921836275
1 RMSE:  0.09338027635325147 

Model:  G_mod_nadd_mod_nlin
1 Bias:  8.98239908298724
1 RMSE:  0.12032750964509623 



In [57]:
run_test_battery(
    est=genmatch_est,
    runs=250,
    n_samples=1000,
    from_files=True,
    results_subfolder="NN/genmatch",
    genmatch_with_prop_scores=True,
    nn_p_regression=True,
    verbosity=2)

Results File: NN/genmatch/est_genmatch_est_runs_250_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  4.29807544319546
1 RMSE:  0.03924093815095593 

Model:  B_add_mild_nlin
1 Bias:  5.027476827504075
1 RMSE:  0.041036686487701785 

Model:  C_add_mod_nlin
1 Bias:  2.1088702260847136
1 RMSE:  0.04341309335599729 

Model:  D_mild_nadd_lin
1 Bias:  2.3862953259334945
1 RMSE:  0.03882568477146027 

Model:  E_mild_nadd_mild_nlin
1 Bias:  1.01074209695447
1 RMSE:  0.03792983147379567 

Model:  F_mod_nadd_lin
1 Bias:  4.655858061545212
1 RMSE:  0.04384773084167265 

Model:  G_mod_nadd_mod_nlin
1 Bias:  2.032590286077631
1 RMSE:  0.04310322815049054 



### Regression + AE

In [35]:
run_test_battery(
    est=genmatch_est,
    runs=reg_ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="NN/genmatch_reconstruction",
    genmatch_with_prop_scores=True,
    nn_p_regression=True,
    verbosity=2)

Results File: NN/genmatch_reconstruction/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.529673171829923
1 RMSE:  0.05386042530009443 

Model:  B_add_mild_nlin
1 Bias:  1.536696262108329
1 RMSE:  0.05679903685019429 

Model:  C_add_mod_nlin
1 Bias:  0.08182277851303908
1 RMSE:  0.06297264676850725 

Model:  D_mild_nadd_lin
1 Bias:  3.6659780883960047
1 RMSE:  0.05920459006176178 

Model:  E_mild_nadd_mild_nlin
1 Bias:  4.789360578078707
1 RMSE:  0.058677011907493345 

Model:  F_mod_nadd_lin
1 Bias:  0.5585211201953151
1 RMSE:  0.05941723315251356 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.21314704713027943
1 RMSE:  0.06181632295148564 



In [36]:
run_test_battery(
    est=genmatch_est,
    runs=reg_ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="reconstruction",
    results_subfolder="NN/genmatch_reconstruction_evalonoriginal",
    genmatch_with_prop_scores=True,
    nn_p_regression=True,
    evaluate_on_original_covars=True,
    verbosity=2)

Results File: NN/genmatch_reconstruction_evalonoriginal/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.05733422862098678
1 RMSE:  0.04460639113827511 

Model:  B_add_mild_nlin
1 Bias:  0.3416270354118921
1 RMSE:  0.04337636216599876 

Model:  C_add_mod_nlin
1 Bias:  0.17896723806440895
1 RMSE:  0.055350444444338454 

Model:  D_mild_nadd_lin
1 Bias:  2.2316810121202617
1 RMSE:  0.04387889187316208 

Model:  E_mild_nadd_mild_nlin
1 Bias:  3.344728110847269
1 RMSE:  0.04439170278043582 

Model:  F_mod_nadd_lin
1 Bias:  0.23455257555113562
1 RMSE:  0.04949492394802052 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.007101031185222269
1 RMSE:  0.05363245105383774 



### Reconstruction + VAE

In [37]:
run_test_battery(
    est=genmatch_est,
    runs=reg_ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="NN/genmatch_vae",
    genmatch_with_prop_scores=True,
    nn_p_regression=True,
    verbosity=2)

Results File: NN/genmatch_vae/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  1.8614017258002897
1 RMSE:  0.046627158814633134 

Model:  B_add_mild_nlin
1 Bias:  1.0623641880990218
1 RMSE:  0.04780889865562421 

Model:  C_add_mod_nlin
1 Bias:  1.1022105885256166
1 RMSE:  0.05763152334980864 

Model:  D_mild_nadd_lin
1 Bias:  0.8832082725827941
1 RMSE:  0.05134149227345619 

Model:  E_mild_nadd_mild_nlin
1 Bias:  1.180599024428345
1 RMSE:  0.051764207766344696 

Model:  F_mod_nadd_lin
1 Bias:  3.371783090461461
1 RMSE:  0.06068408190931019 

Model:  G_mod_nadd_mod_nlin
1 Bias:  0.6670186230576475
1 RMSE:  0.05552969888840167 



In [38]:
run_test_battery(
    est=genmatch_est,
    runs=reg_ae_runs,
    n_samples=1000,
    from_files=True,
    loss_type="vae",
    results_subfolder="NN/genmatch_vae_evaloriginal",
    genmatch_with_prop_scores=True,
    nn_p_regression=True,
    evaluate_on_original_covars=True,
    verbosity=2)

Results File: NN/genmatch_vae_evaloriginal/est_genmatch_est_runs_200_n_1000
Displaying cached results.

Results
Model:  A_add_lin
1 Bias:  0.025656542980655707
1 RMSE:  0.04467389196112116 

Model:  B_add_mild_nlin
1 Bias:  0.7731961580198056
1 RMSE:  0.05205840412610209 

Model:  C_add_mod_nlin
1 Bias:  0.9092655458551755
1 RMSE:  0.05781784981206415 

Model:  D_mild_nadd_lin
1 Bias:  3.038828666389273
1 RMSE:  0.05245779974299758 

Model:  E_mild_nadd_mild_nlin
1 Bias:  2.872895222216922
1 RMSE:  0.05685137200754044 

Model:  F_mod_nadd_lin
1 Bias:  1.2299437114421792
1 RMSE:  0.05633433796082689 

Model:  G_mod_nadd_mod_nlin
1 Bias:  1.0446758178775761
1 RMSE:  0.06328368505167929 

