In [1]:
# Imports
import pandas as pd
import numpy as np
import doubleml as dml
from doubleml.datasets import fetch_401K

import torch
import os
import time
from functools import partial

from causal_cocycle.model import cocycle_model,cocycle_outcome_model
from causal_cocycle.optimise import *
from causal_cocycle.loss_functions import Loss
from causal_cocycle.conditioners import Lin_Conditioner,Lin_Conditioner_T,NN_RELU_Conditioner,NN_RELU_Conditioner_T
from causal_cocycle.transformers import Transformer,Shift_layer,Scale_layer,RQS_layer,Inverse_layer
from causal_cocycle.helper_functions import likelihood_loss,mmd,propensity_score, empirical_KR
from causal_cocycle.kernels import *
from causal_cocycle.regression_functionals import *
from causal_cocycle.distribution_estimation import *

In [2]:
# Load the 401(k) data as a DataFrame; keep a raw array copy and the column labels
Data = fetch_401K(return_type='DataFrame')
data = Data.to_numpy()
column_labels = list(Data.columns)
names = np.array(column_labels)

In [3]:
# Data set-up (outside wrapper function)

# Column names: model inputs (treatment indicator included), treatment, outcome
covariates = ['age', 'inc', 'educ', 'fsize', 'marr',
              'twoearn', 'db', 'pira', 'hown', 'e401']
treatment = ["e401"]
outcome = ["net_tfa"]

# Input matrix, rearranged so that the treatment indicator sits in column 0
# and the remaining columns keep their relative order
X = Data[Data.columns.intersection(covariates)]
names_x = np.array(list(X.columns))
treatment_ind = np.where(names_x == "e401")[0][0]
X = X.to_numpy()
n_cols = X.shape[1]
cols_order = [treatment_ind, *range(treatment_ind), *range(treatment_ind + 1, n_cols)]
X = X[:,cols_order]

# Outcome vector, then both arrays as tensors (Y reshaped to an N x 1 column)
N = X.shape[0]
Y = Data[Data.columns.intersection(outcome)].to_numpy().reshape(N,)
X = torch.tensor(X)
Y = torch.tensor(Y).view(N,1)

In [4]:
# Method + opt set up (outside wrapper function)

# Treatment effect estimation
splits = 5                 # number of sample splits looped over in the estimation cells
estimator = "S-estimator"  # the estimation cells branch on "S-estimator" / "T-estimator"

# RKHS training and CV
ls_method = "med_heuristic"
hyper_lambda = 2**torch.linspace(-10,0,5)
hyper_ls = 2**torch.linspace(-1,1,5)
train_val_split = 0.8
# round(), not int(): 1/(1-p) is a float ratio and can land just below the
# intended integer (p = 0.95 gives 19.99999999999998, which int() truncates to 19).
folds = round(1/(1-train_val_split))

# Flattened (lambda, lengthscale) grid: every lambda is paired with every lengthscale
hyper_grid_lambda = hyper_lambda.repeat(len(hyper_ls))
hyper_grid_ls = torch.repeat_interleave(hyper_ls,len(hyper_lambda))

# Propensity model training (passed to propensity_model.optimise)
subsample = True
subsamples = 1024
miniter = 500
maxiter = 500

# Optimiser settings, kept as two parallel lists (argument names / values, same order)
# NOTE(review): learn_rate, scheduler, batch_size, weight_decay and val_batch_size
# are not assigned in any cell shown above this one; unless one of the star
# imports provides them, this cell raises NameError on a fresh kernel.
_opt_settings = (
    ("learn_rate", learn_rate),
    ("scheduler", scheduler),
    ("batch_size", batch_size),
    ("maxiter", maxiter),
    ("miniter", miniter),
    ("weight_decay", weight_decay),
    ("print_", True),
    ("val_batch_size", val_batch_size),
)
opt_args = [arg_name for arg_name, _ in _opt_settings]
opt_argvals = [arg_value for _, arg_value in _opt_settings]

# Hyperparameter names and candidate values (replicated per model in a later cell)
hyper = ["weight_decay"]
hyper_val = [0]

# Shorthand constructor for the ReLU network conditioner
def NN(i,o=1,width=128,layers=2):
    """Build an ``NN_RELU_Conditioner`` with bias enabled.

    i      -- forwarded as ``input_dims``
    o      -- forwarded as ``output_dims`` (default 1)
    width  -- forwarded as ``width`` (default 128)
    layers -- forwarded as ``layers`` (default 2)
    """
    conditioner_kwargs = dict(width = width,
                              layers = layers,
                              input_dims = i,
                              output_dims = o,
                              bias = True)
    return NN_RELU_Conditioner(**conditioner_kwargs)


In [5]:
# Storage objects
# One slot per sample split for each ATE estimate (plug-in, IPW, doubly robust)
ATE_PI, ATE_IPW, ATE_DR = (torch.zeros(splits) for _ in range(3))
# Per-split weights for the treated (1) and control (0) arms, appended by the estimation loops
weights1 = []
weights0 = []

In [6]:
# Specifying dimensions
# D: model input width (treatment column excluded for the T-estimator)
# P: input width of the propensity kernel (treatment column excluded for the S-estimator)
N = X.shape[0]
n_features = X.shape[1]
D = n_features - int(estimator == "T-estimator")
P = D - int(estimator == "S-estimator")

# Shuffling data with a fixed seed
# (X and Y are overwritten here, so running this cell twice permutes them again)
torch.manual_seed(0)
shuffled_inds = torch.randperm(Y.shape[0])
X, Y = X[shuffled_inds], Y[shuffled_inds]

# Scaling data: columns with more than two distinct values are divided by
# their standard deviation; columns with at most two keep a scale of 1
scale_X = torch.ones(n_features)
for col in range(n_features):
    column = X[:, col]
    if torch.unique(column).numel() > 2:
        scale_X[col] = column.var(0)**0.5
scale_Y = Y.var(0)**0.5
Xscale = X/scale_X
Yscale = Y/scale_Y

# Getting sample splits for DR estimation
Xsplits, Ysplits = get_CV_splits(Xscale,splits), get_CV_splits(Yscale,splits)

In [7]:
# Specifying models for cross-validation
# NOTE(review): `cocycle`, `width`, `layers` and `RQS_bins` are not assigned in any
# cell shown above this one; they must be defined before this cell runs.
#
# Each transformer layer gets its OWN conditioner network. `[NN(...)] * k` would
# repeat one and the same object k times, so every layer would be driven by a
# single shared network.
if cocycle == "linear":
    conditioners_list = [[Lin_Conditioner(D,1)]]
    transformers_list = [Transformer([Shift_layer()])]
elif cocycle == "additive":
    conditioners_list = [[NN(D,1,width,layers)]]
    transformers_list = [Transformer([Shift_layer()])]
elif cocycle == "affine":
    conditioners_list = [[NN(D,1,width,layers) for _ in range(2)]]
    transformers_list = [Transformer([Shift_layer(),Scale_layer()])]
elif cocycle == "continuous":
    conditioners_list = [[NN(D,1,width,layers) for _ in range(3)]]
    transformers_list = [Transformer([Shift_layer(),Scale_layer(),RQS_layer(RQS_bins)])]
else:
    # Fail here with a clear message instead of a NameError further down
    raise ValueError(
        f"Unknown cocycle type {cocycle!r}; expected 'linear', 'additive', 'affine' or 'continuous'"
    )

# One candidate model per (conditioners, transformer) pair
models_validation = [
    cocycle_model(conditioners, transformer)
    for conditioners, transformer in zip(conditioners_list, transformers_list)
]
# Same hyperparameter names / values for every candidate model
hyper_args = [hyper]*len(conditioners_list)
hyper_argvals = [hyper_val]*len(conditioners_list)

In [8]:
# Getting loss function (using CMMD_V as scalable for validation)
# NOTE(review): `cocycle_loss` and `med_heuristic_samples` are not assigned in any
# cell shown above this one; they must be defined before this cell runs.
#
# The two kernel slots are built as separate objects. `[kernel] * 2` would place the
# same object in both slots, so any per-slot setting (presumably what
# median_heuristic does - confirm) would land on one shared kernel.
loss_fn = Loss(loss_fn = cocycle_loss,
               kernel = [gaussian_kernel(torch.ones(1),1) for _ in range(2)])
loss_fn_val = Loss(loss_fn = "CMMD_V",
                   kernel = [gaussian_kernel(torch.ones(1),1) for _ in range(2)])
loss_fn.median_heuristic(Xscale,Yscale,subsamples = med_heuristic_samples)
loss_fn_val.median_heuristic(Xscale,Yscale,subsamples = med_heuristic_samples)

  batch_inds = torch.tensor([np.random.choice(ind_list,subsamples)]).long().view(subsamples,)


## S-estimation

In [None]:
# Doing ATE estimation (S-estimator: one model with the treatment as input column 0)
# NOTE(review): validation_method, choose_best_model and retrain are not assigned in
# any cell shown above this one; they must be defined before this cell runs.
if estimator == "S-estimator":
    for k in range(splits):

        # Getting dataset
        Xtrain,Ytrain = Xsplits[k][0],Ysplits[k][0]
        Xtest,Ytest = Xsplits[k][1],Ysplits[k][1]

        # Getting model for kth split
        final_model,val_losses = validate(models_validation,
                                         loss_fn,
                                         Xtrain,
                                         Ytrain,
                                         loss_fn_val,
                                         validation_method,
                                         train_val_split,
                                         opt_args,
                                         opt_argvals,
                                         hyper_args,
                                         hyper_argvals,
                                         choose_best_model,
                                         retrain)

        # Getting interventional inputs: copies of the test inputs with the
        # treatment column forced to 1 and to 0
        Xtest1,Xtest0 = Xtest.clone(),Xtest.clone()
        Xtest1[:,0],Xtest0[:,0] = 1,0

        # Getting potential outcomes and plug-in estimator (rescaled to outcome units)
        Y1scale = final_model.cocycle(Xtest1,Xtest,Ytest).detach()
        Y0scale = final_model.cocycle(Xtest0,Xtest,Ytest).detach()
        ATE_PI[k] = (Y1scale-Y0scale).mean()*scale_Y

        # Estimating propensity model: treatment column regressed on the other inputs
        kernel = exponential_kernel(lengthscale = torch.ones(P, requires_grad = True),scale = 1)
        regressor = NW_functional(kernel)
        propensity_model = Conditional_Expectation_Regressor(regressor)
        losses = propensity_model.optimise(Xtrain[:,1:],Xtrain[:,:1].float(),
                                    subsample = subsample,
                                    miniter = miniter,
                                    maxiter = maxiter,
                                    subsamples = subsamples,
                                    nfold = folds)

        # Getting IPW weights and estimator.
        # The current fold's weights are used directly rather than read back as
        # weights1[k]: the lists only ever grow, so after a second run of this
        # cell index k would point at weights from the previous run.
        Probs = propensity_model.forward(Xtrain[:,:1],Xtrain[:,1:].float(),Xtest[:,1:].float()).detach()
        w1 = Xtest[:,:1]/Probs
        w0 = (1-Xtest[:,:1])/(1-Probs)
        weights1.append(w1)
        weights0.append(w0)
        ATE_IPW[k] = ((w1-w0)*Ytest*scale_Y).mean()

        # Getting conditional expectations and DR estimator:
        # plug-in + IPW - weighted conditional means
        EY1 = final_model.cocycle_outer(Xtest1,Xtest,Ytest).detach().mean(1)*scale_Y
        EY0 = final_model.cocycle_outer(Xtest0,Xtest,Ytest).detach().mean(1)*scale_Y
        ATE_DR[k] = ATE_PI[k]+ATE_IPW[k]
        ATE_DR[k] += (w0[:,0]*EY0 - w1[:,0]*EY1).mean()

Training loss last 10 avg is : tensor(-0.5159)
98.0  % completion
Finished optimising final model
iter 0 , loss =  tensor(0.2321)
iter 10 , loss =  tensor(0.2220)
iter 20 , loss =  tensor(0.2190)
iter 30 , loss =  tensor(0.2223)
iter 40 , loss =  tensor(0.2110)
iter 50 , loss =  tensor(0.2153)
iter 60 , loss =  tensor(0.2065)


In [None]:
# Sanity check: largest value in `Probs` as left by the last split processed above
Probs.max()

## T-estimation

In [None]:
# Doing ATE estimation (T-estimator: separate models for treated and control units)
# NOTE(review): validation_method, choose_best_model and retrain are not assigned in
# any cell shown above this one; they must be defined before this cell runs.
if estimator == "T-estimator":
    Utest1_splits,Utest0_splits = [],[]
    for k in range(splits):

        # Getting dataset
        Xtrain,Ytrain = Xsplits[k][0],Ysplits[k][0]
        Xtest,Ytest = Xsplits[k][1],Ysplits[k][1]
        treated_train,control_train = Xtrain[:,0]==1,Xtrain[:,0]==0
        treated_test,control_test = Xtest[:,0]==1,Xtest[:,0]==0

        # Splitting dataset into treated and control (treatment column dropped).
        # The control training arm is cut from the TRAIN fold; the train-fold mask
        # does not line up with the test fold's rows.
        Y_1train,X_1train = Ytrain[treated_train], Xtrain[treated_train,1:]
        Y_0train,X_0train = Ytrain[control_train], Xtrain[control_train,1:]
        Y_1test,X_1test = Ytest[treated_test], Xtest[treated_test,1:]
        Y_0test,X_0test = Ytest[control_test], Xtest[control_test,1:]

        # T-estimation
        # NOTE(review): both calls receive the same `models_validation` objects; if
        # `validate` trains them in place, final_model1 and final_model0 could end up
        # being the same network - confirm that it returns independent models.
        final_model1,val_losses1 = validate(models_validation,
                                             loss_fn,
                                             X_1train,
                                             Y_1train,
                                             loss_fn_val,
                                             validation_method,
                                             train_val_split,
                                             opt_args,
                                             opt_argvals,
                                             hyper_args,
                                             hyper_argvals,
                                             choose_best_model,
                                             retrain)

        final_model0,val_losses0 = validate(models_validation,
                                             loss_fn,
                                             X_0train,
                                             Y_0train,
                                             loss_fn_val,
                                             validation_method,
                                             train_val_split,
                                             opt_args,
                                             opt_argvals,
                                             hyper_args,
                                             hyper_argvals,
                                             choose_best_model,
                                             retrain)

        # Estimating KR transport between the two arms' base samples
        U_1train = final_model1.inverse_transformation(X_1train,Y_1train).detach()
        U_0train = final_model0.inverse_transformation(X_0train,Y_0train).detach()
        KR_map = empirical_KR(U_0train,U_1train)

        # Getting base distribution test samples for entire fold:
        # each unit's own arm first, then the other arm via the KR map
        U_1test = final_model1.inverse_transformation(X_1test,Y_1test).detach()
        U_0test = final_model0.inverse_transformation(X_0test,Y_0test).detach()
        Utest1,Utest0 = torch.zeros((len(treated_test),1)), torch.zeros((len(treated_test),1))
        Utest1[treated_test],Utest0[control_test] = U_1test,U_0test
        # NOTE(review): KR_map is built as (control, treated) and `forward` is applied in
        # both directions here; confirm whether treated -> control needs an inverse map.
        Utest1[control_test],Utest0[treated_test] = KR_map.forward(U_0test),KR_map.forward(U_1test)

        # Saving samples for re-use
        Utest1_splits.append(Utest1)
        Utest0_splits.append(Utest0)

        # Getting potential outcomes and plug-in estimator (rescaled to outcome units)
        Y1scale =  final_model1.transformation(Xtest[:,1:],Utest1).detach()
        Y0scale =  final_model0.transformation(Xtest[:,1:],Utest0).detach()
        ATE_PI[k] = (Y1scale-Y0scale).mean()*scale_Y

        # Estimating propensity model with the shared propensity-training settings,
        # matching the S-estimator cell
        kernel = exponential_kernel(lengthscale = torch.ones(P, requires_grad = True),scale = 1)
        regressor = NW_functional(kernel)
        propensity_model = Conditional_Expectation_Regressor(regressor)
        losses = propensity_model.optimise(Xtrain[:,1:],Xtrain[:,:1].float(),
                                    subsample = subsample,
                                    miniter = miniter,
                                    maxiter = maxiter,
                                    subsamples = subsamples,
                                    nfold = folds)

        # Getting IPW weights and estimator.
        # Probs is detached (the kernel lengthscale requires grad) and the inputs are
        # cast exactly as in the S-estimator cell. The current fold's weights are used
        # directly rather than read back as weights1[k], which would point at stale
        # entries if the lists already hold results from an earlier run.
        Probs = propensity_model.forward(Xtrain[:,:1],Xtrain[:,1:].float(),Xtest[:,1:].float()).detach()
        w1 = Xtest[:,:1]/Probs
        w0 = (1-Xtest[:,:1])/(1-Probs)
        weights1.append(w1)
        weights0.append(w0)
        ATE_IPW[k] = ((w1-w0)*Ytest*scale_Y).mean()

        # Getting conditional expectations and DR estimator:
        # plug-in + IPW - weighted conditional means
        EY1 =  final_model1.transformation_outer(Xtest[:,1:],Utest1).detach().mean(1)*scale_Y
        EY0 =  final_model0.transformation_outer(Xtest[:,1:],Utest0).detach().mean(1)*scale_Y

        ATE_DR[k] = ATE_PI[k]+ATE_IPW[k]
        ATE_DR[k] += (w0[:,0]*EY0 - w1[:,0]*EY1).mean()

In [None]:
# Average of each ATE estimate over the sample splits (order: plug-in, doubly robust, IPW)
split_means = [ATE_PI.mean(), ATE_DR.mean(), ATE_IPW.mean()]
print(*split_means)