In [1]:
!pip install econml

In [2]:
# External Libaries required for the majority of the assignment, more to be added as more tasks are completed 
from econml.metalearners import XLearner
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt

In [3]:
class Metrics:
    
    def pehe(self,effect_true, effect_pred):
        """
        Precision in Estimating the Heterogeneous Treatment Effect (PEHE)
        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: PEHE
        """
        return np.abs(np.mean(effect_pred) - np.mean(effect_true))

    def abs_ate(self,effect_true, effect_pred):
        """
        Absolute error for the Average Treatment Effect (ATE)
        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: absolute error on ATE
        """
        return np.sqrt(np.mean((effect_true - effect_pred)**2))
    @staticmethod
    def abs_att(effect_pred, yf, t, e):
        """
        Absolute error for the Average Treatment Effect on the Treated
        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: absolute error on ATT
        """
        att_true = np.mean(yf[t > 0]) - np.mean(yf[(1 - t + e) > 1])
        att_pred = np.mean(effect_pred[(t + e) > 1])

        return np.abs(att_pred - att_true)
    @staticmethod
    def policy_risk(effect_pred, yf, t, e):
        """
        Computes the risk of the policy defined by predicted effect
        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: policy risk
        """
        # Consider only the cases for which we have experimental data (i.e., e > 0)
        t_e = t[e > 0]
        yf_e = yf[e > 0]
        effect_pred_e = effect_pred[e > 0]

        if np.any(np.isnan(effect_pred_e)):
            return np.nan

        policy = effect_pred_e > 0.0
        treat_overlap = (policy == t_e) * (t_e > 0)
        control_overlap = (policy == t_e) * (t_e < 1)

        if np.sum(treat_overlap) == 0:
            treat_value = 0
        else:
            treat_value = np.mean(yf_e[treat_overlap])

        if np.sum(control_overlap) == 0:
            control_value = 0
        else:
            control_value = np.mean(yf_e[control_overlap])

        pit = np.mean(policy)
        policy_value = pit * treat_value + (1.0 - pit) * control_value

        return 1.0 - policy_value
metrics = Metrics()

## Data Exploration, Preprocessing and Modelling

In [4]:
# x =  Background Variables, t = Treatment Variable (Support or no support), yf = Outcome Variable (Factual)
# ycf = Outcome Variable (Counterfactual), ite = individual treatment effect
data = np.load('../input/datasetihdp/ihdp.npz')
for f in data.files:
  print(f'{f}: {data[f].shape}')

In [5]:
df_x,df_t,df_yf,df_ycf,df_ite = data['x'],data['t'],data['yf'], data['ycf'],data['ite']
print('ATE : ', np.mean(df_ite))

In [6]:
IHDP_x,IHDP_t,IHDP_yf,IHDP_ycf, IHDP_ite = pd.DataFrame(df_x),pd.DataFrame(df_t), pd.DataFrame(df_yf), pd.DataFrame(df_ycf), pd.DataFrame(df_ite)
IHDP_x.info()

There appears to be no missing data or non-numerical values from the IHDP dataset therefore no preprocessing is needed in regards to encoding and filling Nan rows.

In [7]:
IHDP_x.describe().T

In [8]:
sns.pairplot(data=IHDP_x)

In [9]:
bins=20
fig, axs = plt.subplots(1, 5, figsize=(16, 4))
axs[0].hist(df_x, bins=bins)
axs[1].hist(df_t, bins=bins)
axs[2].hist(df_yf, bins=bins)
axs[3].hist(df_ycf, bins=bins)
axs[4].hist(df_ite, bins=bins)
plt.show()

In [10]:
IHDP_x.hist(bins=25,figsize=(12,10))

In [11]:
limit = 150
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
# These scatterplots have only been made via factual outcomes 
# More scatterplots could be made to model the counterfactual outcomes
axs[0].scatter(df_x[:, 0].reshape(-1, 1)[df_t == 1][:limit]
               , df_yf[df_t == 1][:limit], label = "Treated")
axs[0].scatter(df_x[:, 0].reshape(-1, 1)[df_t == 0][:limit]
               , df_yf[df_t == 0][:limit], label = "Control")
axs[1].scatter(df_x[:, 0].reshape(-1, 1)[df_t == 1][:limit]
               , df_ycf[df_t == 1][:limit], label = "Treated")
axs[1].scatter(df_x[:, 0].reshape(-1, 1)[df_t == 0][:limit]
               , df_ycf[df_t == 0][:limit], label = "Control")
axs[0].legend(ncol=2)
axs[1].legend(ncol=2)
plt.show()

In [12]:
limit = 150
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
# These scatterplots have only been made via factual outcomes 
# More scatterplots could be made to model the counterfactual outcomes
axs[0].scatter(df_x[:, 1].reshape(-1, 1)[df_t == 1][:limit]
               , df_yf[df_t == 1][:limit], label = "Treated")
axs[0].scatter(df_x[:, 1].reshape(-1, 1)[df_t == 0][:limit]
               , df_yf[df_t == 0][:limit], label = "Control")
axs[1].scatter(df_x[:, 1].reshape(-1, 1)[df_t == 1][:limit]
               , df_ycf[df_t == 1][:limit], label = "Treated")
axs[1].scatter(df_x[:, 1].reshape(-1, 1)[df_t == 0][:limit]
               , df_ycf[df_t == 0][:limit], label = "Control")
axs[0].legend(ncol=2)
axs[1].legend(ncol=2)
plt.show()

In [13]:
plt.figure(figsize=(18, 10))
heatmap = sns.heatmap(IHDP_x.corr(), vmin=-1, vmax=1, annot=True)
heatmap.set_title('Correlation Heatmap', fontdict={'fontsize':12}, pad=12);

In [14]:
bins=20
plt.figsize=(16, 4)
plt.hist(df_t, bins=bins, color = "orange")
plt.title("IHDP Control and treatment Distribution", fontsize=12, fontweight="bold")
plt.show()

The above graph confirms our need to use X-learner. There is clear inbalance towards the treatment and control groups in both datasets, hopefully X-learner should be able to deal with this when calculating our CATE value. 

### Standardizing and spliting


In [15]:
x_train, x_test, t_train, t_test, yf_train, yf_test, ite_train, ite_test = train_test_split(df_x, df_t, df_yf, df_ite, test_size=0.2)

### Random Forest Regressor 

In [16]:
# Simple Random forest trained using the IHDP features + treatment
random_forest_IHDP = RandomForestRegressor() 
Concatenated_XT_train_IHDP = np.concatenate([x_train, t_train], axis=1)
random_forest_IHDP.fit(Concatenated_XT_train_IHDP, yf_train.flatten())
# t=0 - IHDP
Concatenated_XT_zeros_IHDP = np.concatenate([x_test, np.zeros_like(t_test)], axis=1)
Y0_IHDP = random_forest_IHDP.predict(Concatenated_XT_zeros_IHDP)
# t=1 - IHDP
Concatenated_XT_ones_IHDP = np.concatenate([x_test, np.ones_like(t_test)], axis=1)
Y1_IHDP = random_forest_IHDP.predict(Concatenated_XT_ones_IHDP)
#ITEs for IHDP dataset
# Effect_Estimates_IHDP
estimted_eff = Y1_IHDP - Y0_IHDP
#Metrics
ATE_IHDP = metrics.abs_ate(ite_test, estimted_eff)
PEHE_IHDP = metrics.pehe(ite_test, estimted_eff)
print('ATE, PEHE',ATE_IHDP,PEHE_IHDP )

In [17]:
results = []
results.append(['RF', ATE_IHDP, PEHE_IHDP])
cols = ['Method', 'ATE test', 'PEHE test']
df_First = pd.DataFrame(results, columns=cols)
df_First

### XGB Regressor 

Here we will start with a simple model similar to the random forest machine learning process, however we will need to standardize the data. Any features that are non binary will be standardized, including the label (assuming it's non-binary). First, we need to check for non binary columns

In [18]:
temp_X_IHDP = pd.DataFrame(x_train)
temp_X_test_IHDP = pd.DataFrame(x_test)
#temp_X_IHDP.head()
temp_yf_IHDP = pd.DataFrame(yf_train)
#temp_yf_IHDP.head()
#[temp_X_IHDP[cols].unique() for cols in temp_X_IHDP]

#### Scaling the data

Columns 0-5 all require conventional scaling, however the remainder are binary and so do not. We also know that our outcome column requires Standard scaling based on previous modelling.

In [19]:
# IHDP
# Scale the first 6 columns of our features (all non binary)
temp_X_IHDP.iloc[:, 0:5] = StandardScaler().fit_transform(temp_X_IHDP.iloc[:, 0:5])
temp_X_test_IHDP.iloc[:, 0:5] = StandardScaler().fit_transform(temp_X_test_IHDP.iloc[:, 0:5])
# Scale our outcomes column 
yf_train_Stan = StandardScaler().fit_transform(temp_yf_IHDP) 
#temp_X_IHDP.head()
x_train_Stan = temp_X_IHDP.to_numpy()
x_test_Stan = temp_X_test_IHDP.to_numpy()

In [20]:
# Concatenating the treatment and feature columns of the datasets 
Concatenated_XT_train_IHDP = np.concatenate([x_train_Stan, t_train], axis=1)
Linear_Regressor_IHDP = LinearRegression().fit(Concatenated_XT_train_IHDP, yf_train_Stan.flatten())
# t=0
Concatenated_XT_zeros_IHDP = np.concatenate([x_test_Stan, np.zeros_like(t_test)], axis=1)
Y0_IHDP = Linear_Regressor_IHDP.predict(Concatenated_XT_zeros_IHDP)
# t=1
Concatenated_XT_ones_IHDP = np.concatenate([x_test_Stan, np.ones_like(t_test)], axis=1)
Y1_IHDP = Linear_Regressor_IHDP.predict(Concatenated_XT_ones_IHDP)
#ITEs for the dataset
Effect_Estimates_IHDP_LR = Y1_IHDP - Y0_IHDP
# Metrics
ATE_IHDP_LR = metrics.abs_ate(ite_test, Effect_Estimates_IHDP_LR)
PEHE_IHDP_LR = metrics.pehe(ite_test, Effect_Estimates_IHDP_LR)

In [21]:
# Metrics for IHDP dataset
results = []
results.append(['RF', ATE_IHDP, PEHE_IHDP])
results.append(['LR', ATE_IHDP_LR, PEHE_IHDP_LR])
cols = ['Method', 'ATE test', 'PEHE test']
df_First = pd.DataFrame(results, columns=cols)
df_First

 As it can be seen,The linear regressor performs considerably worse than the Random forest model.

### Optimized Random Forest Regressor

In [22]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.linear_model import LinearRegression

class Tuning:    
    Model = RandomForestRegressor()
    random_grid = {'bootstrap': [True, False],
         'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
         'max_features': ['auto', 'sqrt'],
         'min_samples_leaf': [1, 2, 4],
         'min_samples_split': [2, 5, 10],
         'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
    param_grid = {'fit_intercept': [True, False],'positive': [True, False]}


    def random_forest(self,XTtrain, Ytrain, Weights="None", Model=Model, random_grid=random_grid):
            Optimizer = RandomizedSearchCV(estimator = self.Model, param_distributions = self.random_grid, n_iter = 100, cv = 10, verbose=2, n_jobs = -1)
            if Weights == "None":
                    Optimizer.fit(XTtrain, Ytrain.flatten())
            else:
                    Optimizer.fit(XTtrain, Ytrain.flatten(), sample_weight=Weights)
            return Optimizer.best_params_       
                
                

#### Grid Search

In [23]:
Benchmark_IHDP = RandomForestRegressor()
# Our parameter Grid uses values centered around the parameters found from the random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [5, 10 , 20],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2, 3, 4],
    'min_samples_split': [2, 4, 6],
    'n_estimators': [200, 400, 600]
}

GridSearch_RF_IHDP = GridSearchCV(estimator = Benchmark_IHDP, param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2)
GridSearch_RF_IHDP.fit(Concatenated_XT_train_IHDP, yf_train.flatten())
GridSearch_RF_IHDP.best_params_

#### Metric Evaluation

Now we can define our optimized random forest model using the results of the grid serach and evaluate their performances using the usual process 

In [24]:
Optimized_RF_IHDP = RandomForestRegressor(bootstrap=True, max_depth=5, max_features='auto', min_samples_leaf=3, min_samples_split=4, n_estimators=400)
Optimized_RF_IHDP.fit(Concatenated_XT_train_IHDP, yf_train.flatten())

# t=0 - IHDP
Concatenated_XT_zeros_IHDP = np.concatenate([x_test, np.zeros_like(t_test)], axis=1)
Y0_IHDP = Optimized_RF_IHDP.predict(Concatenated_XT_zeros_IHDP)
# t=1 - IHDP
Concatenated_XT_ones_IHDP = np.concatenate([x_test, np.ones_like(t_test)], axis=1)
Y1_IHDP = Optimized_RF_IHDP.predict(Concatenated_XT_ones_IHDP)

#ITEs for IHDP dataset
Effect_Estimated = Y1_IHDP - Y0_IHDP

Now once again, we can calculate our metric for each of the datasets

In [25]:
ATE_IHDP_Optimized = metrics.abs_ate(ite_test, Effect_Estimated)
PEHE_IHDP_Optimized = metrics.pehe(ite_test, Effect_Estimated)

In [26]:
results = []
results.append(['RF', ATE_IHDP, PEHE_IHDP])
results.append(['LR', ATE_IHDP_LR, PEHE_IHDP_LR])
results.append(['Optimized RF', ATE_IHDP_Optimized, PEHE_IHDP_Optimized])
cols = ['Method', 'ATE test', 'PEHE test']
df_First = pd.DataFrame(results, columns=cols)
df_First

As denoted by the above results, there is a clear improvement in regards to bth the average treatment effect metric and PEHE; it is clear that optimizing each of the models has a sizeable impact on their performance

### Linear Regressor - Optimized

Unfortunately there is not alot of hyperparameter tuning to be done in regards to the linear regressor, therefore we will run only a grid search to find the optimal values for what is available to us

Unfortunately, there is no real improvement in either average treatment effect or PEHE for the optimized linear regression model; we can see there is still a clear poor performance. 

### Feature Importances 

Finally we can plot and vizualise the feature importances of the data, from most useful to least useful. As the columns don't have a title, we can recognize feature by their column indices.

In [27]:
from matplotlib import pyplot

# Feature Importances for the IHDP dataset using random forest regression
Optimized_RF_IHDP.fit(Concatenated_XT_train_IHDP, yf_train.flatten())
features_IHDP_RF = list(Concatenated_XT_train_IHDP)
importances_IHDP_RF = Optimized_RF_IHDP.feature_importances_
std_IHDP = np.std([tree.feature_importances_ for tree in Optimized_RF_IHDP.estimators_], axis=0)
indices_IHDP_RF = np.argsort(importances_IHDP_RF)[::-1]
print("Top 5 Feature Importances - Random Forest")
for f in range(5):
    print("%d. %s (%f)" % (f + 1, "Feature Column: "+ str(indices_IHDP_RF[f]), importances_IHDP_RF[indices_IHDP_RF[f]]))
pyplot.bar([x for x in range(len(importances_IHDP_RF))], importances_IHDP_RF)
pyplot.title("Feature Importance for Jobs dataset using RF")
pyplot.ylabel("Feature Score")
pyplot.xlabel("Feature Number")
pyplot.show()
    
    
# # Feature Importances for the IHDP dataset using Linear regression
# LR_IHDP_Optimized.fit(Concatenated_XT_train_IHDP, IHDP_yf_train_Standardized.flatten())
# importances_IHDP_LR = LR_IHDP_Optimized.coef_
# indices_IHDP_LR = np.argsort(importances_IHDP_LR)[::-1]
# print("Top 5 Feature Importances - Linear Regressor")
# for f in range(5):
#     print("%d. %s (%f)" % (f + 1, "Feature Column: "+ str(indices_IHDP_LR[f]), importances_IHDP_LR[indices_IHDP_LR[f]]))
    

We can clearly see that the most important feature in our IHDP dataset is the feature at column index 25 (We have used column indexes as we don't have titles to refer to). Column 25 in this case is our concatenated treatment data. The feature importances for this dataset are extremely skewed. We can also see that the reliance on the treatment feature is even more apparent in linear regressors then it is for random forest. There doesn't seem to be any obvious feature importance patterns outside of treatment except for the fact that colum index 14 and 5 of the background features consistently feature in the top 5 most important. 

## Training Our Models - Propensity Score Re-weighting

For this section we are going to use a random forest classifier. First we need to train a classifer to predict propensity scores based on background features. We also need a function that calculates sample weights based on these propensity scores.

Let's train our classifier to predic propensity scores and then define each of the variables of the IPSW equation: ti, e(x) Note: e(x) can also be referred to as P(ti|xi)

In [28]:
Simple_Classifier_IHDP = RandomForestClassifier()

# These will act as our ti values 
ti_IHDP = np.squeeze(t_train)

# Classifier trained to predict propensity scores
Simple_Classifier_IHDP.fit(x_train, ti_IHDP)

# These will act as our ptx values
ex_IHDP = Simple_Classifier_IHDP.predict_proba(x_train).T[1].T + 0.0001

Given the equation: 
    
$$w_i = \frac{t_i}{e(x_i)} + \frac{1-t_i}{1-e(x_i)}$$


We can use our imported calc_weights function to calculate our sample weights 

In [29]:
import math
def Calc_Weights(ti, ex):
    return (ti / ex) + ((1-ti) / (1-ex))

In [30]:
Sample_Weights_IHDP = Calc_Weights(ti_IHDP, ex_IHDP)

Now that we have our sample weights, we can train our regressors using the weights

In [31]:
# Training variables for random forest 
Concatenated_XT_train_IHDP = np.concatenate([x_train, t_train], axis=1)
# Training variables for linear regression 
# Concatenated_XT_train_IHDP_Standardized = np.concatenate([IHDP_x_train_Standardized, IHDP_t_train], axis=1)

RandomForest_IHDP_IPSW = RandomForestRegressor()
# LinearRegressor_IHDP_IPSW = LinearRegression()

# Trained regressors
RandomForest_IHDP_IPSW.fit(Concatenated_XT_train_IHDP, yf_train.flatten(), sample_weight=Sample_Weights_IHDP)
# LinearRegressor_IHDP_IPSW.fit(Concatenated_XT_train_IHDP_Standardized, IHDP_yf_train_Standardized.flatten(), sample_weight=Sample_Weights_IHDP)

Now we can repeat previous steps for both models for hyperparameter optimization and metric evaluation, this starts with a Random and Grid search for the Random forest model.

#### Random Search

In [32]:
# tuning = Tuning()
# print(tuning.random_forest(Concatenated_XT_train_IHDP, yf_train, Weights=Sample_Weights_IHDP, Model=RandomForest_IHDP_IPSW))

#### Grid Search - Both Random forest and Linear Regressor

In [33]:
# Our parameter Grid uses values centered around the parameters found from the random search
param_grid = {
    'bootstrap': [True],
    'max_depth': [8, 10, 12],
    'max_features': ['auto', 'sqrt'],
    'min_samples_leaf': [2,3,4],
    'min_samples_split': [4, 5, 6],
    'n_estimators': [1000, 1200, 1400]
}

GridSearch_RF_IHDP_IPSW = GridSearchCV(estimator = RandomForest_IHDP_IPSW, param_grid = param_grid, cv = 10, n_jobs = -1, verbose = 2)
GridSearch_RF_IHDP_IPSW.fit(Concatenated_XT_train_IHDP, yf_train.flatten(), sample_weight=Sample_Weights_IHDP)
GridSearch_RF_IHDP_IPSW.best_params_

In [34]:
# print(Optimize_LR(Concatenated_XT_train_IHDP_Standardized, IHDP_yf_train_Standardized,Weights=Sample_Weights_IHDP, ModelTwo=LinearRegressor_IHDP_IPSW))

#### Metric Evaluation

As usual, we begin by establishing our optimal IPSW models, then calculate our predictions for treatment = 1 and treatment = 0, and then estimate treatment effects.

In [35]:
RF_IPSW_IHDP = RandomForestRegressor(bootstrap=True, max_depth=8, max_features='auto', min_samples_leaf=2, min_samples_split=5, n_estimators=1000)
RF_IPSW_IHDP.fit(Concatenated_XT_train_IHDP, yf_train.flatten(), sample_weight=Sample_Weights_IHDP)

In [36]:
# t=0 - IHDP RF
Concatenated_XT_zeros_IHDP = np.concatenate([x_test, np.zeros_like(t_test)], axis=1)
Y0_IHDP_IPSW_RF = RF_IPSW_IHDP.predict(Concatenated_XT_zeros_IHDP)
# t=1 - IHDP RF
Concatenated_XT_ones_IHDP = np.concatenate([x_test, np.ones_like(t_test)], axis=1)
Y1_IHDP_IPSW_RF = RF_IPSW_IHDP.predict(Concatenated_XT_ones_IHDP)

# # t=0 - IHDP LR
# Concatenated_XT_zeros_IHDP = np.concatenate([x_test_Stan, np.zeros_like(t_test)], axis=1)
# Y0_IHDP_IPSW_LR = LR_IPSW_IHDP.predict(Concatenated_XT_zeros_IHDP)
# # t=1 - IHDP LR
# Concatenated_XT_ones_IHDP = np.concatenate([x_test_Stan, np.ones_like(t_test)], axis=1)
# Y1_IHDP_IPSW_LR = LR_IPSW_IHDP.predict(Concatenated_XT_ones_IHDP)

In [37]:
#ITEs for Random Forest Model IHDP IPSW
estimated_effect_RF_IPSW = Y1_IHDP_IPSW_RF - Y0_IHDP_IPSW_RF
# #ITEs for Linear Regression Model IHDP IPSW
# Effect_Estimates_IHDP_LR_IPSW = Y1_IHDP_IPSW_LR - Y0_IHDP_IPSW_LR

In [38]:
# Metric calculations for random forest IPSW
# metrics = Metrics()
ATE_IHDP_RF_IPSW = metrics.abs_ate(ite_test, estimated_effect_RF_IPSW)
PEHE_IHDP_RF_IPSW = metrics.pehe(ite_test, estimated_effect_RF_IPSW)
# Metric calculations for linear regression IPSW
ATE_IHDP_LR_IPSW = metrics.abs_ate(ite_test, estimated_effect_RF_IPSW)
PEHE_IHDP_LR_IPSW = metrics.pehe(ite_test, estimated_effect_RF_IPSW)

In [39]:
results = []
results.append(['RF', ATE_IHDP, PEHE_IHDP])
results.append(['LR', ATE_IHDP_LR, PEHE_IHDP_LR])
results.append(['Optimized RF', ATE_IHDP_Optimized, PEHE_IHDP_Optimized])
# results.append(['Optimized LR', ATE_IHDP_LR_Optimized, PEHE_IHDP_LR_Optimized])
results.append(['IPSW RF', ATE_IHDP_RF_IPSW, PEHE_IHDP_RF_IPSW])
# results.append(['IPSW LR',ATE_IHDP_LR_IPSW,PEHE_IHDP_LR_IPSW])
cols = ['Method', 'ATE test', 'PEHE test']
df_First = pd.DataFrame(results, columns=cols)
df_First

The inverse propensity score re-weighted models perform similarly to the others, outperforming the standard but not the optimised models. The optimised models appear to be optimal thus far. However, we must now examine how the relative relevance of the features has changed.

### Feature Importances for IPSW 

In [40]:
# Feature Importances for the IHDP dataset using random forest regression
importances_IHDP_RF = RF_IPSW_IHDP.feature_importances_
std_IHDP = np.std([tree.feature_importances_ for tree in RF_IPSW_IHDP.estimators_], axis=0)
indices_IHDP_RF = np.argsort(importances_IHDP_RF)[::-1]
print("Feature Importances - RF IPSW")
for f in range(5):
    print("%d. %s (%f)" % (f + 1, "Feature Column: "+ str(indices_IHDP_RF[f]), importances_IHDP_RF[indices_IHDP_RF[f]]))
pyplot.bar([x for x in range(len(importances_IHDP_RF))], importances_IHDP_RF)
pyplot.title("Feature Importance for Jobs dataset using RF")
pyplot.ylabel("Feature Score")
pyplot.xlabel("Feature Number")
pyplot.show()
# # Feature Importances for the IHDP dataset using linear regression
# importances_IHDP_LR = LR_IPSW_IHDP.coef_
# indices_IHDP_LR = np.argsort(importances_IHDP_LR)[::-1]
# print("Feature Importances - LR IPSW")
# for f in range(5):
#     print("%d. %s (%f)" % (f + 1, "Feature Column: "+ str(indices_IHDP_LR[f]), importances_IHDP_LR[indices_IHDP_LR[f]]))

Interestingly, the importance of treatment has not increased. Weighting usually minimises distributional differences between the treated and control units, thus I'd expect t's importance to rise. As can be seen from the graphs above, the non-weighted model has already determined the value of the treatment feature and hence may not require the increase.

## Advanced CATE Estimators

RF as the base learner of XLearner

In [42]:
xl_IHDP = XLearner(models=RandomForestRegressor(), propensity_model=RandomForestClassifier())
xl_IHDP.fit(yf_train, t_train.flatten(), X=x_train)
xl_te_test_IHDP = xl_IHDP.effect(x_test)

ATE and PEHE metrics 

In [45]:
xl_ate_test = metrics.abs_ate(ite_test, xl_te_test_IHDP)
xl_pehe_test = metrics.pehe(ite_test, xl_te_test_IHDP)

Now we can observe the performance of all the models in this script, all in the same table 

In [46]:
results = []
results.append(['Linear Regression', ATE_IHDP_LR, PEHE_IHDP_LR])
results.append(['Random Forest', ATE_IHDP, PEHE_IHDP])
results.append(['Hyperparameter tuned Random Forest', ATE_IHDP_Optimized, PEHE_IHDP_Optimized])
# results.append(['Optimized LR', ATE_IHDP_LR_Optimized, PEHE_IHDP_LR_Optimized])
results.append(['IPSW Random Forest', ATE_IHDP_RF_IPSW, PEHE_IHDP_RF_IPSW])
# results.append(['IPSW LR',ATE_IHDP_LR_IPSW,PEHE_IHDP_LR_IPSW])
results.append(['X-Learner',xl_ate_test ,xl_pehe_test])
cols = ['Methodology', 'ATE test', 'PEHE test']
df_First = pd.DataFrame(results, columns=cols)
df_First

Looking at the above table it is clear that XLearner is by far the most consistent model, with the best ATE result and the second best PEHE result.