In [None]:
!pip install econml

In [None]:
# External libraries required for the majority of the assignment; more will be added as further tasks are completed
from econml.metalearners import XLearner
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt

In [None]:
class Metrics:
    """
    Evaluation metrics for treatment-effect estimators.

    ``pehe`` and ``abs_ate`` compare predicted effects against true effects;
    ``abs_att`` and ``policy_risk`` are computed from factual outcomes together
    with the treatment indicator and the experimental-group indicator.
    """

    def pehe(self, effect_true, effect_pred):
        """
        Precision in Estimating the Heterogeneous Treatment Effect (PEHE)

        Root of the mean squared difference between the true and the predicted
        effects. The two arrays are subtracted element-wise, so they should have
        the same shape (a (n, 1) / (n,) pair would broadcast to (n, n)).

        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: PEHE
        """
        return np.sqrt(np.mean((effect_true - effect_pred)**2))

    def abs_ate(self, effect_true, effect_pred):
        """
        Absolute error for the Average Treatment Effect (ATE)

        Absolute difference between the mean predicted effect and the mean
        true effect.

        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: absolute error on ATE
        """
        return np.abs(np.mean(effect_pred) - np.mean(effect_true))

    @staticmethod
    def abs_att(effect_pred, yf, t, e):
        """
        Absolute error for the Average Treatment Effect on the Treated

        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: absolute error on ATT
        """
        # Reference ATT: mean outcome of the treated minus mean outcome of the
        # controls that belong to the experimental group ((1 - t + e) > 1).
        att_true = np.mean(yf[t > 0]) - np.mean(yf[(1 - t + e) > 1])
        # Predicted ATT: mean predicted effect over treated experimental units.
        att_pred = np.mean(effect_pred[(t + e) > 1])

        return np.abs(att_pred - att_true)

    @staticmethod
    def policy_risk(effect_pred, yf, t, e):
        """
        Computes the risk of the policy defined by predicted effect

        The policy treats a unit whenever its predicted effect is positive.

        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: policy risk (1 - policy value), or NaN if any prediction on
                 the experimental units is NaN
        """
        # Consider only the cases for which we have experimental data (i.e., e > 0)
        t_e = t[e > 0]
        yf_e = yf[e > 0]
        effect_pred_e = effect_pred[e > 0]

        if np.any(np.isnan(effect_pred_e)):
            return np.nan

        policy = effect_pred_e > 0.0
        # Units whose observed treatment agrees with the policy, split by arm.
        treat_overlap = (policy == t_e) * (t_e > 0)
        control_overlap = (policy == t_e) * (t_e < 1)

        # An arm without any overlapping unit contributes a value of 0.
        if np.sum(treat_overlap) == 0:
            treat_value = 0
        else:
            treat_value = np.mean(yf_e[treat_overlap])

        if np.sum(control_overlap) == 0:
            control_value = 0
        else:
            control_value = np.mean(yf_e[control_overlap])

        # Weight both arm values by the share of units the policy would treat.
        pit = np.mean(policy)
        policy_value = pit * treat_value + (1.0 - pit) * control_value

        return 1.0 - policy_value


# Shared instance used to call the metrics.
metrics = Metrics()

## Data Exploration, Preprocessing and Modelling

In [None]:
# Arrays stored in the IHDP archive:
#   x   = background variables
#   t   = treatment variable (support or no support)
#   yf  = outcome variable (factual)
#   ycf = outcome variable (counterfactual)
#   ite = individual treatment effect
data = np.load('../input/datasetihdp/ihdp.npz')
# Report every stored array together with its shape.
for array_name in data.files:
    print(f'{array_name}: {data[array_name].shape}')

In [None]:
# Pull each array out of the loaded archive under its own name.
df_x = data['x']
df_t = data['t']
df_yf = data['yf']
df_ycf = data['ycf']
df_ite = data['ite']
# The average treatment effect is the mean of the individual treatment effects.
print('ATE : ', np.mean(df_ite))

In [None]:
# Wrap each array in a DataFrame for inspection with pandas.
IHDP_x = pd.DataFrame(df_x)
IHDP_t = pd.DataFrame(df_t)
IHDP_yf = pd.DataFrame(df_yf)
IHDP_ycf = pd.DataFrame(df_ycf)
IHDP_ite = pd.DataFrame(df_ite)
# Column dtypes and non-null counts of the background variables.
IHDP_x.info()

There appear to be no missing or non-numerical values in the IHDP dataset, so no preprocessing is needed for encoding or for filling NaN rows.

In [None]:
# Summary statistics, transposed so that each background variable is one row.
IHDP_x.describe().transpose()

In [None]:
# Pairwise relationships between all background variables.
sns.pairplot(IHDP_x)

In [None]:
bins = 20
fig, axs = plt.subplots(1, 5, figsize=(16, 4))
# One panel per array, left to right: x, t, yf, ycf, ite.
for panel, values in zip(axs, (df_x, df_t, df_yf, df_ycf, df_ite)):
    panel.hist(values, bins=bins)
plt.show()

In [None]:
# One histogram per background-variable column.
IHDP_x.hist(
    figsize=(12, 10),
    bins=25,
)

In [None]:
limit = 150
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
# Background variable 0 against the outcome, split by treatment status and
# capped at `limit` points per group. The left panel uses the factual outcome
# (yf), the right panel the counterfactual outcome (ycf).
covariate = df_x[:, 0].reshape(-1, 1)
for panel, outcome in zip(axs, (df_yf, df_ycf)):
    for arm, arm_label in ((1, "Treated"), (0, "Control")):
        in_arm = df_t == arm
        panel.scatter(covariate[in_arm][:limit], outcome[in_arm][:limit], label=arm_label)
for panel in axs:
    panel.legend(ncol=2)
plt.show()

In [None]:
limit = 150
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
# Background variable 1 against the outcome, split by treatment status and
# capped at `limit` points per group. The left panel uses the factual outcome
# (yf), the right panel the counterfactual outcome (ycf).
covariate = df_x[:, 1].reshape(-1, 1)
for panel, outcome in zip(axs, (df_yf, df_ycf)):
    for arm, arm_label in ((1, "Treated"), (0, "Control")):
        in_arm = df_t == arm
        panel.scatter(covariate[in_arm][:limit], outcome[in_arm][:limit], label=arm_label)
for panel in axs:
    panel.legend(ncol=2)
plt.show()

In [None]:
plt.figure(figsize=(18, 10))
# Pairwise correlations between the background variables, annotated per cell
# and drawn on a fixed [-1, 1] colour scale.
ihdp_corr = IHDP_x.corr()
heatmap = sns.heatmap(ihdp_corr, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 12});

In [None]:
bins = 20
# The figure size must be passed to plt.figure(); assigning to `plt.figsize`
# only creates an unused attribute on the pyplot module and changes nothing.
plt.figure(figsize=(16, 4))
# Distribution of the treatment indicator (control vs. treated counts).
plt.hist(df_t, bins=bins, color="orange")
plt.title("IHDP Control and treatment Distribution", fontsize=12, fontweight="bold")
plt.show()

### Standardizing and splitting


In [None]:
# Hold out 20% of the samples for testing. The fixed random_state makes the
# split, and therefore everything fitted on it, reproducible across kernel
# restarts.
x_train, x_test, t_train, t_test, yf_train, yf_test, ite_train, ite_test = train_test_split(
    df_x, df_t, df_yf, df_ite, test_size=0.2, random_state=42
)

In [None]:
# Wrap the training features, test features and training outcomes in
# DataFrames so that column ranges can be addressed with .iloc when scaling.
temp_X_IHDP, temp_X_test_IHDP, temp_yf_IHDP = (
    pd.DataFrame(part) for part in (x_train, x_test, yf_train)
)

#### Scaling the data

Columns 0-5 all require conventional scaling; the remaining columns are binary and so do not. We also know from previous modelling that our outcome column requires standard scaling.

In [None]:
# IHDP
# Scale the first 6 feature columns (0-5, all non binary); the slice end is
# exclusive, so columns 0-5 are selected with 0:6.
# The scaler is fitted on the training split only and then reused for the test
# split, so both splits share one scale and no test-set statistics are used.
x_scaler = StandardScaler()
temp_X_IHDP.iloc[:, 0:6] = x_scaler.fit_transform(temp_X_IHDP.iloc[:, 0:6])
temp_X_test_IHDP.iloc[:, 0:6] = x_scaler.transform(temp_X_test_IHDP.iloc[:, 0:6])
# Scale our outcomes column; the fitted scaler is kept so that predictions can
# be mapped back to the original scale with yf_scaler.inverse_transform().
yf_scaler = StandardScaler()
yf_train_Stan = yf_scaler.fit_transform(temp_yf_IHDP)
# Plain numpy versions of the scaled feature matrices.
x_train_Stan = temp_X_IHDP.to_numpy()
x_test_Stan = temp_X_test_IHDP.to_numpy()