In [None]:
!pip install econml

In [None]:
#Required Libraries
from econml.metalearners import XLearner
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
import numpy as np
import pandas as pd
import seaborn as sns
import scipy.stats as st
import matplotlib.pyplot as plt

In [None]:
class metrics:
    """Evaluation metrics for treatment-effect estimators.

    ``pehe`` and ``abs_ate`` are instance methods (they take ``self``);
    ``abs_att`` and ``policy_risk`` are static methods.
    """

    def pehe(self, effect_true, effect_pred):
        """
        Precision in Estimating the Heterogeneous Treatment Effect (PEHE)

        Root of the mean squared difference between the true and predicted
        per-unit effects, so errors of opposite sign do not cancel out.

        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: PEHE
        """
        return np.sqrt(np.mean((effect_true - effect_pred) ** 2))

    def abs_ate(self, effect_true, effect_pred):
        """
        Absolute error for the Average Treatment Effect (ATE)

        Absolute difference between the mean predicted effect and the mean
        true effect.

        :param effect_true: true treatment effect value
        :param effect_pred: predicted treatment effect value
        :return: absolute error on ATE
        """
        return np.abs(np.mean(effect_pred) - np.mean(effect_true))

    @staticmethod
    def abs_att(effect_pred, yf, t, e):
        """
        Absolute error for the Average Treatment Effect on the Treated
        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: absolute error on ATT
        """
        # Reference ATT: mean outcome of all treated units minus the mean
        # outcome of units with (1 - t + e) > 1, i.e. experimental controls
        # when t and e are 0/1 indicators.
        att_true = np.mean(yf[t > 0]) - np.mean(yf[(1 - t + e) > 1])
        # Predicted ATT: mean predicted effect over units with (t + e) > 1,
        # i.e. experimental treated units for 0/1 indicators.
        att_pred = np.mean(effect_pred[(t + e) > 1])

        return np.abs(att_pred - att_true)

    @staticmethod
    def policy_risk(effect_pred, yf, t, e):
        """
        Computes the risk of the policy defined by predicted effect

        The policy treats a unit when its predicted effect is positive.
        The risk is ``1 - policy_value``.

        :param effect_pred: predicted treatment effect value
        :param yf: factual (observed) outcome
        :param t: treatment status (treated/control)
        :param e: whether belongs to the experimental group
        :return: policy risk, or NaN if any experimental prediction is NaN
        """
        # Consider only the cases for which we have experimental data (i.e., e > 0)
        experimental = e > 0
        t_e = t[experimental]
        yf_e = yf[experimental]
        effect_pred_e = effect_pred[experimental]

        if np.any(np.isnan(effect_pred_e)):
            return np.nan

        policy = effect_pred_e > 0.0
        # Units whose observed treatment agrees with the policy, split by arm.
        agrees = policy == t_e
        treat_overlap = agrees & (t_e > 0)
        control_overlap = agrees & (t_e < 1)

        # An arm with no agreeing units contributes a value of 0.
        treat_value = np.mean(yf_e[treat_overlap]) if np.sum(treat_overlap) > 0 else 0
        control_value = np.mean(yf_e[control_overlap]) if np.sum(control_overlap) > 0 else 0

        # Fraction of experimental units the policy would treat.
        pit = np.mean(policy)
        policy_value = pit * treat_value + (1.0 - pit) * control_value

        return 1.0 - policy_value


In [None]:
# Rebind the name `metrics` from the class to a shared instance. The guard keeps
# this cell re-runnable: once `metrics` is an instance, calling it again would
# raise TypeError.
if isinstance(metrics, type):
    metrics = metrics()


### Data exploration, Preprocessing & Modelling

In [None]:
# Load the Jobs dataset archive.
df = np.load('../input/datatest/jobs.npz')
# x = feature variables, t = treatment, y = outcome variable (factual),
# e = experimental or observational data
for f in df.files:
    print(f'{f}: {df[f].shape}')
jx, jt, jy, je = df['x'], df['t'], df['y'], df['e']
# Each DataFrame is built from a fresh read of the archive, so it does not
# share memory with the raw arrays above.
dfX, dfT, dfY, dfE = (pd.DataFrame(df[key]) for key in ('x', 't', 'y', 'e'))
# DataFrame.info() prints its report itself and returns None, so it must not
# be wrapped in print().
dfX.info()


In [None]:
# Summary statistics of the feature columns, transposed so each feature is a row.
dfX.describe().transpose()

In [None]:
# Box plot of every feature column on one set of axes.
dfX.boxplot()

In [None]:
# Grid of pairwise plots across all feature columns.
sns.pairplot(data=dfX)

In [None]:


# One histogram per loaded array; the titles identify which array each panel shows.
fig, axs = plt.subplots(1, 4, figsize=(16, 4))
panels = [
    (dfX, "Features (x)"),
    (dfT, "Treatment (t)"),
    (jy, "Outcome (y)"),
    (je, "Experimental flag (e)"),
]
for ax, (values, panel_title) in zip(axs, panels):
    ax.hist(values, bins=20)
    ax.set_title(panel_title)
plt.show()

In [None]:
# Annotated correlation matrix of the feature columns on a fixed [-1, 1] colour scale.
plt.figure(figsize=(16, 10))
feature_corr = dfX.corr()
heatmap = sns.heatmap(feature_corr, annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 12});

Because there appear to be no missing values or non-numerical values in Jobs, there is no need for the encoding and NaN-filling preprocessing that was required for the IHDP dataset. Jobs, like IHDP, has a lot of outliers in the background variables, which calls for a similar experiment with a normalisation approach. However, we may investigate random forest regression models, which should handle outliers internally and reduce the likelihood of our results being skewed or biased in any way.

In [None]:
# Separate 25-bin histogram for each feature column.
dfX.hist(bins=25,figsize=(14,10))


The background variables in Jobs seem to be unbalanced.

In [None]:
# Outcome against background features 0 and 1 for the first `limit` treated
# and the first `limit` control units.
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
limit = 20
# Flatten so the masks work whether t / y are stored as (n,) or (n, 1).
treatment_flat = np.ravel(jt)
outcome_flat = np.ravel(jy)
for ax, col in zip(axs, (0, 1)):
    for label, flag in (("Treated", 1), ("Control", 0)):
        mask = treatment_flat == flag
        ax.scatter(jx[mask, col][:limit], outcome_flat[mask][:limit], label=label)
    ax.set_xlabel(f"Feature {col}")
    ax.set_ylabel("Outcome (y)")
    ax.legend(ncol=2)
plt.show()

In contrast to IHDP, jobs outcomes are recorded as binary variables, therefore scatter points are only plotted on 0 and 1. Given the four graphs above, it's far more difficult to spot noticeable effects; yet, given the background variables we've chosen to depict, this could just be a coincidence.

In [None]:
# Outcome against background features 2 and 3 for the first `limit` treated
# and the first `limit` control units.
fig, axs = plt.subplots(1, 2, figsize=(16, 4))
limit = 20
# Flatten so the masks work whether t / y are stored as (n,) or (n, 1).
treatment_flat = np.ravel(jt)
outcome_flat = np.ravel(jy)
for ax, col in zip(axs, (2, 3)):
    for label, flag in (("Treated", 1), ("Control", 0)):
        mask = treatment_flat == flag
        ax.scatter(jx[mask, col][:limit], outcome_flat[mask][:limit], label=label)
    ax.set_xlabel(f"Feature {col}")
    ax.set_ylabel("Outcome (y)")
    ax.legend(ncol=2)
plt.show()

In [None]:
# Histogram of the treatment indicator.
bins = 20
treat_fig, treat_ax = plt.subplots()
treat_ax.hist(jt, bins=bins, color='hotpink')
treat_ax.set_title("Treatment and Control Distribution")
plt.show()

The graphs above demonstrate why we need to apply X-learner. In both datasets, there is an obvious imbalance between the treatment and control groups; hopefully, X-learner will be able to account for this when calculating our CATE value.

### Data Modelling and Standardizing


In [None]:
# Hold out 20% of the units. The fixed random_state makes the split, and every
# result derived from it, reproducible across kernel restarts.
jx_train, jx_test, jt_train, jt_test, jy_train, jy_test, je_train, je_test = train_test_split(
    jx, jt, jy, je, test_size=0.2, random_state=42
)

In [None]:
# Wrap the train and test feature matrices in DataFrames so columns can be
# selected by position in the scaling step.
temp_XJ, temp_XJ_t = (pd.DataFrame(split) for split in (jx_train, jx_test))
temp_XJ.head()

In [None]:
# Jobs
# Scale columns 0,1,6,7,8,9,10,11,12,15 (all non binary)
continuous_cols = [0, 1, 6, 7, 8, 9, 10, 11, 12, 15]
# Fit the scaler on the training split only, then apply the same training
# mean and standard deviation to the test split. Fitting a second scaler on the
# test split would put the two splits on different scales and leak test-set
# statistics into the evaluation.
jobs_scaler = StandardScaler().fit(temp_XJ.iloc[:, continuous_cols].to_numpy())
temp_XJ.iloc[:, continuous_cols] = jobs_scaler.transform(temp_XJ.iloc[:, continuous_cols].to_numpy())
temp_XJ_t.iloc[:, continuous_cols] = jobs_scaler.transform(temp_XJ_t.iloc[:, continuous_cols].to_numpy())
Jobs_xtrain_stan = temp_XJ.to_numpy()
Jobs_xtest_stan = temp_XJ_t.to_numpy()
temp_XJ.head()