In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import statsmodels.api as sm
from xgboost import XGBRegressor
import warnings

import evaluationData

from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
from causalml.match import NearestNeighborMatch, MatchOptimizer, create_table_one
from causalml.propensity import ElasticNetPropensityModel
from causalml.metrics import *

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')

%matplotlib inline

In [2]:
import importlib
print(importlib.metadata.version('causalml') )

0.14.1


## BPIC2017 Dataset

In [3]:
df = pd.read_csv("bpi2017_final.csv")
print(df.columns)
df.head()

Index(['case:concept:name', 'NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'treatedCase',
       'caseSuccesful', 'treatmentSuccess', 'offerNumber', 'offerSuccess',
       'treatmentOffer', 'timeApplication', 'weekdayApplication'],
      dtype='object')


Unnamed: 0,case:concept:name,NumberOfOffers,Action,org:resource,concept:name,EventOrigin,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,CreditScore,OfferedAmount,treatedCase,caseSuccesful,treatmentSuccess,offerNumber,offerSuccess,treatmentOffer,timeApplication,weekdayApplication
0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,651433.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1,0.0,1.0,4.0,0.0,8.0,0.0,1.0,651434.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.061,2.0
2,0.0,1.0,0.0,0.0,22.0,2.0,3.0,651435.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.29,2.0
3,0.0,1.0,1.0,0.0,22.0,2.0,6.0,651437.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.613,2.0
4,0.0,1.0,0.0,0.0,21.0,2.0,3.0,651438.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.62,2.0


In [4]:
feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

In [5]:
treatment=df['treatmentOffer']
X = df[feature_names]
y=df['offerSuccess']

In [6]:
#propensity model for X- & R-Learner
p_model = ElasticNetPropensityModel()
p = p_model.fit_predict(X, treatment)
print(p)

[0.17406175 0.17406048 0.17405714 ... 0.07747542 0.07747561 0.07747409]


NameError: name 'numpy' is not defined

In [None]:
np.save('propensity-score-MetaLearners', p)

In [6]:
p = np.load('propensity-score-MetaLearners.npy')
print(p)

[0.17406175 0.17406048 0.17405714 ... 0.07747542 0.07747561 0.07747409]


In [None]:
# Ready-to-use S-Learner using LinearRegression
learner_s = LRSRegressor()
ate_s = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
print('Using the S-Learner LinearRegression')
print(ate_s)
# print('ATE estimate: {:.03f}'.format(ate_s[0][0]))
# print('ATE lower bound: {:.03f}'.format(ate_s[1][0]))
# print('ATE upper bound: {:.03f}'.format(ate_s[2][0]))

# Ready-to-use T-Learner using XGB
learner_t = XGBTRegressor()
ate_t = learner_t.estimate_ate(X=X, treatment=treatment, y=y)
print('\nUsing the XGBTRegressor class')
print(ate_t)

# Calling the Base Learner class and feeding in XGB
learner_t_xgb = BaseTRegressor(learner=XGBRegressor())
ate_t_xgb = learner_t_xgb.estimate_ate(X=X, treatment=treatment, y=y)
print('\nUsing the BaseTRegressor class and using XGB (same result):')
print(ate_t_xgb)

# Calling the Base Learner class and feeding in LinearRegression
learner_t_lr = BaseTRegressor(learner=LinearRegression())
ate_t_lr= learner_t_lr.estimate_ate(X=X, treatment=treatment, y=y)
print('\nUsing the BaseTRegressor class and using Linear Regression (different result):')
print(ate_t_lr)

# X Learner with propensity score input
# Calling the Base Learner class and feeding in XGB
learner_x_p_xgb = BaseXRegressor(learner=XGBRegressor())
ate_x_p_xgb = learner_x_p_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p)
print('Using the BaseXRegressor class and using XGB:')
print(ate_x_p_xgb)

# X Learner with propensity score input
# Calling the Base Learner class and feeding in LinearRegression
learner_x_p_lr = BaseXRegressor(learner=LinearRegression())
ate_x_p_lr = learner_x_p_lr.estimate_ate(X=X, treatment=treatment, y=y, p=p)
print('\nUsing the BaseXRegressor class and using Linear Regression:')
print(ate_x_p_lr)

# X Learner without propensity score input
# Calling the Base Learner class and feeding in XGB
learner_x_xgb = BaseXRegressor(XGBRegressor())
ate_x_xgb = learner_x_xgb.estimate_ate(X=X, treatment=treatment, y=y)
print('Using the BaseXRegressor class and using XGB without propensity score input:')
print(ate_x_xgb)

# X Learner without propensity score input
# Calling the Base Learner class and feeding in LinearRegression
learner_x_lr = BaseXRegressor(learner=LinearRegression())
ate_x_lr = learner_x_lr.estimate_ate(X=X, treatment=treatment, y=y)
print('\nUsing the BaseXRegressor class and using Linear Regression without propensity score input:')
print(ate_x_lr)

# R Learner with propensity score input
# Calling the Base Learner class and feeding in XGB
learner_r_p_xgb = BaseRRegressor(learner=XGBRegressor())
ate_r_p_xgb = learner_r_p_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p)
print('Using the BaseRRegressor class and using XGB:')
print(ate_r_p_xgb)

# R Learner with propensity score input
# Calling the Base Learner class and feeding in LinearRegression
learner_r_p_lr = BaseRRegressor(learner=LinearRegression())
ate_r_p_lr = learner_r_p_lr.estimate_ate(X=X, treatment=treatment, y=y, p=p)
print('Using the BaseRRegressor class and using Linear Regression:')
print(ate_r_p_lr)

# R Learner with propensity score input and random sample weight
# Calling the Base Learner class and feeding in XGB
learner_r_pw_xgb = BaseRRegressor(learner=XGBRegressor())
sample_weight = np.random.randint(1, 3, len(y))
ate_r_pw_xgb = learner_r_pw_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p, sample_weight=sample_weight)
print('Using the BaseRRegressor class with random weight and using XGB:')
print(ate_r_pw_xgb)

# R Learner without propensity score input
# Calling the Base Learner class and feeding in XGB
learner_r_xgb = BaseRRegressor(learner=XGBRegressor())
ate_r_xgb = learner_r_xgb.estimate_ate(X=X, treatment=treatment, y=y)
print('Using the BaseRRegressor class and using XGB without propensity score input:')
print(ate_r_xgb)

# Calling the Base Learner class and feeding in LinearRegression
learner_r_lr = BaseRRegressor(learner=LinearRegression())
ate_r_lr = learner_r_lr.estimate_ate(X=X, treatment=treatment, y=y)
print('Using the BaseRRegressor class and using Linear Regression without propensity score input:')
print(ate_r_lr)

Using the S-Learner LinearRegression
(array([0.06477474]), array([0.06185492]), array([0.06769455]))

Using the XGBTRegressor class
(array([0.18294639]), array([0.18202016]), array([0.18387262]))

Using the BaseTRegressor class and using XGB (same result):
(array([0.18294639]), array([0.18202016]), array([0.18387262]))

Using the BaseTRegressor class and using Linear Regression (different result):
(array([0.09409883]), array([0.09269185]), array([0.09550582]))
Using the BaseXRegressor class and using XGB:
(array([0.51276654]), array([0.51179811]), array([0.51373498]))

Using the BaseXRegressor class and using Linear Regression:
(array([0.07960877]), array([0.0782071]), array([0.08101043]))


### 7. Calculate Individual Treatment Effect

In [7]:
# S Learner
learner_s = LRSRegressor()
ite_s = learner_s.fit_predict(X=X, treatment=treatment, y=y)
print("ITE S: ", ite_s)

# T Learner
learner_t = BaseTRegressor(learner=XGBRegressor())
ite_t = learner_t.fit_predict(X=X, treatment=treatment, y=y)
print("ITE T: ", ite_t)

# T Learner feeding in XGB (same as above)
learner_t_xgb = BaseTRegressor(learner=XGBRegressor())
ite_t_xgb = learner_t_xgb.fit_predict(X=X, treatment=treatment, y=y)
print("ITE XGB T: ", ite_t_xgb)

# T Learner LinearRegression
learner_t_lr = BaseTRegressor(learner=LinearRegression())
ite_t_lr= learner_t_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE T RL: ", ite_t_lr)

ITE S:  [[0.06477474]
 [0.06477474]
 [0.06477474]
 ...
 [0.06477474]
 [0.06477474]
 [0.06477474]]
ITE T:  [[0.30053258]
 [0.18866764]
 [0.14396259]
 ...
 [0.00306845]
 [0.00248682]
 [0.00195444]]
ITE XGB T:  [[0.30053258]
 [0.18866764]
 [0.14396259]
 ...
 [0.00306845]
 [0.00248682]
 [0.00195444]]
ITE T RL:  [[0.17752653]
 [0.16730068]
 [0.18100416]
 ...
 [0.00463327]
 [0.01120845]
 [0.02505036]]


In [24]:
# X Learner with propensity score input
learner_x_p = BaseXRegressor(learner=XGBRegressor())
ite_x_p = learner_x_p.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE X: ", ite_x_p)

# X Learner without propensity score input
learner_x = BaseXRegressor(learner=XGBRegressor())
ite_x = learner_x.fit_predict(X=X, treatment=treatment, y=y)
print("ITE X (wihtout p): ", ite_x)

# X Learner LR with propensity score input
learner_x_p_lr = BaseXRegressor(learner=LinearRegression())
ite_x_p_lr = learner_x_p_lr.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE X LR: ", ite_x_p_lr)

# X Learner LR without propensity score input
learner_x_lr = BaseXRegressor(learner=LinearRegression())
ite_x_lr = learner_x_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE X LR (wihtout p): ", ite_x_lr)

ITE X:  [[ 0.04489529]
 [ 0.02423898]
 [-0.00709323]
 ...
 [ 0.93208489]
 [ 0.93209036]
 [ 0.93024249]]
ITE X (wihtout p):  [[ 0.03856069]
 [ 0.02042402]
 [-0.01083504]
 ...
 [ 0.89728747]
 [ 0.89700203]
 [ 0.89538468]]
ITE X LR:  [[0.17752653]
 [0.16730068]
 [0.18100416]
 ...
 [0.00463327]
 [0.01120845]
 [0.02505036]]
ITE X LR (wihtout p):  [[0.17752653]
 [0.16730068]
 [0.18100416]
 ...
 [0.00463327]
 [0.01120845]
 [0.02505036]]


In [7]:
# R Learner with propensity score input 
learner_r = BaseRRegressor(learner=XGBRegressor())
ite_r = learner_r.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE R: ", ite_r)

# R Learner without propensity score input
learner_r_no_p = BaseRRegressor(learner=XGBRegressor())
ite_r_no_p = learner_r_no_p.fit_predict(X=X, treatment=treatment, y=y)
print("ITE R (without p): ", ite_r_no_p)

# R Learner LR with propensity score input
learner_r_p_lr = BaseRRegressor(learner=LinearRegression())
ite_r_p_lr = learner_r_p_lr.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE R LR: ", ite_r_p_lr)

# R Learner with propensity score input and random sample weight
learner_r_pw_xgb = BaseRRegressor(learner=XGBRegressor())
sample_weight = np.random.randint(1, 3, len(y))
ite_r_pw_xgb = learner_r_pw_xgb.fit_predict(X=X, treatment=treatment, y=y, p=p, sample_weight=sample_weight)
print("ITE R (with random weight): ", ite_r_pw_xgb)

# # R Learner LR without propensity score input
# learner_r_lr = BaseRRegressor(learner=LinearRegression())
# ite_r_lr = learner_r_lr.fit_predict(X=X, treatment=treatment, y=y)
# print("ITE R LR (without p): ", ite_r_lr)

ITE R:  [[-0.00456186]
 [-0.00456186]
 [-0.00231892]
 ...
 [-0.00086562]
 [-0.00025799]
 [ 0.00103475]]
ITE R (without p):  [[ 0.09482569]
 [ 0.11773356]
 [ 0.10719296]
 ...
 [-0.00409879]
 [-0.00573711]
 [-0.00405265]]
ITE R LR:  [[0.12940255]
 [0.12646731]
 [0.12844959]
 ...
 [0.0006854 ]
 [0.0058162 ]
 [0.01361836]]
ITE R (with random weight):  [[-0.02148819]
 [-0.02148819]
 [-0.01485084]
 ...
 [-0.02504064]
 [-0.02504064]
 [-0.02160337]]


In [None]:
# R Learner LR without propensity score input
learner_r_lr = BaseRRegressor(learner=LinearRegression())
ite_r_lr = learner_r_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE R LR (without p): ", ite_r_lr)

### Other Methods

In [9]:
nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                 learning_rate_init=.1,
                 early_stopping=True,
                 random_state=42)
te_nn, lb_nn, ub_nn = nn.estimate_ate(X, treatment, y)
print('Average Treatment Effect (Neural Network (MLP)): {:.2f} ({:.2f}, {:.2f})'.format(te_nn[0], lb_nn[0], ub_nn[0]))

Average Treatment Effect (Neural Network (MLP)): 0.14 (0.13, 0.14)


IndexError: invalid index to scalar variable.

In [10]:
print(te_nn[0])

0.13711836145561046


## Input Results in Results Dataframe

In [None]:
# result_list = {'Method': ['S-Learner LR', 'XGBTRegressor', 'BaseTRegressor XGB', 'BaseTRegressor LR', 'BaseXRegressor XGB', 'BaseXRegressor LR',
#                           'BaseXRegressor XGB (without propensity score)','BaseXRegressor LR (without propensity score)', 'BaseRRegressor XGB', 
#                           'BaseRRegressor LR', 'BaseRRegressor XGB (with random weight)', 'BaseRRegressor XGB (without propensity score)',
#                          'BaseRRegressor LR (without propensity score)', 'Neural Network (MLP)'],
#         'ATE': [ate_s[0][0], ate_t[0][0], ate_t_xgb[0][0], ate_t_lr[0][0], ate_x_p_xgb[0][0], ate_x_p_lr[0][0], ate_x_xgb[0][0], ate_x_lr[0][0], ate_r_p_xgb[0][0], ate_r_p_lr[0][0], ate_r_pw_xgb[0][0], ate_r_xgb[0][0], ate_r_lr[0][0], te_nn[0][0]],
#         'ITE': [ite_s, ite_t, ite_t_xgb, ite_t_lr, ite_x_p,  ite_x_p_lr, ite_x, ite_x_lr, ite_r, ite_r_p_lr, ite_r_pw_xgb, ite_r_no_p, ite_r_lr, '']}

In [8]:
list_r = [ite_r, ite_r_p_lr, ite_r_pw_xgb, ite_r_no_p]

In [9]:
y = 1
for i in list_r:

    # Calculate statistics
    data = np.reshape(i, -1)
    minimum = np.min(data)
    first_quartile = np.percentile(data, 25)
    median = np.median(data)
    third_quartile = np.percentile(data, 75)
    maximum = np.max(data)
    
    # Interquartile range (IQR)
    iqr = third_quartile - first_quartile
    
    # Define upper and lower bounds for outliers
    upper_bound = third_quartile + 1.5 * iqr
    lower_bound = first_quartile - 1.5 * iqr
    
    # Detect outliers
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    
    # Print the statistics
    print("Minimum:", minimum)
    print("First Quartile:", first_quartile)
    print("Median:", median)
    print("Third Quartile:", third_quartile)
    print("Maximum:", maximum)
    print("Interquartile Range:", iqr)
    print("Upper Bound (Outliers):", upper_bound)
    print("Lower Bound (Outliers):", lower_bound)
    print("Outliers:", outliers)

    exec(f'boxplot_r_{y} = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]')
    y +=1

Minimum: -2.841172695159912
First Quartile: -0.022177141159772873
Median: -0.0011553335934877396
Third Quartile: 0.022602353245019913
Maximum: 1.593336582183838
Interquartile Range: 0.044779494404792786
Upper Bound (Outliers): 0.08977159485220909
Lower Bound (Outliers): -0.08934638276696205
Outliers: [ 0.09258621  0.09225269  0.09225269 ... -0.14480454 -0.14992988
 -0.14992988]
Minimum: -0.6539629778797732
First Quartile: -0.014001676778572877
Median: 0.02244088467886033
Third Quartile: 0.08991063850769189
Maximum: 0.29162906073547185
Interquartile Range: 0.10391231528626477
Upper Bound (Outliers): 0.24577911143708903
Lower Bound (Outliers): -0.16987014970797004
Outliers: [-0.19172352 -0.19465876 -0.19267647 ... -0.20314153 -0.20943951
 -0.20714403]
Minimum: -3.097541332244873
First Quartile: -0.020540252327919006
Median: -0.0008536885143257678
Third Quartile: 0.021109196357429028
Maximum: 1.5432970523834229
Interquartile Range: 0.041649448685348034
Upper Bound (Outliers): 0.0835833693

In [16]:
result_list_s_t = {'Method': ['S-Learner LR', 'XGBTRegressor', 'BaseTRegressor XGB', 'BaseTRegressor LR'],
        'ATE': [ite_s.mean(), ite_t.mean(), ite_t_xgb.mean(), ite_t_lr.mean()],
        'ITE': [boxplot_1, boxplot_2, boxplot_3, boxplot_4]}

In [28]:
result_list_x = {'Method': ['BaseXRegressor XGB', 'BaseXRegressor LR', 'BaseXRegressor XGB (without propensity score)','BaseXRegressor LR (without propensity score)'],
        'ATE': [ite_x_p.mean(),  ite_x_p_lr.mean(), ite_x.mean(), ite_x_lr.mean()],
        'ITE': [boxplot_x_1,  boxplot_x_2, boxplot_x_3, boxplot_x_4]}

In [10]:
result_list_r = {'Method': ['BaseRRegressor XGB', 'BaseRRegressor LR', 'BaseRRegressor XGB (with random weight)', 'BaseRRegressor XGB (without propensity score)'],
        'ATE': [ite_r.mean(), ite_r_p_lr.mean(), ite_r_pw_xgb.mean(), ite_r_no_p.mean()],
        'ITE': [boxplot_r_1, boxplot_r_2, boxplot_r_3, boxplot_r_4]}

In [11]:
%store -r df_results
lib = "CausalML"
result_list = result_list_r

for i in range(len(result_list['Method'])):
    method = result_list['Method'][i]
    ate = result_list['ATE'][i]
    ite = result_list['ITE'][i]

    if method in df_results['method'].values:
         # If the method is already in the DataFrame, update the ATE and ITE columns
        df_results.loc[df_results['method'] == method, 'ATE'] = ate
        index = df_results[df_results['method'] == method].index[0]
        df_results.at[index, 'ITE'] = ite
        df_results.loc[df_results['method'] == method, 'Library'] = lib
    else:
        # If the method is not in the DataFrame, add a new row
        df_results = df_results._append({'method': method, 'ATE': ate, 'ITE': ite, 'Library': lib}, ignore_index=True)

print(df_results)
%store df_results

                                           method       ATE  \
0                               Linear Regression  0.449046   
1                                       Double ML  0.471019   
2                                             IPW  0.311352   
3                                       IPW Hajek  0.311352   
4                                  IPW Stabalized  0.311352   
5                       Propensity Score Matching -0.179289   
6                               Distance Matching  0.630322   
7                                          IPW LR  0.149171   
8                                       CausalEGM  0.124771   
9                                     Causal Tree  0.034357   
10                                    cforest_mse  0.087602   
11                                   cforest_cmse  0.301697   
12                             cforest_cmse_p=0.5  0.032615   
13                        cforest_cmse_p=0.5_md=3  0.042756   
14                                  cforest_ttest  0.13

In [12]:
%store -r df_results
lib = "CausalML"
result_list = result_list_r_nn

for i in range(len(result_list['Method'])):
    method = result_list['Method'][i]
    ate = result_list['ATE'][i]
    ite = result_list['ITE'][i]

    if method in df_results['method'].values:
         # If the method is already in the DataFrame, update the ATE and ITE columns
        df_results.loc[df_results['method'] == method, 'ATE'] = ate
        df_results.loc[df_results['method'] == method, 'ITE'] = [ite]
        df_results.loc[df_results['method'] == method, 'Library'] = lib
    else:
        # If the method is not in the DataFrame, add a new row
        df_results = df_results._append({'method': method, 'ATE': ate, 'ITE': ite, 'Library': lib}, ignore_index=True)

print(df_results)
%store df_results

                                           method       ATE  \
0                               Linear Regression  0.449046   
1                                       Double ML  0.471019   
2                                             IPW  0.311352   
3                                       IPW Hajek  0.311352   
4                                  IPW Stabalized  0.311352   
5                       Propensity Score Matching -0.179289   
6                               Distance Matching  0.630322   
7                                          IPW LR  0.149171   
8                                       CausalEGM  0.124771   
9                                     Causal Tree  0.034357   
10                                    cforest_mse  0.087602   
11                                   cforest_cmse  0.301697   
12                             cforest_cmse_p=0.5  0.032615   
13                        cforest_cmse_p=0.5_md=3  0.042756   
14                                  cforest_ttest  0.13

## Synthetic Dataset

In [4]:
df_synth = pd.read_csv("synthetic_dataset.csv")
df_synth.head()
synthetic_features = ['NumberOfOffers', 'concept:name',
       'lifecycle:transition', 'time:timestamp', 'elementId', 'resourceId',
       'weekdayApplication', 'timeApplication']
treatment=df_synth['treatment']
X = df_synth[synthetic_features]
y=df_synth['treatmentSuccess']

In [5]:
#propensity model for X- & R-Learner
p_model = ElasticNetPropensityModel()
p = p_model.fit_predict(X, treatment)
print(p)

[0.48723344 0.48719778 0.48721561 ... 0.26391789 0.26389016 0.26390402]


In [7]:
nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                 learning_rate_init=.1,
                 early_stopping=True,
                 random_state=42)
te_nn, lb_nn, ub_nn = nn.estimate_ate(X, treatment, y)
print('Average Treatment Effect (Neural Network (MLP)): {:.2f} ({:.2f}, {:.2f})'.format(te_nn[0], lb_nn[0], ub_nn[0]))
# S Learner
learner_s = LRSRegressor()
ite_s = learner_s.fit_predict(X=X, treatment=treatment, y=y)
print("ITE S: ", ite_s)

# T Learner
learner_t = BaseTRegressor(learner=XGBRegressor())
ite_t = learner_t.fit_predict(X=X, treatment=treatment, y=y)
print("ITE T: ", ite_t)

# T Learner feeding in XGB (same as above)
learner_t_xgb = BaseTRegressor(learner=XGBRegressor())
ite_t_xgb = learner_t_xgb.fit_predict(X=X, treatment=treatment, y=y)
print("ITE XGB T: ", ite_t_xgb)

# T Learner LinearRegression
learner_t_lr = BaseTRegressor(learner=LinearRegression())
ite_t_lr= learner_t_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE T RL: ", ite_t_lr)
# X Learner with propensity score input
learner_x_p = BaseXRegressor(learner=XGBRegressor())
ite_x_p = learner_x_p.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE X: ", ite_x_p)

# X Learner without propensity score input
learner_x = BaseXRegressor(learner=XGBRegressor())
ite_x = learner_x.fit_predict(X=X, treatment=treatment, y=y)
print("ITE X (wihtout p): ", ite_x)

# X Learner LR with propensity score input
learner_x_p_lr = BaseXRegressor(learner=LinearRegression())
ite_x_p_lr = learner_x_p_lr.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE X LR: ", ite_x_p_lr)

# X Learner LR without propensity score input
learner_x_lr = BaseXRegressor(learner=LinearRegression())
ite_x_lr = learner_x_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE X LR (wihtout p): ", ite_x_lr)
# R Learner with propensity score input 
learner_r = BaseRRegressor(learner=XGBRegressor())
ite_r = learner_r.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE R: ", ite_r)

# R Learner without propensity score input
learner_r_no_p = BaseRRegressor(learner=XGBRegressor())
ite_r_no_p = learner_r_no_p.fit_predict(X=X, treatment=treatment, y=y)
print("ITE R (without p): ", ite_r_no_p)

# R Learner LR with propensity score input
learner_r_p_lr = BaseRRegressor(learner=LinearRegression())
ite_r_p_lr = learner_r_p_lr.fit_predict(X=X, treatment=treatment, y=y, p=p)
print("ITE R LR: ", ite_r_p_lr)

# R Learner with propensity score input and random sample weight
learner_r_pw_xgb = BaseRRegressor(learner=XGBRegressor())
sample_weight = np.random.randint(1, 3, len(y))
ite_r_pw_xgb = learner_r_pw_xgb.fit_predict(X=X, treatment=treatment, y=y, p=p, sample_weight=sample_weight)
print("ITE R (with random weight): ", ite_r_pw_xgb)

# R Learner LR without propensity score input
learner_r_lr = BaseRRegressor(learner=LinearRegression())
ite_r_lr = learner_r_lr.fit_predict(X=X, treatment=treatment, y=y)
print("ITE R LR (without p): ", ite_r_lr)

Average Treatment Effect (Neural Network (MLP)): 2.00 (2.00, 2.00)
ITE S:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE T:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE XGB T:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE T RL:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE X:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE X (wihtout p):  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE X LR:  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE X LR (wihtout p):  [[2.]
 [2.]
 [2.]
 ...
 [2.]
 [2.]
 [2.]]
ITE R:  [[3.62236483e-06]
 [3.62236483e-06]
 [3.62236483e-06]
 ...
 [3.62236483e-06]
 [3.62236483e-06]
 [3.62236483e-06]]
ITE R (without p):  [[3.68120891e-06]
 [3.68120891e-06]
 [3.68120891e-06]
 ...
 [3.68120891e-06]
 [3.68120891e-06]
 [3.68120891e-06]]
ITE R LR:  [[0.56567101]
 [0.56207732]
 [0.56387416]
 ...
 [1.48490208]
 [1.48130838]
 [1.48310523]]
ITE R (with random weight):  [[3.62227593e-06]
 [3.62227593e-06]
 [3.62227593e-06]
 ...
 [3.62227593e-06]
 [3.62227593e-06]
 [3.62227593e-0

In [10]:
import evaluation_metrics
synth_ite_list = [ite_s, ite_t, ite_t_xgb, ite_t_lr, ite_x_p,  ite_x_p_lr, ite_x, ite_x_lr, ite_r, ite_r_p_lr, ite_r_pw_xgb, ite_r_no_p]
y=1
synth_true_ate = 2
for i in synth_ite_list:
    boxplot = evaluation_metrics.boxplot_ite(i)
    exec(f'boxplot_synth_{y} = boxplot')
    metric = evaluation_metrics.evaluation_metrics(synth_true_ate, i)
    exec(f'metric_synth_{y} = metric')
    y +=1

In [7]:
synth_ite_list = [ite_s, ite_t, ite_t_xgb, ite_t_lr, ite_x_p,  ite_x_p_lr, ite_x, ite_x_lr, ite_r, ite_r_p_lr, ite_r_pw_xgb, ite_r_no_p]
y = 1
for i in synth_ite_list:

    # Calculate statistics
    data = np.reshape(i, -1)
    minimum = np.min(data)
    first_quartile = np.percentile(data, 25)
    median = np.median(data)
    third_quartile = np.percentile(data, 75)
    maximum = np.max(data)
    
    # Interquartile range (IQR)
    iqr = third_quartile - first_quartile
    
    # Define upper and lower bounds for outliers
    upper_bound = third_quartile + 1.5 * iqr
    lower_bound = first_quartile - 1.5 * iqr
    
    # Detect outliers
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    
    # Print the statistics
    print("Minimum:", minimum)
    print("First Quartile:", first_quartile)
    print("Median:", median)
    print("Third Quartile:", third_quartile)
    print("Maximum:", maximum)
    print("Interquartile Range:", iqr)
    print("Upper Bound (Outliers):", upper_bound)
    print("Lower Bound (Outliers):", lower_bound)
    print("Outliers:", outliers)

    exec(f'boxplot_synth_{y} = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]')
    y +=1

Minimum: 1.999999999999998
First Quartile: 1.9999999999999987
Median: 1.9999999999999987
Third Quartile: 1.999999999999999
Maximum: 1.9999999999999993
Interquartile Range: 2.220446049250313e-16
Upper Bound (Outliers): 1.9999999999999991
Lower Bound (Outliers): 1.9999999999999982
Outliers: [2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2.
 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2. 2

In [11]:
synth_result_list = {'Method': ['S-Learner LR', 'XGBTRegressor', 'BaseTRegressor XGB', 'BaseTRegressor LR', 'BaseXRegressor XGB', 'BaseXRegressor LR',
                          'BaseXRegressor XGB (without propensity score)','BaseXRegressor LR (without propensity score)', 'BaseRRegressor XGB', 
                          'BaseRRegressor LR', 'BaseRRegressor XGB (with random weight)', 'BaseRRegressor XGB (without propensity score)',
                         'Neural Network (MLP)'],
        'ATE': [ite_s.mean(), ite_t.mean(), ite_t_xgb.mean(), ite_t_lr.mean(), ite_x_p.mean(),  ite_x_p_lr.mean(), ite_x.mean(), ite_x_lr.mean(), ite_r.mean(), ite_r_p_lr.mean(), ite_r_pw_xgb.mean(), ite_r_no_p.mean(), te_nn[0]],
        'ITE': [boxplot_synth_1, boxplot_synth_2, boxplot_synth_3, boxplot_synth_4, boxplot_synth_5,  boxplot_synth_6, boxplot_synth_7, boxplot_synth_8, boxplot_synth_9, boxplot_synth_10, boxplot_synth_11, boxplot_synth_12, ''],
        'Metric': [metric_synth_1, metric_synth_2, metric_synth_3, metric_synth_4, metric_synth_5, metric_synth_6, metric_synth_7, metric_synth_8, metric_synth_9, metric_synth_10, metric_synth_11, metric_synth_12, '']}

In [12]:
%store -r df_synthetic_results_metric
result_list = synth_result_list

for i in range(len(result_list['Method'])):
    method = result_list['Method'][i]
    ate = result_list['ATE'][i]
    ite = result_list['ITE'][i]
    metric = result_list['Metric'][i]

    df_synthetic_results_metric = df_synthetic_results_metric._append({'method': method, 'ATE': ate, 'ITE': ite, 'metrics': metric}, ignore_index=True)

print(df_synthetic_results_metric)
%store df_synthetic_results_metric

                                           method  \
0                                    S-Learner LR   
1                                   XGBTRegressor   
2                              BaseTRegressor XGB   
3                               BaseTRegressor LR   
4                              BaseXRegressor XGB   
5                               BaseXRegressor LR   
6   BaseXRegressor XGB (without propensity score)   
7    BaseXRegressor LR (without propensity score)   
8                              BaseRRegressor XGB   
9                               BaseRRegressor LR   
10        BaseRRegressor XGB (with random weight)   
11  BaseRRegressor XGB (without propensity score)   
12                           Neural Network (MLP)   

                                                  ITE       ATE  \
0   [1.999999999999998, 1.9999999999999987, 1.9999...  2.000000   
1            [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]  2.000000   
2            [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]  2

## Refutation Test

In [3]:
import os
# Define folder path
folder_path = "./evaluationDatasets/Subset/"

# List to store treatment effects
ate_values = []

feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

columns_to_drop = ['offerSuccess', 'treatmentOffer']

# Iterate through files in the folder
for file_name in os.listdir(folder_path):
    print(file_name)
    # Read CSV file
    file_path = os.path.join(folder_path, file_name)
    refutation = pd.read_csv(file_path)

    #X = refutation[feature_names]
    X = refutation.drop(columns=columns_to_drop)    
    y = refutation['offerSuccess']
    treatment = refutation['treatmentOffer']
    
    nn = MLPTRegressor(hidden_layer_sizes=(10, 10),
                 learning_rate_init=.1,
                 early_stopping=True,
                 random_state=42)
    te_nn, lb_nn, ub_nn = nn.estimate_ate(X, treatment, y)
    print('Average Treatment Effect: ', te_nn[0])
    ate_values.append(te_nn[0])

print("Average ATE: ",ate_values)

addRandomCauseDataset2.csv
Average Treatment Effect:  0.019022724341103563
addRandomCauseDataset1.csv
Average Treatment Effect:  -0.003896047220189547
Average ATE:  [0.019022724341103563, -0.003896047220189547]


In [4]:
import os
# Define folder path
folder_path = "./evaluationDatasets/Cause/"

# List to store treatment effects
ate_r_pw_xgb_values = []
ate_r_p_lr_values = []
ate_r_p_xgb_values = []
ate_x_p_lr_values = []
ate_x_p_xgb_values = []

feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

columns_to_drop = ['offerSuccess', 'treatmentOffer']

# Iterate through files in the folder
for file_name in os.listdir(folder_path):
    print(file_name)
    # Read CSV file
    file_path = os.path.join(folder_path, file_name)
    refutation = pd.read_csv(file_path)

    #X = refutation[feature_names]
    X = refutation.drop(columns=columns_to_drop)    
    y = refutation['offerSuccess']
    treatment = refutation['treatmentOffer']

    p = np.load('propensity-score-MetaLearners.npy')

    # X Learner with propensity score input
    # Calling the Base Learner class and feeding in XGB
    learner_x_p_xgb = BaseXRegressor(learner=XGBRegressor())
    ate_x_p_xgb = learner_x_p_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p)
    print('Using the BaseXRegressor class and using XGB:')
    print(ate_x_p_xgb[0][0])
    ate_x_p_xgb_values.append(ate_x_p_xgb[0][0])
    
    # X Learner with propensity score input
    # Calling the Base Learner class and feeding in LinearRegression
    learner_x_p_lr = BaseXRegressor(learner=LinearRegression())
    ate_x_p_lr = learner_x_p_lr.estimate_ate(X=X, treatment=treatment, y=y, p=p)
    print('\nUsing the BaseXRegressor class and using Linear Regression:')
    print(ate_x_p_lr[0][0])
    ate_x_p_lr_values.append(ate_x_p_lr[0][0])
    
    # R Learner with propensity score input
    # Calling the Base Learner class and feeding in XGB
    learner_r_p_xgb = BaseRRegressor(learner=XGBRegressor())
    ate_r_p_xgb = learner_r_p_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p)
    print('Using the BaseRRegressor class and using XGB:')
    print(ate_r_p_xgb[0][0])
    ate_r_p_xgb_values.append(ate_r_p_xgb[0][0])
    
    # R Learner with propensity score input
    # Calling the Base Learner class and feeding in LinearRegression
    learner_r_p_lr = BaseRRegressor(learner=LinearRegression())
    ate_r_p_lr = learner_r_p_lr.estimate_ate(X=X, treatment=treatment, y=y, p=p)
    print('Using the BaseRRegressor class and using Linear Regression:')
    print(ate_r_p_lr[0][0])
    ate_r_p_lr_values.append(ate_r_p_lr[0][0])
    
    # R Learner with propensity score input and random sample weight
    # Calling the Base Learner class and feeding in XGB
    learner_r_pw_xgb = BaseRRegressor(learner=XGBRegressor())
    sample_weight = np.random.randint(1, 3, len(y))
    ate_r_pw_xgb = learner_r_pw_xgb.estimate_ate(X=X, treatment=treatment, y=y, p=p, sample_weight=sample_weight)
    print('Using the BaseRRegressor class with random weight and using XGB:')
    print(ate_r_pw_xgb[0][0])
    ate_r_pw_xgb_values.append(ate_r_pw_xgb[0][0])

print("Average ATE")
print(sum(ate_x_p_xgb_values) / len(ate_x_p_xgb_values))
print(sum(ate_x_p_lr_values) / len(ate_x_p_lr_values))
print(sum(ate_r_p_xgb_values) / len(ate_r_p_xgb_values))
print(sum(ate_r_p_lr_values) / len(ate_r_p_lr_values))
print(sum(ate_r_pw_xgb_values) / len(ate_r_pw_xgb_values))

randomReplacedDataset2.csv
Using the BaseXRegressor class and using XGB:
0.5084008364086656

Using the BaseXRegressor class and using Linear Regression:
0.0797248317521689
Using the BaseRRegressor class and using XGB:
0.0021986893459338547
Using the BaseRRegressor class and using Linear Regression:
0.03122860087423773
Using the BaseRRegressor class with random weight and using XGB:
0.002141273791038801
randomReplacedDataset3.csv
Using the BaseXRegressor class and using XGB:
0.5130356657056437

Using the BaseXRegressor class and using Linear Regression:
0.07901551188124097
Using the BaseRRegressor class and using XGB:
0.002713471899893684
Using the BaseRRegressor class and using Linear Regression:
0.030935816815594788
Using the BaseRRegressor class with random weight and using XGB:
0.0017583169797670164
randomReplacedDataset7.csv
Using the BaseXRegressor class and using XGB:
0.5110942379859394

Using the BaseXRegressor class and using Linear Regression:
0.07956531067948439
Using the Bas

In [None]:
import os
# Define folder path
folder_path = "./evaluationDatasets/Replace/"

# List to store treatment effects
ate_r_xgb_values = []
ate_x_lr_values = []
ate_x_xgb_values = []
ate_t_lr_values = []
ate_t_xgb_values = []
ate_t_values = []
ate_s_values = []

feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

columns_to_drop = ['offerSuccess', 'treatmentOffer']

# Iterate through files in the folder
for file_name in os.listdir(folder_path):
    print(file_name)
    # Read CSV file
    file_path = os.path.join(folder_path, file_name)
    refutation = pd.read_csv(file_path)

    #X = refutation[feature_names]
    X = refutation.drop(columns=columns_to_drop)    
    y = refutation['offerSuccess']
    treatment = refutation['treatmentOffer']

    # Ready-to-use S-Learner using LinearRegression
    learner_s = LRSRegressor()
    ate_s = learner_s.estimate_ate(X=X, treatment=treatment, y=y)
    print('Using the S-Learner LinearRegression')
    print(ate_s[0][0])
    ate_s_values.append(ate_s[0][0])
    
    # Ready-to-use T-Learner using XGB
    learner_t = XGBTRegressor()
    ate_t = learner_t.estimate_ate(X=X, treatment=treatment, y=y)
    print('\nUsing the XGBTRegressor class')
    print(ate_t[0][0])
    ate_t_values.append(ate_t[0][0])
    
    # Calling the Base Learner class and feeding in LinearRegression
    learner_t_lr = BaseTRegressor(learner=LinearRegression())
    ate_t_lr= learner_t_lr.estimate_ate(X=X, treatment=treatment, y=y)
    print('\nUsing the BaseTRegressor class and using Linear Regression (different result):')
    print(ate_t_lr[0][0])
    ate_t_lr_values.append(ate_t_lr[0][0])
    
    # X Learner without propensity score input
    # Calling the Base Learner class and feeding in XGB
    learner_x_xgb = BaseXRegressor(XGBRegressor())
    ate_x_xgb = learner_x_xgb.estimate_ate(X=X, treatment=treatment, y=y)
    print('Using the BaseXRegressor class and using XGB without propensity score input:')
    print(ate_x_xgb[0][0])
    ate_x_xgb_values.append(ate_x_xgb[0][0])
    
    # # X Learner without propensity score input
    # # Calling the Base Learner class and feeding in LinearRegression
    # learner_x_lr = BaseXRegressor(learner=LinearRegression())
    # ate_x_lr = learner_x_lr.estimate_ate(X=X, treatment=treatment, y=y)
    # print('\nUsing the BaseXRegressor class and using Linear Regression without propensity score input:')
    # print(ate_x_lr[0][0])
    # ate_x_lr_values.append(ate_x_lr[0][0])
    
    # # R Learner without propensity score input
    # # Calling the Base Learner class and feeding in XGB
    # learner_r_xgb = BaseRRegressor(learner=XGBRegressor())
    # ate_r_xgb = learner_r_xgb.estimate_ate(X=X, treatment=treatment, y=y)
    # print('Using the BaseRRegressor class and using XGB without propensity score input:')
    # print(ate_r_xgb[0][0])
    # ate_r_xgb_values.append(ate_r_xgb[0][0])
    

print("Average ATE: ")
print(sum(ate_r_xgb_values) / len(ate_r_xgb_values))
print(sum(ate_x_lr_values) / len(ate_x_lr_values))
print(sum(ate_x_xgb_values) / len(ate_x_xgb_values))
print(sum(ate_t_lr_values) / len(ate_t_lr_values))
print(sum(ate_t_values) / len(ate_t_values))
print(sum(ate_s_values) / len(ate_s_values))

randomReplacedDataset2.csv
Using the S-Learner LinearRegression
0.06480794448195776

Using the XGBTRegressor class
0.1825832869111424

Using the BaseTRegressor class and using Linear Regression (different result):
0.09420411477983939
