In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
from collections import defaultdict

np.random.seed(42)

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [2]:
import causalml
#from causalml.metrics import plot_gain, plot_qini, qini_score
#from causalml.inference.tree import plot_dist_tree_leaves_values, get_tree_leaves_mask
from causalml.inference.meta import BaseSRegressor, BaseXRegressor, BaseTRegressor, BaseDRRegressor
from causalml.inference.tree import CausalRandomForestRegressor
from causalml.inference.tree import CausalTreeRegressor
from causalml.inference.tree import UpliftTreeClassifier, UpliftRandomForestClassifier
from causalml.inference.tree import uplift_tree_string, uplift_tree_plot

Failed to import duecredit due to No module named 'duecredit'


In [3]:
df = pd.read_csv("bpi2017_final.csv")
print(df.columns)
df.head()

Index(['case:concept:name', 'NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'treatedCase',
       'caseSuccesful', 'treatmentSuccess', 'offerNumber', 'offerSuccess',
       'treatmentOffer', 'timeApplication', 'weekdayApplication'],
      dtype='object')


Unnamed: 0,case:concept:name,NumberOfOffers,Action,org:resource,concept:name,EventOrigin,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,CreditScore,OfferedAmount,treatedCase,caseSuccesful,treatmentSuccess,offerNumber,offerSuccess,treatmentOffer,timeApplication,weekdayApplication
0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,651433.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1,0.0,1.0,4.0,0.0,8.0,0.0,1.0,651434.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.061,2.0
2,0.0,1.0,0.0,0.0,22.0,2.0,3.0,651435.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.29,2.0
3,0.0,1.0,1.0,0.0,22.0,2.0,6.0,651437.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.613,2.0
4,0.0,1.0,0.0,0.0,21.0,2.0,3.0,651438.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.62,2.0


In [4]:
# Look at the conversion rate and sample size in each group
df.pivot_table(values='offerSuccess',
               index='treatmentOffer',
               aggfunc=[np.mean, np.size],
               margins=True)

Unnamed: 0_level_0,mean,size
Unnamed: 0_level_1,offerSuccess,offerSuccess
treatmentOffer,Unnamed: 1_level_2,Unnamed: 2_level_2
0.0,0.492801,1021127
1.0,0.630322,177192
All,0.513135,1198319


In [5]:
#new empty column treatment_effect
df['treatment_effect'] = 0

In [6]:
# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=11101)

In [7]:
feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

In [8]:
# Table to gather estimated ITEs by causal forest models
df_cforest = pd.DataFrame({
    'outcome': df_test['offerSuccess'],
    'is_treated': df_test['treatmentOffer'],
    'treatment_effect': df_test['treatment_effect']
})

### CausalRandomForestRegressor

In [3]:
cforests = {
    'cforest_mse': {
        'params':
        dict(criterion='standard_mse',
             control_name=0,
             min_impurity_decrease=0,
             min_samples_leaf=400,
             groups_penalty=0.,
             groups_cnt=True),
    },
    'cforest_cmse': {
        'params':
        dict(
            criterion='causal_mse',
            control_name=0,
            min_samples_leaf=400,
            groups_penalty=0.,
            groups_cnt=True
        ),
    },
    'cforest_cmse_p=0.5': {
        'params':
        dict(
            criterion='causal_mse',
            control_name=0,
            min_samples_leaf=400,
            groups_penalty=0.5,
            groups_cnt=True,
        ),
    },
    'cforest_cmse_p=0.5_md=3': {
        'params':
        dict(
            criterion='causal_mse',
            control_name=0,
            max_depth=3,
            min_samples_leaf=400,
            groups_penalty=0.5,
            groups_cnt=True,
        ),
    },
    'cforest_ttest': {
        'params':
        dict(criterion='t_test',
             control_name=0,
             min_samples_leaf=400,
             groups_penalty=0.,
             groups_cnt=True),
    },
}

In [10]:
# Model treatment effect
for cforest_name, cforest_info in cforests.items():
    print(f"Fitting: {cforest_name}")
    cforest = CausalRandomForestRegressor(**cforest_info['params'])
    cforest.fit(X=df_train[feature_names].values,
              y=df_train['offerSuccess'].values, 
                treatment=df_train['treatmentOffer'].values)
    cforests[cforest_name].update({'model': cforest})
    df_cforest[cforest_name] = cforest.predict(df_test[feature_names].values)

Fitting: cforest_mse
Fitting: cforest_cmse
Fitting: cforest_cmse_p=0.5
Fitting: cforest_cmse_p=0.5_md=3
Fitting: cforest_ttest


In [14]:
df_cforest

Unnamed: 0,outcome,is_treated,treatment_effect,cforest_mse,cforest_cmse,cforest_cmse_p=0.5,cforest_cmse_p=0.5_md=3,cforest_ttest
862609,1.0,0.0,0,0.009599,0.533586,-0.032120,-0.070166,0.023895
1158138,0.0,0.0,0,0.155407,0.252577,-0.032120,-0.070166,0.124085
603398,0.0,0.0,0,0.155407,0.240823,-0.032120,-0.070166,0.037601
780790,0.0,0.0,0,0.155407,0.258056,-0.061270,-0.070166,0.209644
531743,1.0,0.0,0,0.009599,-0.107408,-0.002474,0.008442,0.048817
...,...,...,...,...,...,...,...,...
921849,0.0,0.0,0,0.155407,0.444048,-0.046021,-0.013725,-0.114005
189652,1.0,0.0,0,0.009599,0.533586,0.073335,0.073439,0.086531
914548,0.0,0.0,0,0.155407,0.533586,0.074840,0.073439,0.297262
238437,1.0,0.0,0,0.009599,0.052540,-0.019629,-0.013725,0.025638


### Causal Tree

In [9]:
ctree = CausalTreeRegressor()
ctree.fit(X=df_train[feature_names].values, y=df_train['offerSuccess'].values, treatment=df_train['treatmentOffer'].values)

In [10]:
%%time
prediction = ctree.predict(df_test[feature_names].values)
print(prediction)

[ 0.01388495  0.01388495  0.01388495 ...  0.18022429 -0.02789474
  0.26546003]
CPU times: user 45.8 ms, sys: 0 ns, total: 45.8 ms
Wall time: 88.8 ms


In [None]:
y = 1

# Calculate statistics
data = np.reshape(prediction, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Print the statistics
print("Minimum:", minimum)
print("First Quartile:", first_quartile)
print("Median:", median)
print("Third Quartile:", third_quartile)
print("Maximum:", maximum)
print("Interquartile Range:", iqr)
print("Upper Bound (Outliers):", upper_bound)
print("Lower Bound (Outliers):", lower_bound)
print("Outliers:", outliers)

ite_tree = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]

In [16]:
print(ite_tree)

[-0.7879656160458453, -0.043494462980468274, 0.027727546714888418, 0.11155913978494625, 0.9245283018867925, 0.15505360276541452, 0.344139543933068, -0.2760748671285901]


In [15]:
#input random forest results in df
%store -r df_results
lib = "CausalML"
method = "Causal Tree"
ite = ite_tree
ate = prediction.mean()

if method in df_results['method'].values:
     # If the method is already in the DataFrame, update the ATE and ITE columns
    df_results.loc[df_results['method'] == method, 'ATE'] = ate
    index = df_results[df_results['method'] == method].index[0]
    df_results.at[index, 'ITE'] = ite
else:
    # If the method is not in the DataFrame, add a new row
    df_results = df_results._append({'method': method, 'ATE': ate, 'ITE': ite, 'Library': lib}, ignore_index=True)



print(df_results)
%store df_results

                                           method       ATE  \
0                               Linear Regression  0.449046   
1                                       Double ML  0.471019   
2                                             IPW  0.311352   
3                                       IPW Hajek  0.311352   
4                                  IPW Stabalized  0.311352   
5                       Propensity Score Matching -0.179289   
6                               Distance Matching  0.630322   
7                                          IPW LR  0.149171   
8                                       CausalEGM  0.124771   
9                                     Causal Tree  0.034357   
10                                    cforest_mse  0.087602   
11                                   cforest_cmse  0.301697   
12                             cforest_cmse_p=0.5  0.032615   
13                        cforest_cmse_p=0.5_md=3  0.042756   
14                                  cforest_ttest  0.13

### Result in result df
Das sind die ITE, also der Durchschnitt ist dann ATE

In [17]:
y = 1

for i in range(4, len(df_cforest.columns)):

    # Calculate statistics
    data = np.reshape(df_cforest.iloc[:, i], -1)
    minimum = np.min(data)
    first_quartile = np.percentile(data, 25)
    median = np.median(data)
    third_quartile = np.percentile(data, 75)
    maximum = np.max(data)
    
    # Interquartile range (IQR)
    iqr = third_quartile - first_quartile
    
    # Define upper and lower bounds for outliers
    upper_bound = third_quartile + 1.5 * iqr
    lower_bound = first_quartile - 1.5 * iqr
    
    # Detect outliers
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    
    # Print the statistics
    print("Minimum:", minimum)
    print("First Quartile:", first_quartile)
    print("Median:", median)
    print("Third Quartile:", third_quartile)
    print("Maximum:", maximum)
    print("Interquartile Range:", iqr)
    print("Upper Bound (Outliers):", upper_bound)
    print("Lower Bound (Outliers):", lower_bound)
    print("Outliers:", outliers)
    
    exec(f'boxplot_{y} = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]')
    y +=1

Minimum: -0.5945348148740174
First Quartile: 0.10957277598486484
Median: 0.2953638085517083
Third Quartile: 0.48510489867500367
Maximum: 0.9415649910897447
Interquartile Range: 0.3755321226901388
Upper Bound (Outliers): 1.048403082710212
Lower Bound (Outliers): -0.45372540805034345
Outliers: [-0.59453481 -0.47291844 -0.59453481 -0.59453481 -0.59453481 -0.59453481
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481
 -0.59453481 -0.59453481 -0.59453481 -0.47291844 -0.47291844 -0.47291844
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.47291844
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481
 -0.59453481 -0.47291844 -0.59453481 -0.47291844 -0.59453481 -0.47291844
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.47291844
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -0.59453481
 -0.59453481 -0.59453481 -0.59453481 -0.59453481 -

In [18]:
list_cforest_ite = [boxplot_1, boxplot_2, boxplot_3, boxplot_4, boxplot_5]
list_cforest_ite

[[-0.5945348148740174,
  0.10957277598486484,
  0.2953638085517083,
  0.48510489867500367,
  0.9415649910897447,
  0.3755321226901388,
  1.048403082710212,
  -0.45372540805034345],
 [-0.13110388553923152,
  -0.00950147890263294,
  0.030715912622950002,
  0.07381461280358302,
  0.3622874558132804,
  0.08331609170621596,
  0.19878875036290694,
  -0.13447561646195688],
 [-0.07016614309124898,
  0.008441563662790573,
  0.042050657499204745,
  0.07343896302933466,
  0.1578821278976667,
  0.06499739936654408,
  0.17093506207915077,
  -0.08905453538702555],
 [-0.3886643569251879,
  0.03914733450821808,
  0.0889671251618295,
  0.19288772958493325,
  0.9152243857670506,
  0.15374039507671516,
  0.423498322200006,
  -0.19146325810685466],
 [-0.3886643569251879,
  0.03914733450821808,
  0.0889671251618295,
  0.19288772958493325,
  0.9152243857670506,
  0.15374039507671516,
  0.423498322200006,
  -0.19146325810685466]]

In [21]:
%store -r df_results
lib = "CausalML"
ite_list = list_cforest_ite
j = 0

for i in range(3, len(df_cforest.columns)):
    method = df_cforest.columns[i]
    ite = ite_list[j]
    ate = df_cforest.iloc[:, i].mean()

    if method in df_results['method'].values:
         # If the method is already in the DataFrame, update the ATE and ITE columns
        df_results.loc[df_results['method'] == method, 'ATE'] = ate
        index = df_results[df_results['method'] == method].index[0]
        df_results.at[index, 'ITE'] = ite
    else:
        # If the method is not in the DataFrame, add a new row
        df_results = df_results._append({'method': method, 'ATE': ate, 'ITE': ite, 'Library': lib}, ignore_index=True)
    j +=1


print(df_results)
%store df_results

                                           method       ATE  \
0                               Linear Regression  0.449046   
1                                       Double ML  0.471019   
2                                             IPW  0.311352   
3                                       IPW Hajek  0.311352   
4                                  IPW Stabalized  0.311352   
5                       Propensity Score Matching -0.179289   
6                               Distance Matching  0.630322   
7                                          IPW LR  0.149171   
8                                       CausalEGM  0.124771   
9                                     Causal Tree  0.034357   
10                                    cforest_mse  0.087938   
11                                   cforest_cmse  0.301958   
12                             cforest_cmse_p=0.5  0.033445   
13                        cforest_cmse_p=0.5_md=3  0.042922   
14                                  cforest_ttest  0.13

### Synthetic Dataset

In [22]:
df_synth = pd.read_csv("synthetic_dataset.csv")
print(df_synth.columns)
df_synth.head()

Index(['case:concept:name', 'NumberOfOffers', 'concept:name',
       'lifecycle:transition', 'time:timestamp', 'elementId', 'resourceId',
       'treatment', 'successful', 'treatmentSuccess', 'weekdayApplication',
       'timeApplication'],
      dtype='object')


Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess,weekdayApplication,timeApplication
0,0.0,2.0,29.0,0.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
1,0.0,2.0,29.0,2.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
2,0.0,2.0,29.0,1.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
3,0.0,2.0,5.0,0.0,152.0,7.0,1.0,1.0,1.0,2.0,1.0,0.0
4,0.0,2.0,5.0,2.0,152.0,7.0,0.0,1.0,1.0,2.0,1.0,0.0


In [23]:
synthetic_features = ['NumberOfOffers', 'concept:name',
       'lifecycle:transition', 'time:timestamp', 'elementId', 'resourceId',
       'weekdayApplication', 'timeApplication']

In [24]:
#new empty column treatment_effect
df_synth['treatment_effect'] = 0

In [25]:
# Split data to training and testing samples for model validation (next section)
df_synth_train, df_synth_test = train_test_split(df_synth, test_size=0.2, random_state=11101)

In [26]:
# Table to gather estimated ITEs by causal forest models
df_synth_cforest = pd.DataFrame({
    'outcome': df_synth_test['treatmentSuccess'],
    'is_treated': df_synth_test['treatment'],
    'treatment_effect': df_synth_test['treatment_effect']
})

In [28]:
# Model treatment effect
cforests_synth = cforests
for cforest_name, cforest_info in cforests_synth.items():
    print(f"Fitting: {cforest_name}")
    cforest_synth = CausalRandomForestRegressor(**cforest_info['params'])
    cforest_synth.fit(X=df_synth_train[synthetic_features].values,
              y=df_synth_train['treatmentSuccess'].values, 
                treatment=df_synth_train['treatment'].values)
    cforests_synth[cforest_name].update({'model': cforest_synth})
    df_synth_cforest[cforest_name] = cforest_synth.predict(df_synth_test[synthetic_features].values)

Fitting: cforest_mse
Fitting: cforest_cmse
Fitting: cforest_cmse_p=0.5
Fitting: cforest_cmse_p=0.5_md=3
Fitting: cforest_ttest


In [29]:
df_synth_cforest

Unnamed: 0,outcome,is_treated,treatment_effect,cforest_mse,cforest_cmse,cforest_cmse_p=0.5,cforest_cmse_p=0.5_md=3,cforest_ttest
11513,0.0,0.0,0,,2.0,2.0,2.0,2.0
191925,2.0,1.0,0,,2.0,2.0,2.0,2.0
77381,2.0,1.0,0,,2.0,2.0,2.0,2.0
48249,2.0,1.0,0,,2.0,2.0,2.0,2.0
123123,0.0,0.0,0,,2.0,2.0,2.0,2.0
...,...,...,...,...,...,...,...,...
46381,0.0,0.0,0,,2.0,2.0,2.0,2.0
146618,0.0,0.0,0,,2.0,2.0,2.0,2.0
174779,0.0,0.0,0,,2.0,2.0,2.0,2.0
220674,0.0,0.0,0,,2.0,2.0,2.0,2.0


In [30]:
ctree = CausalTreeRegressor()
ctree.fit(X=df_synth_train[synthetic_features].values, y=df_synth_train['treatmentSuccess'].values, treatment=df_synth_train['treatment'].values)

In [31]:
%%time
prediction = ctree.predict(df_synth_test[synthetic_features].values)
print(prediction)

[2. 2. 2. ... 2. 2. 2.]
CPU times: user 7.35 ms, sys: 998 µs, total: 8.35 ms
Wall time: 23.2 ms


In [33]:
y = 1

# Calculate statistics
data = np.reshape(prediction, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Print the statistics
print("Minimum:", minimum)
print("First Quartile:", first_quartile)
print("Median:", median)
print("Third Quartile:", third_quartile)
print("Maximum:", maximum)
print("Interquartile Range:", iqr)
print("Upper Bound (Outliers):", upper_bound)
print("Lower Bound (Outliers):", lower_bound)
print("Outliers:", outliers)

ite_synth_tree = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]

Minimum: 2.0
First Quartile: 2.0
Median: 2.0
Third Quartile: 2.0
Maximum: 2.0
Interquartile Range: 0.0
Upper Bound (Outliers): 2.0
Lower Bound (Outliers): 2.0
Outliers: []


In [34]:
#input random forest results in df
%store -r df_synthetic_results
method = "Causal Tree"
ite = ite_synth_tree
ate = prediction.mean()

df_synthetic_results = df_synthetic_results._append({'method': method, 'ATE': ate, 'ITE': ite}, ignore_index=True)

%store df_synthetic_results

Stored 'df_synthetic_results' (DataFrame)


In [37]:
y = 1
for i in range(4, len(df_synth_cforest.columns)):

    # Calculate statistics
    data = np.reshape(df_synth_cforest.iloc[:, i], -1)
    minimum = np.min(data)
    first_quartile = np.percentile(data, 25)
    median = np.median(data)
    third_quartile = np.percentile(data, 75)
    maximum = np.max(data)
    
    # Interquartile range (IQR)
    iqr = third_quartile - first_quartile
    
    # Define upper and lower bounds for outliers
    upper_bound = third_quartile + 1.5 * iqr
    lower_bound = first_quartile - 1.5 * iqr
    
    # Detect outliers
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    
    # Print the statistics
    print("Minimum:", minimum)
    print("First Quartile:", first_quartile)
    print("Median:", median)
    print("Third Quartile:", third_quartile)
    print("Maximum:", maximum)
    print("Interquartile Range:", iqr)
    print("Upper Bound (Outliers):", upper_bound)
    print("Lower Bound (Outliers):", lower_bound)
    print("Outliers:", outliers)
    
    exec(f'boxplot_synth_{y} = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]')
    y +=1

Minimum: 2.0
First Quartile: 2.0
Median: 2.0
Third Quartile: 2.0
Maximum: 2.0
Interquartile Range: 0.0
Upper Bound (Outliers): 2.0
Lower Bound (Outliers): 2.0
Outliers: []
Minimum: 2.0
First Quartile: 2.0
Median: 2.0
Third Quartile: 2.0
Maximum: 2.0
Interquartile Range: 0.0
Upper Bound (Outliers): 2.0
Lower Bound (Outliers): 2.0
Outliers: []
Minimum: 2.0
First Quartile: 2.0
Median: 2.0
Third Quartile: 2.0
Maximum: 2.0
Interquartile Range: 0.0
Upper Bound (Outliers): 2.0
Lower Bound (Outliers): 2.0
Outliers: []
Minimum: 2.0
First Quartile: 2.0
Median: 2.0
Third Quartile: 2.0
Maximum: 2.0
Interquartile Range: 0.0
Upper Bound (Outliers): 2.0
Lower Bound (Outliers): 2.0
Outliers: []


In [38]:
synth_cforest_ite = ['NaN', boxplot_synth_1, boxplot_synth_2, boxplot_synth_3, boxplot_synth_4]
synth_cforest_ite

['NaN',
 [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0],
 [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]]

In [39]:
%store -r df_synthetic_results
lib = "CausalML"
ite_list = synth_cforest_ite
j = 0

for i in range(3, len(df_synth_cforest.columns)):
    method = df_synth_cforest.columns[i]
    ite = ite_list[j]
    ate = df_synth_cforest.iloc[:, i].mean()

    df_synthetic_results = df_synthetic_results._append({'method': method, 'ATE': ate, 'ITE': ite}, ignore_index=True)
    j +=1


print(df_synthetic_results)
%store df_synthetic_results

                    method  ATE                                       ITE
0              Causal Tree  2.0  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]
1              cforest_mse  NaN                                       NaN
2             cforest_cmse  2.0  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]
3       cforest_cmse_p=0.5  2.0  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]
4  cforest_cmse_p=0.5_md=3  2.0  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]
5            cforest_ttest  2.0  [2.0, 2.0, 2.0, 2.0, 2.0, 0.0, 2.0, 2.0]
Stored 'df_synthetic_results' (DataFrame)


## Refutation Test

In [None]:
import os
# Define folder path
folder_path = "./evaluationDatasets/Placebo/"

# List to store treatment effects
cforest_mse_values = []
cforest_ttest_values  = []
cforest_cmse_p_md_values = []
cforest_cmse_p_values = []
cforest_cmse_values = []

feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

columns_to_drop = ['offerSuccess', 'treatmentOffer']

# Iterate through files in the folder
for file_name in os.listdir(folder_path):
    print(file_name)
    # Read CSV file
    file_path = os.path.join(folder_path, file_name)
    refutation = pd.read_csv(file_path)
    refutation['treatment_effect'] = 0

    train, test = train_test_split(refutation, test_size=0.2, random_state=11101)
    
    # Table to gather estimated ITEs by causal forest models
    df_cforest = pd.DataFrame({
        'outcome': test['offerSuccess'],
        'is_treated': test['treatmentOffer'],
        'treatment_effect': test['treatment_effect']
    })
    
    # Extract features and target
    # X_train = train[feature_names]
    # X_train = train.drop(columns=columns_to_drop)    
    # y_train = train['offerSuccess']
    
    # X_test = test[feature_names]
    # X_test = test.drop(columns=columns_to_drop)
    
    # Estimate treatment effect using model
    # Model treatment effect
    for cforest_name, cforest_info in cforests.items():
        print(f"Fitting: {cforest_name}")
        cforest = CausalRandomForestRegressor(**cforest_info['params'])
        cforest.fit(X=train[feature_names].values,
                  y=train['offerSuccess'].values, 
                    treatment=train['treatmentOffer'].values)
        cforests[cforest_name].update({'model': cforest})
        prediction_result = cforest.predict(test[feature_names].values)
        df_cforest[cforest_name] = prediction_result
        print(prediction_result.mean())

    # Append treatment effect value to list
    for i in range(3, len(df_cforest.columns)):
        ate = df_cforest.iloc[:, i].mean()
        print(ate)
        if i == 3:
            cforest_mse_values.append(ate)
        if i == 4:
            cforest_cmse_values.append(ate)
        if i == 5:
            cforest_cmse_p_values.append(ate)
        if i == 6:
            cforest_cmse_p_md_values.append(ate)
        if i == 7:
            cforest_ttest_values.append(ate)

    
# Calculate average treatment effect
print(sum(cforest_mse_values) /len(cforest_mse_values))
print(sum(cforest_cmse_values) / len(cforest_cmse_values))
print(sum(cforest_cmse_p_values) / len(cforest_cmse_p_values))
print(sum(cforest_cmse_p_md_values) / len(cforest_cmse_p_md_values))
print(sum(cforest_ttest_values) / len(cforest_ttest_values))

In [11]:
import os
# Define folder path
folder_path = "./evaluationDatasets/Cause/"

# List to store treatment effects
ate_values = []

feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

columns_to_drop = ['offerSuccess', 'treatmentOffer']

ctree = CausalTreeRegressor()

# Iterate through files in the folder
for file_name in os.listdir(folder_path):
    print(file_name)
    # Read CSV file
    file_path = os.path.join(folder_path, file_name)
    refutation = pd.read_csv(file_path)
    refutation['treatment_effect'] = 0

    train, test = train_test_split(refutation, test_size=0.2, random_state=11101)
    
    # Extract features and target
    # X_train = train[feature_names]
    # X_train = train.drop(columns=columns_to_drop)    
    # y_train = train['offerSuccess']
    
    # X_test = test[feature_names]
    # X_test = test.drop(columns=columns_to_drop)
    
    # Estimate treatment effect using model
    # Model treatment effect
    ctree.fit(X=train.drop(columns=columns_to_drop).values, y=train['offerSuccess'].values, treatment=train['treatmentOffer'].values)
    prediction = ctree.predict(test.drop(columns=columns_to_drop).values)
    print(prediction.mean())

    # Append treatment effect value to list
    ate_values.append(prediction.mean())

    
# Calculate average treatment effect
print("Avergae ATE: ", sum(ate_values) /len(ate_values))

addRandomCauseDataset2.csv
-0.0021052772140876955
addRandomCauseDataset1.csv
-0.006127737059615762
Avergae ATE:  -0.004116507136851729
