In [4]:
import pandas as pd
import numpy as np
import yaml
import CausalEGM as cegm
import evaluationData

## BPIC 2017 Dataset

In [2]:
df = pd.read_csv("bpi2017_final.csv")
print(df.columns)
df.head()

Index(['case:concept:name', 'NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'treatedCase',
       'caseSuccesful', 'treatmentSuccess', 'offerNumber', 'offerSuccess',
       'treatmentOffer', 'timeApplication', 'weekdayApplication'],
      dtype='object')


Unnamed: 0,case:concept:name,NumberOfOffers,Action,org:resource,concept:name,EventOrigin,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,CreditScore,OfferedAmount,treatedCase,caseSuccesful,treatmentSuccess,offerNumber,offerSuccess,treatmentOffer,timeApplication,weekdayApplication
0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,651433.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1,0.0,1.0,4.0,0.0,8.0,0.0,1.0,651434.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.061,2.0
2,0.0,1.0,0.0,0.0,22.0,2.0,3.0,651435.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.29,2.0
3,0.0,1.0,1.0,0.0,22.0,2.0,6.0,651437.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.613,2.0
4,0.0,1.0,0.0,0.0,21.0,2.0,3.0,651438.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.62,2.0


In [3]:
feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

In [4]:
params = yaml.safe_load(open('egm-nn/bpi2017.yaml', 'r'))
print(params)

{'dataset': 'BPI2017', 'output_dir': './egm-nn/', 'v_dim': 19, 'z_dims': [3, 3, 6, 6], 'x_min': 0, 'x_max': 3, 'lr': 0.0002, 'alpha': 1, 'beta': 1, 'gamma': 10, 'g_d_freq': 5, 'g_units': [64, 64, 64, 64, 64], 'e_units': [64, 64, 64, 64, 64], 'f_units': [64, 32, 8], 'h_units': [64, 32, 8], 'dz_units': [64, 32, 8], 'dv_units': [64, 32, 8], 'binary_treatment': True, 'use_z_rec': True, 'use_v_gan': True, 'save_model': False, 'save_res': True}


In [5]:
model = cegm.CausalEGM(params=params,random_seed=123)

In [6]:
#treatment (x), potential outcome (y), and covariates (v)
x = df['treatmentOffer']
y = df['offerSuccess']
v = df[feature_names]

In [7]:
model.train(data=[x,y,v],n_iter=100,save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.1247714


In [24]:
results = np.load('./egm-nn/results/BPI2017/20240122_162745/causal_pre_final.npy')
results

array([[0.1953125 ],
       [0.16015625],
       [0.05664062],
       ...,
       [0.21972656],
       [0.09375   ],
       [0.19140625]], dtype=float32)

In [25]:
ite = results.reshape(-1)
ate = results.mean()

In [26]:
# Calculate statistics
data = np.reshape(results, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

ite_egm = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]
print(ite_egm)

[-0.484375, 0.0556640625, 0.15234375, 0.203125, 0.6875, 0.1474609375, 0.42431640625, -0.16552734375]


In [31]:
#Store result in df of the results
%store -r df_results
lib = "CausalEGM"
method = "CausalEGM"

if method in df_results['method'].values:
    df_results.loc[df_results['method'] == method, 'ATE'] = ate
    index = df_results[df_results['method'] == method].index[0]
    df_results.at[index, 'ITE'] = ite_egm

else:
    df_results = df_results._append({'method': method, 'ATE': ate, 'ITE': ite_egm, 'Library': lib}, ignore_index=True)

%store df_results

Stored 'df_results' (DataFrame)


## Synthetic Dataset

In [6]:
df_synth = pd.read_csv("synthetic_dataset.csv")
synthetic_features = ['NumberOfOffers', 'concept:name',
       'lifecycle:transition', 'time:timestamp', 'elementId', 'resourceId',
       'weekdayApplication', 'timeApplication']
df_synth.head()

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess,weekdayApplication,timeApplication
0,0.0,2.0,29.0,0.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
1,0.0,2.0,29.0,2.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
2,0.0,2.0,29.0,1.0,152.0,28.0,1.0,1.0,1.0,2.0,1.0,0.0
3,0.0,2.0,5.0,0.0,152.0,7.0,1.0,1.0,1.0,2.0,1.0,0.0
4,0.0,2.0,5.0,2.0,152.0,7.0,0.0,1.0,1.0,2.0,1.0,0.0


In [11]:
params_synth = yaml.safe_load(open('egm-nn/synthetic-Dataset.yaml', 'r'))
print(params_synth)

{'dataset': 'SynthData', 'output_dir': './egm-nn/', 'v_dim': 8, 'z_dims': [3, 3, 6, 6], 'x_min': 0, 'x_max': 3, 'lr': 0.0002, 'alpha': 1, 'beta': 1, 'gamma': 10, 'g_d_freq': 5, 'g_units': [64, 64, 64, 64, 64], 'e_units': [64, 64, 64, 64, 64], 'f_units': [64, 32, 8], 'h_units': [64, 32, 8], 'dz_units': [64, 32, 8], 'dv_units': [64, 32, 8], 'binary_treatment': True, 'use_z_rec': True, 'use_v_gan': True, 'save_model': False, 'save_res': True}


In [8]:
#treatment (x), potential outcome (y), and covariates (v)
x = df_synth['treatment']
y = df_synth['treatmentSuccess']
v = df_synth[synthetic_features]

In [12]:
synth_model = cegm.CausalEGM(params=params_synth,random_seed=123)

In [13]:
synth_model.train(data=[x,y,v],n_iter=100,save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.07459373


In [19]:
synth_results = np.load('./egm-nn/results/SynthData/20240227_122224/causal_pre_final.npy')
synth_results

array([[0.12651193],
       [0.11582285],
       [0.1216473 ],
       ...,
       [0.08596039],
       [0.0859375 ],
       [0.08587646]], dtype=float32)

In [20]:
synth_ite = synth_results.reshape(-1)
synth_ate = synth_results.mean()
print(synth_ite, synth_ate)

[0.12651193 0.11582285 0.1216473  ... 0.08596039 0.0859375  0.08587646] 0.07459373


In [22]:
# Calculate statistics
data = np.reshape(synth_results, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

synth_ite_egm = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]
print(synth_ite_egm)

[-0.12484968, 0.059909820556640625, 0.07635641, 0.09584903717041016, 0.3586205, 0.03593921661376953, 0.14975786209106445, 0.006000995635986328]


In [29]:
%store -r df_synthetic_results
method = "CausalEGM"
ite = synth_ite_egm
ate = synth_ate

if method in df_synthetic_results['method'].values:
     # If the method is already in the DataFrame, update the ATE and ITE columns
    df_synthetic_results.loc[df_synthetic_results['method'] == method, 'ATE'] = ate
    index = df_synthetic_results[df_synthetic_results['method'] == method].index[0]
    df_synthetic_results.at[index, 'ITE'] = ite
else:
    # If the method is not in the DataFrame, add a new row
    df_synthetic_results = df_synthetic_results._append({'method': method, 'ATE': ate, 'ITE': ite}, ignore_index=True)

print(df_synthetic_results)
%store df_synthetic_results

                                           method       ATE  \
0                                     Causal Tree  2.000000   
1                                     cforest_mse       NaN   
2                                    cforest_cmse  2.000000   
3                              cforest_cmse_p=0.5  2.000000   
4                         cforest_cmse_p=0.5_md=3  2.000000   
5                                   cforest_ttest  2.000000   
6                                          IPW LR  2.000000   
7                                       Double ML  1.994487   
8                               Linear Regression  2.000000   
9                                    S-Learner LR  2.000000   
10                                  XGBTRegressor  2.000000   
11                             BaseTRegressor XGB  2.000000   
12                              BaseTRegressor LR  2.000000   
13                             BaseXRegressor XGB  2.000000   
14                              BaseXRegressor LR  2.00

## Evaluation Dataset

In [25]:
# import importlib
# importlib.reload(evaluationData)

placeboData, placebo_X, placebo_y, placebo_t = evaluationData.placeboTreatment(df)

causeData, cause_X, cause_y, cause_t = evaluationData.randomCause(df)

replaceData, replace_X, replace_y, replace_t, replaced_feature = evaluationData.randomReplace(df)
print(replaced_feature)

subsetData, subset_X, subset_y, subset_t = evaluationData.randomSubsetData(df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  inference_features['random covariate'] = new_data


concept:name


In [31]:
model.train(data=[placebo_t,placebo_y,placebo_X],n_iter=100, save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.14709416


In [32]:
#treatment (x), potential outcome (y), and covariates (v)
params_cause = yaml.safe_load(open('egm-nn/bpi2017-CommonCause.yaml', 'r'))
model_cause = cegm.CausalEGM(params=params_cause,random_seed=123)

model_cause.train(data=[cause_t,cause_y,cause_X],n_iter=100, save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.09950337


In [33]:
model.train(data=[replace_t,replace_y,replace_X],n_iter=100, save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.14270824


In [34]:
model.train(data=[subset_t,subset_y,subset_X],n_iter=100, save_format='npy',verbose=False)

The average treatment effect (ATE) is  0.13757065


In [35]:
# result_placebo = np.load('./egm-nn/results/BPI2017/')
# placebo_ate = result_placebo.mean()
placebo_ate = 0.14709416
cause_ate = 0.09950337
subset_ate = 0.14270824
replace_ate = 0.13757065

In [36]:
%store -r df_evaluation_results
method = "CausalEGM"

if method in df_evaluation_results['method'].values:
     # If the method is already in the DataFrame, update the ATE and ITE columns
    df_evaluation_results.loc[df_evaluation_results['method'] == method, 'Random Common Cause ATE'] = cause_ate
    df_evaluation_results.loc[df_evaluation_results['method'] == method, 'Placebo treatment ATE'] = placebo_ate
    df_evaluation_results.loc[df_evaluation_results['method'] == method, 'Random Subset ATE'] = subset_ate
    df_evaluation_results.loc[df_evaluation_results['method'] == method, 'Random Feature Replacement ATE'] = replace_ate
    df_evaluation_results.loc[df_evaluation_results['method'] == method, 'Replaced Feature'] = replaced_feature
else:
    # If the method is not in the DataFrame, add a new row
    df_evaluation_results = df_evaluation_results._append({'method': method, 'Random Common Cause ATE': cause_ate, 'Placebo treatment ATE': placebo_ate, 'Random Subset ATE': subset_ate, 'Random Feature Replacement ATE': replace_ate, 'Replaced Feature': replaced_feature}, ignore_index=True)

%store df_evaluation_results

Stored 'df_evaluation_results' (DataFrame)
