In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV, LogisticRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error as mse
from scipy.stats import entropy
import warnings

from causalml.inference.meta import LRSRegressor
from causalml.inference.meta import XGBTRegressor, MLPTRegressor
from causalml.inference.meta import BaseXRegressor, BaseRRegressor, BaseSRegressor, BaseTRegressor
from causalml.inference.tf import DragonNet
from causalml.match import NearestNeighborMatch, MatchOptimizer, create_table_one
from causalml.propensity import ElasticNetPropensityModel
from causalml.metrics import *


import torch
import logging
from causalml.inference.nn import CEVAE

import os, sys

%matplotlib inline

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
sns.set_palette('Paired')
plt.rcParams['figure.figsize'] = (12,8)

2024-02-28 11:36:44.986625: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-02-28 11:36:46.707311: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-28 11:36:46.707344: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-28 11:36:46.708685: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-28 11:36:46.895839: I tensorflow/core/platform/cpu_feature_guar

## BPIC 2017 

In [2]:
df = pd.read_csv("bpi2017_final.csv")
print(df.columns)
df.head()

Index(['case:concept:name', 'NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'Selected', 'CreditScore', 'OfferedAmount', 'treatedCase',
       'caseSuccesful', 'treatmentSuccess', 'offerNumber', 'offerSuccess',
       'treatmentOffer', 'timeApplication', 'weekdayApplication'],
      dtype='object')


Unnamed: 0,case:concept:name,NumberOfOffers,Action,org:resource,concept:name,EventOrigin,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,...,CreditScore,OfferedAmount,treatedCase,caseSuccesful,treatmentSuccess,offerNumber,offerSuccess,treatmentOffer,timeApplication,weekdayApplication
0,0.0,1.0,0.0,0.0,4.0,0.0,1.0,651433.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0
1,0.0,1.0,4.0,0.0,8.0,0.0,1.0,651434.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.061,2.0
2,0.0,1.0,0.0,0.0,22.0,2.0,3.0,651435.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,0.29,2.0
3,0.0,1.0,1.0,0.0,22.0,2.0,6.0,651437.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.613,2.0
4,0.0,1.0,0.0,0.0,21.0,2.0,3.0,651438.0,10.0,1.0,...,0.0,5000.0,0.0,0.0,0.0,1.0,0.0,0.0,66.62,2.0


In [3]:
feature_names = ['NumberOfOffers', 'Action', 'org:resource',
       'concept:name', 'EventOrigin', 'lifecycle:transition', 'time:timestamp',
       'case:LoanGoal', 'case:ApplicationType', 'case:RequestedAmount',
       'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost',
       'CreditScore', 'OfferedAmount', 'offerNumber','timeApplication', 'weekdayApplication']

In [4]:
y_outcome = df['offerSuccess'].values
x_feature = df[feature_names].values
t_treatment = np.array([np.array([value]) for value in df['treatmentOffer']])

In [None]:
dragon = DragonNet(neurons_per_layer=200, targeted_reg=True)
dragon_ite = dragon.fit_predict(x_feature, t_treatment, y_outcome, return_components=False)
dragon_ate = dragon_ite.mean()

In [None]:
print(dragon_ite)
print(dragon_ate)

In [6]:
y = 1

# Calculate statistics
data = np.reshape(dragon_ite, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Print the statistics
print("Minimum:", minimum)
print("First Quartile:", first_quartile)
print("Median:", median)
print("Third Quartile:", third_quartile)
print("Maximum:", maximum)
print("Interquartile Range:", iqr)
print("Upper Bound (Outliers):", upper_bound)
print("Lower Bound (Outliers):", lower_bound)
print("Outliers:", outliers)

ite_dragon = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]

Minimum: 0.12840658
First Quartile: 0.1284065842628479
Median: 0.12840658
Third Quartile: 0.1284065842628479
Maximum: 0.12840658
Interquartile Range: 0.0
Upper Bound (Outliers): 0.1284065842628479
Lower Bound (Outliers): 0.1284065842628479
Outliers: []


### CEVAE Model

This module implements the Causal Effect Variational Autoencoder [1]

[1] C. Louizos, U. Shalit, J. Mooij, D. Sontag, R. Zemel, M. Welling (2017).
Causal Effect Inference with Deep Latent-Variable Models.
http://papers.nips.cc/paper/7223-causal-effect-inference-with-deep-latent-variable-models.pdf
https://github.com/AMLab-Amsterdam/CEVAE

In [4]:
# Split data to training and testing samples for model validation (next section)
df_train, df_test = train_test_split(df, test_size=0.2, random_state=11101)

In [5]:
X_train = df_train[feature_names].values
X_test = df_test[feature_names].values
treatment_train = df_train['treatmentOffer'].values
treatment_test = df_test['treatmentOffer'].values
y_train = df_train['offerSuccess'].values
y_test = df_test['offerSuccess'].values

In [6]:
# cevae model settings
outcome_dist = "normal"
latent_dim = 20
hidden_dim = 200
num_epochs = 5
batch_size = 1000
learning_rate = 0.001
learning_rate_decay = 0.01
num_layers = 2

In [7]:
cevae = CEVAE(outcome_dist=outcome_dist,
              latent_dim=latent_dim,
              hidden_dim=hidden_dim,
              num_epochs=num_epochs,
              batch_size=batch_size,
              learning_rate=learning_rate,
              learning_rate_decay=learning_rate_decay,
              num_layers=num_layers)

In [8]:
# fit
losses = cevae.fit(X=torch.tensor(X_train, dtype=torch.float),
                   treatment=torch.tensor(treatment_train, dtype=torch.float),
                   y=torch.tensor(y_train, dtype=torch.float))

INFO 	 Training with 959 minibatches per epoch
DEBUG 	 step     0 loss = 39.432
DEBUG 	 step   100 loss = 21.5587
DEBUG 	 step   200 loss = 17.7834
DEBUG 	 step   300 loss = 16.1056
DEBUG 	 step   400 loss = 14.7949
DEBUG 	 step   500 loss = 13.9928
DEBUG 	 step   600 loss = 12.7464
DEBUG 	 step   700 loss = 11.3728
DEBUG 	 step   800 loss = 11.5583
DEBUG 	 step   900 loss = 11.1287
DEBUG 	 step  1000 loss = 10.064
DEBUG 	 step  1100 loss = 10.4556
DEBUG 	 step  1200 loss = 9.80264
DEBUG 	 step  1300 loss = 9.04074
DEBUG 	 step  1400 loss = 8.62775
DEBUG 	 step  1500 loss = 8.42838
DEBUG 	 step  1600 loss = 7.67158
DEBUG 	 step  1700 loss = 8.49774
DEBUG 	 step  1800 loss = 8.13176
DEBUG 	 step  1900 loss = 7.54507
DEBUG 	 step  2000 loss = 7.20558
DEBUG 	 step  2100 loss = 7.25838
DEBUG 	 step  2200 loss = 7.25943
DEBUG 	 step  2300 loss = 7.16487
DEBUG 	 step  2400 loss = 6.76904
DEBUG 	 step  2500 loss = 7.35741
DEBUG 	 step  2600 loss = 7.30728
DEBUG 	 step  2700 loss = 6.72472
DEB

In [9]:
# predict
ate_val = cevae.predict(X_test)

INFO 	 Evaluating 240 minibatches
DEBUG 	 batch ate = -0.0518396
DEBUG 	 batch ate = -0.0578103
DEBUG 	 batch ate = 0.0179996
DEBUG 	 batch ate = -0.0672696
DEBUG 	 batch ate = -0.0212976
DEBUG 	 batch ate = -0.0394065
DEBUG 	 batch ate = -0.071669
DEBUG 	 batch ate = -0.0345337
DEBUG 	 batch ate = -0.0644459
DEBUG 	 batch ate = -0.0492067
DEBUG 	 batch ate = -0.0511732
DEBUG 	 batch ate = -0.0738739
DEBUG 	 batch ate = -0.0653151
DEBUG 	 batch ate = -0.0608201
DEBUG 	 batch ate = -0.0718545
DEBUG 	 batch ate = -0.0776044
DEBUG 	 batch ate = -0.0560759
DEBUG 	 batch ate = -0.089948
DEBUG 	 batch ate = -0.0721967
DEBUG 	 batch ate = -0.0310648
DEBUG 	 batch ate = -0.0875605
DEBUG 	 batch ate = -0.056983
DEBUG 	 batch ate = -0.0885981
DEBUG 	 batch ate = -0.0835431
DEBUG 	 batch ate = -0.0559851
DEBUG 	 batch ate = -0.096993
DEBUG 	 batch ate = -0.04468
DEBUG 	 batch ate = -0.0763461
DEBUG 	 batch ate = -0.0586184
DEBUG 	 batch ate = -0.0412979
DEBUG 	 batch ate = -0.0367664
DEBUG 	 batc

### Input results in result df

In [None]:
#for Dragonnet
%store -r df_results
lib = "CausalML"
method = "Dragonnet"

if method in df_results['method'].values:
    df_results.loc[df_results['method'] == method, 'ATE'] = dragon_ate
    df_results.loc[df_results['method'] == method, 'ITE'] = ite_dragon

else:
    df_results = df_results._append({'method': method, 'ATE': dragon_ate, 'ITE': ite_dragon, 'Library': lib}, ignore_index=True)

%store df_results

In [10]:
# for CEVAE
%store -r df_results
ate = ate_val.mean()
lib = "CausalML"
method = "CEVAE"

if method in df_results['method'].values:
    df_results.loc[df_results['method'] == method, 'ATE'] = ate

else:
    df_results = df_results._append({'method': method, 'ATE': ate, 'Library': lib}, ignore_index=True)

%store df_results

Stored 'df_results' (DataFrame)


## Synthetic Data

In [None]:
df_synth = pd.read_csv("synthetic_dataset.csv")
df_synth.head()
synthetic_features = ['NumberOfOffers', 'concept:name',
       'lifecycle:transition', 'time:timestamp', 'elementId', 'resourceId',
       'weekdayApplication', 'timeApplication']

t_treatment=np.array([np.array([value]) for value in df_synth['treatment']])
x_feature = df_synth[synthetic_features].values
y_outcome=df_synth['treatmentSuccess'].values

In [3]:
synth_dragon = DragonNet(neurons_per_layer=100, targeted_reg=True)
synth_dragon_ite = synth_dragon.fit_predict(x_feature, t_treatment, y_outcome, return_components=False)
synth_dragon_ate = synth_dragon_ite.mean()
print(synth_dragon_ite, synth_dragon_ate)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 1/100
   9/2795 [..............................] - ETA: 36s - loss: 522981824.0000 - regression_loss: 10491740.0000 - binary_classification_loss: 148.4771 - treatment_accuracy: 0.6545 - track_epsilon: 17.3554Batch 9: Invalid loss, terminating training
[[nan]
 [nan]
 [nan]
 ...
 [nan]
 [nan]
 [nan]] nan


In [26]:
y = 1

# Calculate statistics
data = np.reshape(synth_dragon_ite, -1)
minimum = np.min(data)
first_quartile = np.percentile(data, 25)
median = np.median(data)
third_quartile = np.percentile(data, 75)
maximum = np.max(data)

# Interquartile range (IQR)
iqr = third_quartile - first_quartile

# Define upper and lower bounds for outliers
upper_bound = third_quartile + 1.5 * iqr
lower_bound = first_quartile - 1.5 * iqr

# Detect outliers
outliers = data[(data < lower_bound) | (data > upper_bound)]

# Print the statistics
print("Minimum:", minimum)
print("First Quartile:", first_quartile)
print("Median:", median)
print("Third Quartile:", third_quartile)
print("Maximum:", maximum)
print("Interquartile Range:", iqr)
print("Upper Bound (Outliers):", upper_bound)
print("Lower Bound (Outliers):", lower_bound)
print("Outliers:", outliers)

ite_synth_dragon = [minimum, first_quartile, median, third_quartile, maximum, iqr, upper_bound, lower_bound]

Minimum: nan
First Quartile: nan
Median: nan
Third Quartile: nan
Maximum: nan
Interquartile Range: nan
Upper Bound (Outliers): nan
Lower Bound (Outliers): nan
Outliers: []


In [None]:
# Split data to training and testing samples for model validation (next section)
df_synth_train, df_synth_test = train_test_split(df_synth, test_size=0.2, random_state=11101)

In [None]:
X_train = df_synth_train[synthetic_features].values
X_test = df_synth_test[synthetic_features].values
treatment_train = df_synth_train['treatment'].values
treatment_test = df_synth_test['treatment'].values
y_train = df_synth_train['treatmentSuccess'].values
y_test = df_synth_test['treatmentSuccess'].values

In [None]:
# cevae model settings
outcome_dist = "normal"
latent_dim = 20
hidden_dim = 200
num_epochs = 5
batch_size = 1000
learning_rate = 0.001
learning_rate_decay = 0.01
num_layers = 2

cevae = CEVAE(outcome_dist=outcome_dist,
              latent_dim=latent_dim,
              hidden_dim=hidden_dim,
              num_epochs=num_epochs,
              batch_size=batch_size,
              learning_rate=learning_rate,
              learning_rate_decay=learning_rate_decay,
              num_layers=num_layers)

In [None]:
# fit
synth_losses = cevae.fit(X=torch.tensor(X_train, dtype=torch.float),
                   treatment=torch.tensor(treatment_train, dtype=torch.float),
                   y=torch.tensor(y_train, dtype=torch.float))

In [None]:
# predict
synth_ite = cevae.predict(X_test)

In [None]:
synth_ate = synth_ite.mean()

In [None]:
import evaluation_metrics
true_ate = 2
boxplot = evaluation_metrics.boxplot_ite(synth_ite)
metrics = evaluation_metrics.evaluation_metrics(true_ate, synth_ite)
print(boxplot, metrics)

In [None]:
%store -r df_synthetic_results_metric

method = "CEVAE"
ate = synth_ate
ite = boxplot
metric = metrics

df_synthetic_results_metric = df_synthetic_results_metric._append({'method': method, 'ATE': ate, 'ITE': ite, 'metrics': metric}, ignore_index=True)

print(df_synthetic_results_metric)
%store df_synthetic_results_metric