In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [2]:
# Integer codes substituted for the string labels of the two treatment columns.
TREATMENT_CODES = {'treated': 1, 'notTreated': 0}
TREATMENT_SUCCESS_CODES = {'Yes': 2, 'No': 1, '0': 0}

df = pd.read_csv("bpi2017_filledLog.csv")

# Apply each label -> code table to its column, then force an int64 dtype.
for column, codes in (
    ('treatment', TREATMENT_CODES),
    ('treatmentSuccess', TREATMENT_SUCCESS_CODES),
):
    df[column] = df[column].replace(codes).astype('int64')

In [3]:
# Columns left out of the reduced frame; all remaining columns are kept.
dropped_columns = [
    'case:concept:name',
    'Action',
    'org:resource',
    'concept:name',
    'EventOrigin',
    'EventID',
    'lifecycle:transition',
    'time:timestamp',
    'OfferID',
]
smaller_df = df.drop(columns=dropped_columns)

In [4]:
# Work on a copy so `smaller_df` keeps its original (un-encoded) values.
struct_data = smaller_df.copy()
non_numeric_columns = list(struct_data.select_dtypes(exclude=[np.number]).columns)

# One fitted encoder per column, keyed by column name, so every integer code
# can be mapped back to its original label through
# label_encoders[col].classes_ or label_encoders[col].inverse_transform(...).
# A single shared encoder is refitted on every pass and therefore only
# remembers the labels of the last column it saw.
label_encoders = {}

# `le` stays defined even when there is nothing to encode; after the loop it
# refers to the encoder of the last non-numeric column.
le = LabelEncoder()

for col in non_numeric_columns:
    le = LabelEncoder()
    struct_data[col] = le.fit_transform(struct_data[col])
    label_encoders[col] = le

struct_data.head(5)

Unnamed: 0,NumberOfOffers,case:LoanGoal,case:ApplicationType,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,treatment,successful,treatmentSuccess
0,1,10,1,5000.0,5000.0,22.0,1,241.28,0,0.0,5000.0,0,0,0
1,1,10,1,5000.0,5000.0,22.0,1,241.28,0,0.0,5000.0,0,0,0
2,1,10,1,5000.0,5000.0,22.0,1,241.28,0,0.0,5000.0,0,0,0
3,1,10,1,5000.0,5000.0,22.0,1,241.28,0,0.0,5000.0,0,0,0
4,1,10,1,5000.0,5000.0,22.0,1,241.28,0,0.0,5000.0,0,0,0


In [5]:
# Display the column names of the encoded frame.
struct_data.columns

Index(['NumberOfOffers', 'case:LoanGoal', 'case:ApplicationType',
       'case:RequestedAmount', 'FirstWithdrawalAmount', 'NumberOfTerms',
       'Accepted', 'MonthlyCost', 'Selected', 'CreditScore', 'OfferedAmount',
       'treatment', 'successful', 'treatmentSuccess'],
      dtype='object')

In [6]:
# Covariate columns handed to the model; 'treatment', 'successful' and
# 'treatmentSuccess' are deliberately not part of this list.
feature_names = [
    'NumberOfOffers',
    'case:LoanGoal',
    'case:ApplicationType',
    'case:RequestedAmount',
    'FirstWithdrawalAmount',
    'NumberOfTerms',
    'Accepted',
    'MonthlyCost',
    'Selected',
    'CreditScore',
    'OfferedAmount',
]

In [7]:
import yaml

# Read the model configuration from the YAML file. The `with` block closes
# the file handle as soon as parsing finishes; passing a bare open() call to
# safe_load would leave the handle open until garbage collection.
with open('egm-nn/bpi2017.yaml', 'r') as config_file:
    params = yaml.safe_load(config_file)
print(params)

{'dataset': 'BPI2017', 'output_dir': './', 'v_dim': 11, 'z_dims': [3, 3, 6, 6], 'x_min': 0, 'x_max': 3, 'lr': 0.0002, 'alpha': 1, 'beta': 1, 'gamma': 10, 'g_d_freq': 5, 'g_units': [64, 64, 64, 64, 64], 'e_units': [64, 64, 64, 64, 64], 'f_units': [64, 32, 8], 'h_units': [64, 32, 8], 'dz_units': [64, 32, 8], 'dv_units': [64, 32, 8], 'binary_treatment': True, 'use_z_rec': True, 'use_v_gan': True, 'save_model': False, 'save_res': True}


In [12]:
import CausalEGM as cegm

# Seed handed to the model constructor.
RANDOM_SEED = 123

# Build the model from the parameters loaded from the YAML config.
model = cegm.CausalEGM(
    params=params,
    random_seed=RANDOM_SEED,
)

In [10]:
# Model inputs taken from the encoded frame:
#   x - treatment column
#   y - outcome column ('successful')
#   v - covariate columns listed in `feature_names`
x, y = struct_data['treatment'], struct_data['successful']
v = struct_data[feature_names]

In [13]:
# Train on the [treatment, outcome, covariates] triple for 100 iterations.
model.train(
    data=[x, y, v],
    n_iter=100,
    save_format='npy',
    verbose=False,
)

The average treatment effect (ATE) is  0.05254575
