In [1]:
%load_ext autoreload
%autoreload 2
%reload_ext autoreload

In [2]:
path = "c:\\Users\\u0166838\\OneDrive - KU Leuven\\Documents\\Doc\\Code\\SimBank from Simulation to Solution in Prescriptive Process Monitoring"

import math
import random
import pandas as pd
from datetime import datetime
from copy import deepcopy
from itertools import product
import simulation
import confounding_level
import sys
sys.path.append(path)
from src.utils.tools import save_data, load_data

In [3]:
# DATASET parameters
# Single configuration dictionary handed to the simulator and saved next to the
# generated data. Every per-intervention entry of "intervention_info" is a list
# with one element per intervention named in "name".
dataset_params = {}

# general
dataset_params["train_size"] = 100000
dataset_params["test_size"] = 10000
dataset_params["val_share"] = .5
dataset_params["train_val_size"] = 10000
# Validation share of the test size, capped at 1000
dataset_params["test_val_size"] = min(int(dataset_params["val_share"] * dataset_params["test_size"]), 1000)
dataset_params["simulation_start"] = datetime(2024, 3, 20, 8, 0)
dataset_params["random_seed_train"] = 82
dataset_params["random_seed_test"] = 130

# process
dataset_params["log_cols"] = ["case_nr", "activity", "timestamp", "elapsed_time", "cum_cost", "est_quality", "unc_quality", "amount", "interest_rate", "discount_factor", "outcome", "quality", "noc", "nor", "min_interest_rate"]
dataset_params["case_cols"] = ["amount"]
dataset_params["event_cols"] = ["activity", "elapsed_time", "cum_cost", "est_quality", "unc_quality", "interest_rate", "discount_factor"]
dataset_params["cat_cols"] = ["activity"]
dataset_params["scale_cols"] = ["amount", "elapsed_time", "cum_cost", "est_quality", "unc_quality", "interest_rate", "discount_factor", "outcome"]

# intervention
# Select exactly one of the "name" lines below; the matching branch fills in the rest.
dataset_params["intervention_info"] = {}
dataset_params["intervention_info"]["name"] = ["choose_procedure"]
# dataset_params["intervention_info"]["name"] = ["set_ir_3_levels"]
# dataset_params["intervention_info"]["name"] = ["time_contact_HQ"]
# dataset_params["intervention_info"]["name"] = ["choose_procedure", "set_ir_3_levels"]
if dataset_params["intervention_info"]["name"] == ["choose_procedure"]:
    dataset_params["intervention_info"]["data_impact"] = ["direct"]
    dataset_params["intervention_info"]["actions"] = [["start_standard", "start_priority"]]  # If binary, last action is the 'treatment' action
    dataset_params["intervention_info"]["action_width"] = [2]
    dataset_params["intervention_info"]["action_depth"] = [1]
    dataset_params["intervention_info"]["activities"] = [["start_standard", "start_priority"]]
    dataset_params["intervention_info"]["column"] = ["activity"]
    dataset_params["intervention_info"]["start_control_activity"] = [["initiate_application"]]
    dataset_params["intervention_info"]["end_control_activity"] = [["initiate_application"]]
elif dataset_params["intervention_info"]["name"] == ["set_ir_3_levels"]:
    dataset_params["intervention_info"]["data_impact"] = ["indirect"]
    dataset_params["intervention_info"]["actions"] = [[0.07, 0.08, 0.09]]
    dataset_params["intervention_info"]["action_width"] = [3]
    dataset_params["intervention_info"]["action_depth"] = [1]
    dataset_params["intervention_info"]["activities"] = [["calculate_offer"]]
    dataset_params["intervention_info"]["column"] = ["interest_rate"]
    dataset_params["intervention_info"]["start_control_activity"] = [[]]
    dataset_params["intervention_info"]["end_control_activity"] = [[]]
elif dataset_params["intervention_info"]["name"] == ["time_contact_HQ"]:
    dataset_params["intervention_info"]["data_impact"] = ["direct"]
    dataset_params["intervention_info"]["actions"] = [["do_nothing", "contact_headquarters"]]  # If binary, last action is the 'treatment' action
    dataset_params["intervention_info"]["action_width"] = [2]
    dataset_params["intervention_info"]["action_depth"] = [4]
    dataset_params["intervention_info"]["activities"] = [["do_nothing", "contact_headquarters"]]
    dataset_params["intervention_info"]["column"] = ["activity"]
    dataset_params["intervention_info"]["start_control_activity"] = [["start_standard"]]
    dataset_params["intervention_info"]["end_control_activity"] = [["start_standard", "email_customer", "call_customer"]]
elif dataset_params["intervention_info"]["name"] == ["choose_procedure", "set_ir_3_levels"]:
    dataset_params["intervention_info"]["data_impact"] = ["direct", "indirect"]
    dataset_params["intervention_info"]["actions"] = [["start_standard", "start_priority"], [0.07, 0.08, 0.09]]
    dataset_params["intervention_info"]["action_width"] = [2, 3]
    dataset_params["intervention_info"]["action_depth"] = [1, 1]
    dataset_params["intervention_info"]["activities"] = [["start_standard", "start_priority"], ["calculate_offer"]]
    dataset_params["intervention_info"]["column"] = ["activity", "interest_rate"]
    dataset_params["intervention_info"]["start_control_activity"] = [["initiate_application"], []]
    dataset_params["intervention_info"]["end_control_activity"] = [["initiate_application"], []]
else:
    # Fail fast: without this branch an unsupported name only surfaces further
    # down as a KeyError on the missing "actions" entry.
    raise ValueError(
        "Unsupported intervention configuration: "
        + str(dataset_params["intervention_info"]["name"])
        + ". Expected one of: ['choose_procedure'], ['set_ir_3_levels'], "
        "['time_contact_HQ'], ['choose_procedure', 'set_ir_3_levels']."
    )

dataset_params["intervention_info"]["retain_method"] = "precise"
# dataset_params["intervention_info"]["retain_method"] = "non-precise"

# Combinations
# Cartesian product of the per-intervention action lists, plus the products of
# the per-intervention widths and depths.
dataset_params["intervention_info"]["action_combinations"] = list(product(*dataset_params["intervention_info"]["actions"]))
dataset_params["intervention_info"]["action_width_combinations"] = math.prod(dataset_params["intervention_info"]["action_width"])
dataset_params["intervention_info"]["action_depth_combinations"] = math.prod(dataset_params["intervention_info"]["action_depth"])

# One entry per intervention: the action width when it exceeds 2, otherwise 1
dataset_params["intervention_info"]["len"] = [action_width if action_width > 2 else 1 for action_width in dataset_params["intervention_info"]["action_width"]]
dataset_params["intervention_info"]["RCT"] = False
# The file name embeds the string form of the name list, e.g. "loan_log_['choose_procedure']"
dataset_params["filename"] = "loan_log_" + str(dataset_params["intervention_info"]["name"])

# policy
dataset_params["policies_info"] = {}
dataset_params["policies_info"]["general"] = "real"
dataset_params["policies_info"]["choose_procedure"] = {"amount": 50000, "est_quality": 5}
dataset_params["policies_info"]["time_contact_HQ"] = "real"
dataset_params["policies_info"]["min_quality"] = 2
dataset_params["policies_info"]["max_noc"] = 3
dataset_params["policies_info"]["max_nor"] = 1
dataset_params["policies_info"]["min_amount_contact_cust"] = 50000

## Offline Mode

### - Use to generate offline, fixed datasets for methods requiring offline training

In [4]:
# Build the generator for the bank-policy data, seeded with the training seed
offline_gen_normal = simulation.PresProcessGenerator(
    dataset_params,
    dataset_params["random_seed_train"],
)

# Training log produced under the bank policy
train_normal = offline_gen_normal.run_simulation_normal(
    dataset_params["train_size"]
)

# Validation log from the same generator, run with an additional seed offset
train_normal_val = offline_gen_normal.run_simulation_normal(
    dataset_params["train_val_size"],
    seed_to_add=88,
)

In [5]:
# RCT data (randomly chosen intervention actions): start from an independent
# copy of the base configuration so the original dictionary stays untouched.
dataset_params_RCT = deepcopy(dataset_params)
dataset_params_RCT["intervention_info"]["RCT"] = True
# Both keys already exist in the copy, so update() only overwrites their values:
# a different training seed, and a start time taken from the end of the
# bank-policy simulation.
dataset_params_RCT.update(
    random_seed_train=dataset_params["random_seed_train"] * 10,
    simulation_start=deepcopy(offline_gen_normal.simulation_end),
)

# Generator dedicated to the RCT configuration
offline_gen_RCT = simulation.PresProcessGenerator(
    dataset_params_RCT,
    dataset_params_RCT["random_seed_train"],
)

# RCT training log
train_RCT = offline_gen_RCT.run_simulation_normal(
    dataset_params_RCT["train_size"]
)

# RCT validation log, run with an additional seed offset
train_RCT_val = offline_gen_RCT.run_simulation_normal(
    dataset_params_RCT["train_val_size"],
    seed_to_add=88,
)

### - Vary the confounding level for offline datasets

In [None]:
# Confounding level delta: controls how the bank-policy data and the RCT data
# are combined by confounding_level.set_delta.
delta = 0

# Apply the same delta to the training pair first, then to the validation pair.
train, train_val = (
    confounding_level.set_delta(data=policy_log, data_RCT=rct_log, delta=delta)
    for policy_log, rct_log in (
        (train_normal, train_RCT),
        (train_normal_val, train_RCT_val),
    )
)

##### Save data

In [None]:
# All artefacts share the same "<path>\Data\<filename>_<train_size>" prefix;
# only the trailing suffix differs per object.
save_prefix = path + "\\Data\\" + dataset_params["filename"] + "_" + str(dataset_params["train_size"])

for suffix, payload in (
    ("_dataset_params", dataset_params),
    ("_train_normal", train_normal),
    ("_train_normal_val", train_normal_val),
    ("_train_RCT", train_RCT),
    ("_train_RCT_val", train_RCT_val),
):
    save_data(payload, save_prefix + suffix)

## Online Mode

### - Use to generate cases (episodes) for methods requiring online training
### - Use to generate cases (episodes) for testing the performance of both offline and online methods (make sure to specify the seed so that the same cases are generated for each method)

In [None]:
# Initiate simulation
online_gen = simulation.PresProcessGenerator(dataset_params, dataset_params["random_seed_test"])

# Dedicated, seeded RNG for the example action choice. Drawing from the global
# `random` module would make the chosen actions differ on every run and would
# share the process-wide random stream; a private instance keeps this example
# reproducible.
action_rng = random.Random(dataset_params["random_seed_test"])

# Start simulation: one episode per test case, each with its own seed offset
for case in range(dataset_params["test_size"]):
    prefix_list = online_gen.start_simulation_inference(seed_to_add=case)

    # Continue simulation and specify actions (for example using a model) until no more interventions are available
    while online_gen.int_points_available:

        # Actions available for the intervention that is currently pending
        available_actions = dataset_params["intervention_info"]["actions"][online_gen.current_int_index]

        # Specify the action to take (here: a uniformly random index)
        index_action_taken = action_rng.randrange(len(available_actions))
        print("Action taken: ", available_actions[index_action_taken])

        # Continue simulation
        prefix_list = online_gen.continue_simulation_inference(index_action_taken)

    # End simulation
    full_case = online_gen.end_simulation_inference()
    print("\n", "Full case: ", full_case)

### - Example of bank policy performance: the first cell includes the bank policy rules, the second cell contains the performance calculation

In [6]:
# Bank policy

from activity_execution import ActivityExecutioner
def get_bank_best_action(prefix_list, current_int_index, params=None):
    """Return the index of the action the bank policy picks at an intervention point.

    Parameters
    ----------
    prefix_list : sequence
        Only ``prefix_list[0]`` is read. Its final element is dropped and the
        element before it is used as the previous event (the original code
        names the trimmed list "prefix without int", so the final element is
        presumably the pending intervention entry -- confirm against the
        simulator). The previous event must support key lookup for
        "unc_quality", "est_quality", "noc" and "amount", depending on the
        intervention.
    current_int_index : int
        Position of the pending intervention in ``intervention_info["name"]``.
    params : dict, optional
        Configuration to evaluate the policy against. Defaults to the
        notebook-level ``dataset_params`` so existing two-argument calls behave
        as before.

    Returns
    -------
    int
        Index into ``intervention_info["actions"][current_int_index]``.
        Returns 0 when the intervention name is not one of the three handled
        here, or when no rule fires.

    Raises
    ------
    ValueError
        For "set_ir_3_levels", if the interest rate returned by
        ``calculate_offer`` is not one of the configured actions.
    """
    if params is None:
        params = dataset_params

    # Previous event: the second-to-last entry of the first prefix.
    prefix_without_int = prefix_list[0][0:-1]
    prev_event = prefix_without_int[-1]

    intervention_name = params["intervention_info"]["name"][current_int_index]
    action_index = 0

    if intervention_name == "time_contact_HQ":
        policies = params["policies_info"]
        cancel_condition = (
            (
                prev_event["unc_quality"] == 0
                and prev_event["est_quality"] < policies["min_quality"]
                and prev_event["noc"] >= policies["max_noc"]
            )
            or (prev_event["noc"] >= policies["max_noc"] and prev_event["unc_quality"] > 0)
        )
        contact_condition = (
            prev_event["noc"] < 2
            and prev_event["unc_quality"] == 0
            and prev_event["amount"] > 10000
            and prev_event["est_quality"] >= policies["min_quality"]
        )

        # The cancel rule takes precedence over the contact rule.
        if cancel_condition:
            action_index = 0
        elif contact_condition:
            action_index = 1

    elif intervention_name == "choose_procedure":
        thresholds = params["policies_info"]["choose_procedure"]
        priority_condition = (
            prev_event["amount"] > thresholds["amount"]
            and prev_event["est_quality"] >= thresholds["est_quality"]
        )
        action_index = 1 if priority_condition else 0

    elif intervention_name == "set_ir_3_levels":
        # Let the process logic compute the offer, then map the resulting
        # interest rate back to its position in the configured action list.
        activity_executioner = ActivityExecutioner()
        ir, _, _ = activity_executioner.calculate_offer(prev_event=prev_event, intervention_info=params["intervention_info"])
        action_index = params["intervention_info"]["actions"][current_int_index].index(ir)

    return action_index

In [7]:
# Number of test cases to simulate
n_cases = 10000

# Accumulates the outcome recorded on the last event of every case
bank_performance = 0

# Generator seeded with the test seed; each case adds its own seed offset below
case_gen = simulation.PresProcessGenerator(dataset_params, seed=dataset_params["random_seed_test"])

for case_nr in range(n_cases):
    prefix_list = case_gen.start_simulation_inference(seed_to_add=case_nr)

    # Apply the bank policy at every intervention point until none remain
    while case_gen.int_points_available:
        bank_best_action = get_bank_best_action(prefix_list, case_gen.current_int_index)
        prefix_list = case_gen.continue_simulation_inference(bank_best_action)

    # Close the case and read the outcome from its final row
    full_case = pd.DataFrame(case_gen.end_simulation_inference())
    bank_performance += full_case["outcome"].iloc[-1]

print("Bank performance: ", bank_performance)

Bank performance:  29976509.22284894


### - Example of random policy performance: random policy includes sampling a random timing of the intervention (if action depth > 1) and sampling a random intervention action

In [10]:
# Number of cases and iteration index (to get the same results as the paper,
# random performance is averaged over iterations 0 to 4)
n_cases = 10000
iteration = 0

# Accumulates the outcome recorded on the last event of every case
random_performance = 0

# Generator seeded with the test seed; each case adds its own seed offset below
case_gen = simulation.PresProcessGenerator(dataset_params, seed=dataset_params["random_seed_test"])

# Private RNG for the random policy, shifted by 5 per iteration
random_object_for_random_policy = random.Random(dataset_params["random_seed_test"] + 5*iteration)

for case_nr in range(n_cases):
    prefix_list = case_gen.start_simulation_inference(seed_to_add=case_nr)

    # Per intervention: how many intervention points have been seen so far
    timings = [0 for _ in dataset_params["intervention_info"]["name"]]

    # Per intervention: the timing at which a random action is drawn. Only
    # "time_contact_HQ" gets a sampled timing (drawn from its action depth and
    # multiplied by 2, as in the original); every other intervention uses 0.
    # NOTE(review): confirm how the factor 2 relates to the per-point counter above.
    random_best_timings = [
        random_object_for_random_policy.choice(range(dataset_params["intervention_info"]["action_depth"][position])) * 2
        if name == "time_contact_HQ"
        else 0
        for position, name in enumerate(dataset_params["intervention_info"]["name"])
    ]

    # Control action until a random action is drawn; the value then persists
    # for the remaining intervention points of the case.
    random_best_action = 0

    while case_gen.int_points_available:
        current = case_gen.current_int_index

        # Draw a random action only when the counter hits the sampled timing
        if timings[current] == random_best_timings[current]:
            random_best_action = random_object_for_random_policy.choice(
                range(dataset_params["intervention_info"]["action_width"][current])
            )
        timings[current] += 1

        prefix_list = case_gen.continue_simulation_inference(random_best_action)

    # Close the case and read the outcome from its final row
    full_case = pd.DataFrame(case_gen.end_simulation_inference())
    random_performance += full_case["outcome"].iloc[-1]

print("Random policy performance: ", random_performance)

Random policy performance:  23851100.968992893
