Support vector regression (Leveraging Shallow Machine Learning to Predict Business Process Behavior)

In [1]:
import pandas as pd
import numpy as np
import datetime
import h2o
from sklearn.metrics import mean_absolute_error

In [15]:
df_train = pd.read_csv('preprocessed_train.csv')
df_test = pd.read_csv('preprocessed_test.csv')
df_train['event time:timestamp'] =  pd.to_datetime(df_train['event time:timestamp'])
df_train['timestamp_next'] = df_train.groupby('case concept:name')['timestamp_finish'].shift(-1)
df_test['timestamp_next'] = df_test.groupby('case concept:name')['timestamp_finish'].shift(-1)
df_test.head()

Unnamed: 0.1,Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,timestamp_start,timestamp_finish,time_to_complete,day_week,day_month,time_of_day,position,timestamp_next
0,0,137,32422708117518,197219,2012-02-01 15:28:39.244,10000,10909.0,W_Nabellen offertes,START,2012-01-16 11:56:28.997,1328110119,1326714988,-1 days +23:59:58.604869753,0,16,11,1,1326715000.0
1,1,138,32422708117519,197219,2012-02-01 15:28:39.244,10000,10909.0,W_Nabellen offertes,COMPLETE,2012-01-16 11:58:03.129,1328110119,1326715083,-1 days +23:59:58.604963885,0,16,11,2,1326886000.0
2,2,188,32422708117520,197219,2012-02-01 15:28:39.244,10000,10861.0,W_Nabellen offertes,START,2012-01-18 11:29:06.272,1328110119,1326886146,-1 days +23:59:58.776027028,2,18,11,3,1326886000.0
3,3,189,32422708117521,197219,2012-02-01 15:28:39.244,10000,10861.0,W_Nabellen offertes,COMPLETE,2012-01-18 11:31:28.257,1328110119,1326886288,-1 days +23:59:58.776169013,2,18,11,4,1326893000.0
4,4,190,32422708117522,197219,2012-02-01 15:28:39.244,10000,11259.0,W_Nabellen offertes,START,2012-01-18 13:24:29.176,1328110119,1326893069,-1 days +23:59:58.782949932,2,18,13,5,1326893000.0


In [16]:
h2o.init()
from h2o.estimators import H2ORandomForestEstimator

hf_train = h2o.H2OFrame(df_train)
hf_test = h2o.H2OFrame(df_test)

predictors = ['timestamp_start','case AMOUNT_REQ','event concept:name','timestamp_finish','position']
dependent = 'timestamp_next'


random_forest = H2ORandomForestEstimator(ntrees=50,
                                        max_depth=10,
                                        min_rows=10)

random_forest.train(x=predictors,y=dependent,
                   training_frame = hf_train,
                   validation_frame= hf_test)

perf = random_forest.model_performance()

pred = random_forest.predict(hf_test)
print(pred)

Checking whether there is an H2O instance running at http://localhost:54321. connected.


0,1
H2O_cluster_uptime:,12 mins 05 secs
H2O_cluster_timezone:,Europe/Amsterdam
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.1
H2O_cluster_version_age:,27 days
H2O_cluster_name:,H2O_from_python_20191955_yh0dpk
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.711 Gb
H2O_cluster_total_cores:,12
H2O_cluster_allowed_cores:,12


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
    predict
1.32554e+09
1.32533e+09
1.32555e+09
1.32556e+09
1.32544e+09
1.32531e+09
1.32324e+09
1.32321e+09
1.32345e+09
1.32345e+09
[22414 rows x 1 column]



In [5]:
def avg_time_per_position(df):
    avg_time = [0] * df['position'].max() # Set initial values for avg_time list with size equal to maximum position
    n = 0 # Initialize a counter for the number of events
    for i in range(df.shape[0] - 1): # Loop through all events in the dataframe
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']: #Only continue if the next event is in the same case
            position = df.iloc[i]['position']
            t_finish_last = df.iloc[i]['timestamp_finish'] #Finish time of event i
            t_start_next_event = df.iloc[i + 1]['timestamp_finish'] #Start time of position i+1
            t_delta = t_start_next_event - t_finish_last #Time difference
            avg_time[position - 1] = (avg_time[position - 1] * n + t_delta) / (n+1) #update average time (Note the differences are in ms)
            n += 1
    return avg_time

def avg_time_per_position_added(df): #Fast version of time prediction
    df['next position'] = df.groupby('case concept:name')['position'].shift(-1) #Create column that contains the next even of each case
    df['Prediction time next event'] = df.apply(lambda x: 
        x['event time:timestamp'] + datetime.timedelta(seconds=avg_time[int(x['position']) - 1] / 1000)
        if not pd.isnull(x['next position']) else np.nan, axis=1) #Returns NaT if this position is the last one, otherwise it will add the time
    df.drop('next position', axis=1, inplace=True)
    return df


avg_time = avg_time_per_position(df_train)
df_train = avg_time_per_position_added(df_train)
df_train.head()

Unnamed: 0.1,Unnamed: 0,index,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,timestamp_start,timestamp_finish,time_to_complete,day_week,day_month,time_of_day,position,Prediction time next event
0,0,0,0,173688,2011-01-10 00:38:44.546,20000,112.0,A_SUBMITTED,COMPLETE,2011-01-10 00:38:44.546,1294619924,1294619924,0 days 00:00:00,0,10,0,1,2011-01-10 00:38:44.867543
1,1,1,1,173688,2011-01-10 00:38:44.546,20000,112.0,A_PARTLYSUBMITTED,COMPLETE,2011-01-10 00:38:44.880,1294619924,1294619924,0 days 00:00:00.000000334,0,10,0,2,2011-01-10 00:40:15.370906
2,2,2,2,173688,2011-01-10 00:38:44.546,20000,112.0,A_PREACCEPTED,COMPLETE,2011-01-10 00:39:37.906,1294619924,1294619977,0 days 00:00:00.000053360,0,10,0,3,2011-01-10 00:40:55.956427
3,3,3,3,173688,2011-01-10 00:38:44.546,20000,112.0,W_Completeren aanvraag,SCHEDULE,2011-01-10 00:39:38.875,1294619924,1294619978,0 days 00:00:00.000054329,0,10,0,4,2011-01-10 00:42:43.678512
4,4,89,4,173688,2011-01-10 00:38:44.546,20000,,W_Completeren aanvraag,START,2011-01-10 11:36:46.437,1294619924,1294659406,0 days 00:00:00.039481891,0,10,11,5,2011-01-10 11:36:54.392586


In [6]:
def calculate_mae(df):
    df['timestamp_prediction'] = df['Prediction time next event'].values.astype(np.int64) // 10 ** 9
    
    y_pred = []
    y_true = []
    for i in range(df.shape[0] - 1):
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']:
            y_pred.append(df.iloc[i]['timestamp_prediction'])
            y_true.append(df.iloc[i + 1]['timestamp_finish'])
    mae = mean_absolute_error(y_true, y_pred)
    return mae


mae = calculate_mae(df_train) 
print(mae) #Current MAE~4 days

362759.1716305629
