Support vector regression (Leveraging Shallow Machine Learning to Predict Business Process Behavior)

In [3]:
# !pip install h2o

In [4]:
import pandas as pd
import numpy as np
import datetime
import h2o
from sklearn.metrics import mean_absolute_error

In [5]:
df_train = pd.read_csv('preprocessed_train.csv')
df_test = pd.read_csv('preprocessed_test.csv')
df_train['event time:timestamp'] =  pd.to_datetime(df_train['event time:timestamp'])
df_train['timestamp_next'] = df_train.groupby('case concept:name')['timestamp_finish'].shift(-1)
df_test['timestamp_next'] = df_test.groupby('case concept:name')['timestamp_finish'].shift(-1)
df_test.head()

Unnamed: 0.1,Unnamed: 0,level_0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,...,time_of_day,position,prev_event,2prev_event,next_event,prev_time,next_time,prev_timestamp,next_timestamp,timestamp_next
0,239787,239787,45676977192960,206826,2012-06-02 16:07:15.673,15000,112.0,A_SUBMITTED,COMPLETE,2012-06-02 16:07:15.673,...,16,1,FIRST EVENT,FIRST EVENT,A_PARTLYSUBMITTED,2012-06-02 16:07:15.673,2012-06-02 16:07:15.959,1338653000.0,1338653000.0,1338653000.0
1,239788,239788,45676977192961,206826,2012-06-02 16:07:15.673,15000,112.0,A_PARTLYSUBMITTED,COMPLETE,2012-06-02 16:07:15.959,...,16,2,A_SUBMITTED,FIRST EVENT,A_SUBMITTED,2012-06-02 16:07:15.673,2012-06-02 16:07:15.673,1338653000.0,1338653000.0,1338653000.0
2,239789,239789,45676977192960,206826,2012-06-02 16:07:15.673,15000,112.0,A_SUBMITTED,COMPLETE,2012-06-02 16:07:15.673,...,16,3,A_PARTLYSUBMITTED,A_SUBMITTED,A_PARTLYSUBMITTED,2012-06-02 16:07:15.959,2012-06-02 16:07:15.959,1338653000.0,1338653000.0,1338653000.0
3,239790,239790,45676977192961,206826,2012-06-02 16:07:15.673,15000,112.0,A_PARTLYSUBMITTED,COMPLETE,2012-06-02 16:07:15.959,...,16,4,A_SUBMITTED,A_PARTLYSUBMITTED,A_SUBMITTED,2012-06-02 16:07:15.673,2012-06-02 16:07:15.673,1338653000.0,1338653000.0,1338653000.0
4,239791,284613,45676977192960,206826,2012-06-02 16:07:15.673,15000,112.0,A_SUBMITTED,COMPLETE,2012-06-02 16:07:15.673,...,16,5,A_PARTLYSUBMITTED,A_SUBMITTED,A_PARTLYSUBMITTED,2012-06-02 16:07:15.959,2012-06-02 16:07:15.959,1338653000.0,1338653000.0,1338653000.0


In [6]:
h2o.init()
from h2o.estimators import H2ORandomForestEstimator

hf_train = h2o.H2OFrame(df_train)
hf_test = h2o.H2OFrame(df_test)

predictors = ['timestamp_start','case AMOUNT_REQ','event concept:name','timestamp_finish','position']
dependent = 'timestamp_next'


random_forest = H2ORandomForestEstimator(ntrees=50,
                                        max_depth=10,
                                        min_rows=10)

random_forest.train(x=predictors,y=dependent,
                   training_frame = hf_train,
                   validation_frame= hf_test)

perf = random_forest.model_performance()

pred = random_forest.predict(hf_test)
print(pred)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.361-b09, mixed mode)
  Starting server from C:\Users\Goshko\anaconda3\lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\Goshko\AppData\Local\Temp\tmpz2tr73eo
  JVM stdout: C:\Users\Goshko\AppData\Local\Temp\tmpz2tr73eo\h2o_Goshko_started_from_python.out
  JVM stderr: C:\Users\Goshko\AppData\Local\Temp\tmpz2tr73eo\h2o_Goshko_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,Europe/Berlin
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.40.0.2
H2O_cluster_version_age:,3 hours and 27 minutes
H2O_cluster_name:,H2O_from_python_Goshko_5slgs3
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.531 Gb
H2O_cluster_total_cores:,8
H2O_cluster_allowed_cores:,8


Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
drf Model Build progress: |██████████████████████████████████████████████████████| (done) 100%
drf prediction progress: |███████████████████████████████████████████████████████| (done) 100%
    predict
1.31521e+09
1.31521e+09
1.31544e+09
1.31544e+09
1.31566e+09
1.31563e+09
1.31316e+09
1.31316e+09
1.31316e+09
1.31316e+09
[268956 rows x 1 column]



In [7]:
def avg_time_per_position(df):
    avg_time = [0] * df['position'].max() # Set initial values for avg_time list with size equal to maximum position
    n = 0 # Initialize a counter for the number of events
    for i in range(df.shape[0] - 1): # Loop through all events in the dataframe
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']: #Only continue if the next event is in the same case
            position = df.iloc[i]['position']
            t_finish_last = df.iloc[i]['timestamp_finish'] #Finish time of event i
            t_start_next_event = df.iloc[i + 1]['timestamp_finish'] #Start time of position i+1
            t_delta = t_start_next_event - t_finish_last #Time difference
            avg_time[position - 1] = (avg_time[position - 1] * n + t_delta) / (n+1) #update average time (Note the differences are in ms)
            n += 1
    return avg_time

def avg_time_per_position_added(df): #Fast version of time prediction
    df['next position'] = df.groupby('case concept:name')['position'].shift(-1) #Create column that contains the next even of each case
    df['Prediction time next event'] = df.apply(lambda x: 
        x['event time:timestamp'] + datetime.timedelta(seconds=avg_time[int(x['position']) - 1] / 1000)
        if not pd.isnull(x['next position']) else np.nan, axis=1) #Returns NaT if this position is the last one, otherwise it will add the time
    df.drop('next position', axis=1, inplace=True)
    return df


avg_time = avg_time_per_position(df_train)
df_train = avg_time_per_position_added(df_train)
df_train.head()

Unnamed: 0.1,Unnamed: 0,level_0,eventID,case concept:name,case REG_DATE,case AMOUNT_REQ,event org:resource,event concept:name,event lifecycle:transition,event time:timestamp,...,position,prev_event,2prev_event,next_event,prev_time,next_time,prev_timestamp,next_timestamp,timestamp_next,Prediction time next event
0,0,0,0,173688,2011-01-10 00:38:44.546,20000,112.0,A_SUBMITTED,COMPLETE,2011-01-10 00:38:44.546,...,1,FIRST EVENT,FIRST EVENT,A_PARTLYSUBMITTED,2011-01-10 00:38:44.546,2011-01-10 00:38:44.880,1294620000.0,1294620000.0,1294620000.0,2011-01-10 00:38:44.716516
1,1,1,1,173688,2011-01-10 00:38:44.546,20000,112.0,A_PARTLYSUBMITTED,COMPLETE,2011-01-10 00:38:44.880,...,2,A_SUBMITTED,FIRST EVENT,A_PREACCEPTED,2011-01-10 00:38:44.546,2011-01-10 00:39:37.906,1294620000.0,1294620000.0,1294620000.0,2011-01-10 00:39:40.837809
2,2,2,2,173688,2011-01-10 00:38:44.546,20000,112.0,A_PREACCEPTED,COMPLETE,2011-01-10 00:39:37.906,...,3,A_PARTLYSUBMITTED,A_SUBMITTED,W_Completeren aanvraag,2011-01-10 00:38:44.880,2011-01-10 00:39:38.875,1294620000.0,1294620000.0,1294620000.0,2011-01-10 00:40:41.659131
3,3,3,3,173688,2011-01-10 00:38:44.546,20000,112.0,W_Completeren aanvraag,SCHEDULE,2011-01-10 00:39:38.875,...,4,A_PREACCEPTED,A_PARTLYSUBMITTED,W_Completeren aanvraag,2011-01-10 00:39:37.906,2011-01-10 11:36:46.437,1294620000.0,1294659000.0,1294659000.0,2011-01-10 00:42:01.668581
4,4,4,4,173688,2011-01-10 00:38:44.546,20000,,W_Completeren aanvraag,START,2011-01-10 11:36:46.437,...,5,W_Completeren aanvraag,A_PREACCEPTED,A_ACCEPTED,2011-01-10 00:39:38.875,2011-01-10 11:42:43.308,1294620000.0,1294660000.0,1294660000.0,2011-01-10 11:36:51.449174


In [8]:
def calculate_mae(df):
    df['timestamp_prediction'] = df['Prediction time next event'].values.astype(np.int64) // 10 ** 9
    
    y_pred = []
    y_true = []
    for i in range(df.shape[0] - 1):
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']:
            y_pred.append(df.iloc[i]['timestamp_prediction'])
            y_true.append(df.iloc[i + 1]['timestamp_finish'])
    mae = mean_absolute_error(y_true, y_pred)
    return mae


mae = calculate_mae(df_train) 
print(mae) #Current MAE~4 days

358173.2267616702
