In [1]:
# !pip install h2o

In [8]:
import pandas as pd
import numpy as np
import datetime

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Naive estimator

In [6]:
def avg_time_per_position(df):
    avg_time = [0] * df['position'].max() # Set initial values for avg_time list with size equal to maximum position
    n = 0 # Initialize a counter for the number of events
    for i in range(df.shape[0] - 1): # Loop through all events in the dataframe
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']: #Only continue if the next event is in the same case
            position = df.iloc[i]['position']
            t_finish_last = df.iloc[i]['timestamp_finish'] #Finish time of event i
            t_start_next_event = df.iloc[i + 1]['timestamp_finish'] #Start time of position i+1
            t_delta = t_start_next_event - t_finish_last #Time difference
            avg_time[position - 1] = (avg_time[position - 1] * n + t_delta) / (n+1) #update average time (Note the differences are in ms)
            n += 1
    return avg_time

def avg_time_per_position_added(df): #Fast version of time prediction
    df['next position'] = df.groupby('case concept:name')['position'].shift(-1) #Create column that contains the next even of each case
    df['Prediction time next event'] = df.apply(lambda x: 
        x['event time:timestamp'] + datetime.timedelta(seconds=avg_time[int(x['position']) - 1] / 1000)
        if not pd.isnull(x['next position']) else np.nan, axis=1) #Returns NaT if this position is the last one, otherwise it will add the time
    df.drop('next position', axis=1, inplace=True)
    return df


avg_time = avg_time_per_position(df_test)
df_test = avg_time_per_position_added(df_test)


In [9]:
def calculate_metrics(df):
    df['timestamp_prediction'] = df['Prediction time next event'].values.astype(np.int64) // 10 ** 9
    
    y_pred = []
    y_true = []
    for i in range(df.shape[0] - 1):
        if df.iloc[i]['case concept:name'] == df.iloc[i + 1]['case concept:name']:
            y_pred.append(df.iloc[i]['timestamp_prediction'])
            y_true.append(df.iloc[i + 1]['timestamp_finish'])
            
    print('Mean Absolute Error:', round(mean_absolute_error(y_true, y_pred)/3600,3))
    print('Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_true, y_pred)/3600),3))
    print('R2 score:', round(r2_score(y_true, y_pred),3))
    
    
calculate_metrics(df_test) 


Mean Absolute Error: 157.549
Root Mean Squared Error: 49085.995
R2 score: 0.913
