# Whole event prediction

In [2]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error
from Split_functions import data_split
import plotly.express as px

import pandas as pd
import numpy as np

import pm4py as pm4

import pickle

In [3]:
# Since the model was already working, you could try to do the model based on first 2,3,4,5,6,7,8,9,10 values to predict all the values
# Becuase around 47% of events length are less than 10

In [4]:
df = pd.read_csv('cleaned_data.csv')
df.head()

Unnamed: 0,concept:name,time:timestamp,case:concept:name
0,A_SUBMITTED,2011-10-01 00:38:44.546000+00:00,173688
1,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880000+00:00,173688
2,A_PREACCEPTED,2011-10-01 00:39:37.906000+00:00,173688
3,A_ACCEPTED,2011-10-01 11:42:43.308000+00:00,173688
4,O_SELECTED,2011-10-01 11:45:09.243000+00:00,173688


## SOME DATA PREPARATIOIN 

In [4]:
clown = df.groupby('case:concept:name').count().sort_values(by = 'concept:name')
needed_ids = clown[clown['concept:name'] <= 82].reset_index()['case:concept:name'].unique()
df = df[df['case:concept:name'].isin(needed_ids)]

### For time

In [5]:
df['hour'] = df['time:timestamp'].dt.hour
df['day'] = df['time:timestamp'].dt.day
df['month'] = df['time:timestamp'].dt.month
df['weekday'] = df['time:timestamp'].dt.strftime("%A")
df['is_holiday'] = 0

In [6]:
work_hours = df.groupby('hour').count()
work_hours['percentage'] = work_hours['concept:name'].apply(lambda x : x/sum(work_hours['concept:name'])*100)
work_hours_list = work_hours[work_hours['percentage']>1].reset_index()['hour'].to_list()

In [7]:
# Determining if it is the working hours or not
df['work_hour'] = df['hour'].apply(lambda x: 1 if x in(work_hours_list) else 0)

In [8]:
# Typical weekends
df.loc[(df['weekday'] == 'Sunday') | (df['weekday'] == 'Saturday'), 'is_holiday'] = 1

# New Year's Day
df.loc[(df['day'] == 1) & (df['month'] == 1), 'is_holiday'] = 1

# Christmas Day 
df.loc[((df['day'].isin([i for i in range(22, 27)]))) & (df['month'] == 1), 'is_holiday'] = 1

# Good Friday, Easter 
df.loc[(df['day'].isin([i for i in range(6,10)])) & (df['month'] == 4), 'is_holiday'] = 1

# King's day (27 April)
df.loc[(df['day'] == 27) & (df['month'] == 4), 'is_holiday'] = 1

# Liberation Day
df.loc[(df['day'] == 5) & (df['month'] == 5), 'is_holiday'] = 1

# Ascension Day 
df.loc[(df['day'].isin([i for i in range(17, 21)])) & (df['month'] == 5), 'is_holiday'] = 1

# Pentecost
df.loc[(df['day'].isin([i for i in range(26, 29)])) & (df['month'] == 5), 'is_holiday'] = 1

In [9]:
df['current_time_delta'] = df.groupby('case:concept:name')['time:timestamp'].diff(-1).dt.total_seconds().abs()
df['logged_current_time_delta'] = np.log(df['current_time_delta'] + 1)

In [10]:
df['next_activity_time'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(-1)
df['lag1'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(1)
df['lag2'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(2)
df['lag3'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(3)
df['lag4'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(4)
df['lag5'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(5)

In [11]:
df = df.fillna(1e-6)

### For event

In [12]:
df['next_activity'] = df.groupby('case:concept:name')['concept:name'].shift(-1)
df['previous_activity1'] = df.groupby('case:concept:name')['concept:name'].shift(1)
df['previous_activity2'] = df.groupby('case:concept:name')['concept:name'].shift(2)
df['previous_activity3'] = df.groupby('case:concept:name')['concept:name'].shift(3)
df['previous_activity4'] = df.groupby('case:concept:name')['concept:name'].shift(4)
df['previous_activity5'] = df.groupby('case:concept:name')['concept:name'].shift(5)

In [13]:
le = LabelEncoder()
df['current_activity_encoded'] = le.fit_transform(df['concept:name'])
df['next_activity_encoded'] = le.fit_transform(df['next_activity'])
df['previous_activity1_encoded'] = le.fit_transform(df['previous_activity1'])
df['previous_activity2_encoded'] = le.fit_transform(df['previous_activity2'])
df['previous_activity3_encoded'] = le.fit_transform(df['previous_activity3'])
df['previous_activity4_encoded'] = le.fit_transform(df['previous_activity4'])
df['previous_activity5_encoded'] = le.fit_transform(df['previous_activity5'])

In [14]:
# delete this code
df['Start_from_A'] = df['concept:name'].apply(lambda x: 1 if x.split()[0][0] == 'A' else 0)
df['Start_from_O'] = df['concept:name'].apply(lambda x: 1 if x.split()[0][0] == 'O' else 0)
df['Start_from_W'] = df['concept:name'].apply(lambda x: 1 if x.split()[0][0] == 'W' else 0)
df['position'] = df.groupby('case:concept:name').cumcount()+1
df = pd.get_dummies(df, columns=['position'], dtype = int)
df = df.drop(['org:resource', 'concept:name', 'lifecycle:transition', 'case:REG_DATE', 'case:AMOUNT_REQ', 'position_82'], axis = 1)
to_use = df.drop(['next_activity', 'previous_activity1', 'previous_activity2', 'previous_activity3', 'previous_activity4', 'previous_activity5'], axis = 1)

predictor_columns = to_use.drop('next_activity_encoded', axis = 1).columns
train_columns = to_use.drop(['case:concept:name','time:timestamp','next_activity_encoded', "hour", 'day', 'month','weekday', 'is_holiday', 'work_hour',
'current_time_delta','logged_current_time_delta','next_activity_time','lag1','lag2','lag3','lag4','lag5'], axis = 1).columns

In [15]:
for i in range(25):
    print(train_columns[i])

current_activity_encoded
previous_activity1_encoded
previous_activity2_encoded
previous_activity3_encoded
previous_activity4_encoded
previous_activity5_encoded
Start_from_A
Start_from_O
Start_from_W
position_1
position_2
position_3
position_4
position_5
position_6
position_7
position_8
position_9
position_10
position_11
position_12
position_13
position_14
position_15
position_16


In [16]:
predictor = df[['logged_current_time_delta', 
                'current_activity_encoded',
                'previous_activity1_encoded',
                'previous_activity2_encoded', 
                'previous_activity3_encoded', 
                'previous_activity4_encoded', 
                'previous_activity5_encoded',
                'lag1', 
                'lag2', 
                'lag3', 
                'lag4', 
                'lag5',
                'work_hour',
                'is_holiday', 
                'month',
                'case:concept:name', 
                'time:timestamp']]
target = df[['next_activity_encoded', 
             'next_activity_time',
             'case:concept:name', 
             'time:timestamp']]
train_size = 0.8

X, X_test, y, y_test = data_split(predictor, target, train_size)

In [17]:
X_features_time = ['case:concept:name',
                   'current_activity_encoded', 
                   'previous_activity1_encoded', 
                   'previous_activity2_encoded', 
                   'previous_activity3_encoded', 
                   'previous_activity4_encoded', 
                   'previous_activity5_encoded',
                   'logged_current_time_delta',
                   'lag1', 
                   'lag2', 
                   'lag3', 
                   'lag4', 
                   'lag5',
                   'work_hour', 
                   'is_holiday', 
                   'month']
y_features_time = ['case:concept:name', 'next_activity_encoded', 'next_activity_time']

In [18]:
X_features_event = ['case:concept:name', 
                    'current_activity_encoded', 
                    'previous_activity1_encoded', 
                    'previous_activity2_encoded', 
                    'previous_activity3_encoded',
                    'previous_activity4_encoded',
                    'previous_activity5_encoded']
y_features_event = ['case:concept:name', 'next_activity_encoded']

In [19]:
X = X[X_features_time]
X_test = X_test[X_features_time]
y = y[y_features_time]
y_test = y_test[y_features_time]

In [20]:
def predict_sequence(X, model_time = None, model_event = None, features_time= None, features_event = None):
    predicted_next_sequence = {}
    predicted_time_delta = {}

    if model_time != None or features_time != None:
        for trace in X[features_event[0]].unique():
            check = X[X[features_time[0]] == trace] # contains all features and rows of particular trace
            predicted_trace = [i for i in check[features_time[1:7]].values[0]] # contains only event values
            predicted_time = [i for i in check[features_time[7:-3]].values[0]] # contains only time values

            for _ in range(len(check)):
                input_event = predicted_trace[:len(features_event)-1]
                hour_month_holiday = [i for i in check[features_time[-3:]].values[_]]
                input_time = predicted_time[:len(features_event)-1]

                temp = np.reshape(np.array(input_event + input_time + hour_month_holiday), (1, len(features_time)-1))
                predicted_time.insert(0, model_time.predict(temp)[0])

                input_event = np.reshape(np.array(predicted_trace[:len(features_event)-1]), (1, len(features_event)-1))
                predicted_trace.insert(0, model_event.predict(input_event)[0])
            
            predicted_time_delta[trace] = predicted_time[:len(check)][::-1]
            predicted_next_sequence[trace] = predicted_trace[:len(check)][::-1]
        return predicted_next_sequence, predicted_time_delta
    
    elif model_time == None:
        for trace in X[features_event[0]].unique():
            check = X[X[features_event[0]] == trace] # contains all features and rows of particular trace
            predicted_trace = [i for i in check[features_event[1:]].values[0]]
            for _ in range(len(check)):

                input_event = np.reshape(np.array(predicted_trace[:len(features_event)-1]), (1, len(features_event)-1))
                predicted_trace.insert(0, model_event.predict(input_event)[0])
                
            predicted_next_sequence[trace] = predicted_trace[:len(check)][::-1]
        return predicted_next_sequence

In [21]:
def evaluate_trace_prediction(y, predicted_next_sequence = None, predicted_time_delta= None ,features_time= None, features_event= None):
    accuracy_scores = []
    mean_absolute_errors = []
    if predicted_time_delta != None and features_time != None:
        for trace in y[features_event[0]].unique():
            true = y[y[features_event[0]] == trace][features_event[-1]].to_list()
            predicted = predicted_next_sequence[trace]
            correct = 0
            for i in range(len(true)):
                if true[i] == predicted[i]:
                    correct = correct + 1

            MAE = round(mean_absolute_error(np.exp(y[y['case:concept:name'] == trace][features_time[-1]]), np.exp(predicted_time_delta[trace])),3)

            accuracy_scores.append(correct/len(true))
            mean_absolute_errors.append(MAE)

        return round(np.mean(accuracy_scores)*100, 2), round(np.mean(mean_absolute_errors),3)
    
    elif predicted_time_delta == None:
        for trace in y[features_event[0]].unique():
            true = y[y[features_event[0]] == trace][features_event[-1]].to_list()
            predicted = predicted_next_sequence[trace]
            correct = 0
            for i in range(len(true)):
                if true[i] == predicted[i]:
                    correct = correct + 1

            accuracy_scores.append(correct/len(true))

        return round(np.mean(accuracy_scores)*100, 2)

# Parameters

### For event prediction

In [22]:
event_models = {}

In [23]:
# delete this code
for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_prediction_new_rfc_{i}.pk1', 'rb'))
    event_models[f'next_activity_prediction_new_rfc_{i}'] = loaded_model

for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_prediction_new_xgbc_{i}.pk1', 'rb'))
    event_models[f'next_activity_prediction_new_xgbc_{i}'] = loaded_model

In [24]:
for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_prediction_rfc_{i}.pk1', 'rb'))
    event_models[f'next_activity_prediction_rfc_{i}'] = loaded_model

In [25]:
for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_prediction_xgbc_{i}.pk1', 'rb'))
    event_models[f'next_activity_prediction_xgbc_{i}'] = loaded_model

### For time prediction

In [26]:
time_models = {}

In [27]:
for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_time_prediction_rfr_{i}.pk1', 'rb'))
    time_models[f'next_activity_time_prediction_rfr_{i}'] = loaded_model

In [28]:
for i in range(1,11):
    loaded_model = pickle.load(open(f'next_activity_time_prediction_xgbr_{i}.pk1', 'rb'))
    time_models[f'next_activity_time_prediction_xgbr_{i}'] = loaded_model

# Whole Trace prediciton

In [29]:
# Delete this code
predictor = df[predictor_columns]
target = df[['next_activity_encoded', 
             'case:concept:name', 
             'time:timestamp']]
train_size = 0.8

X, X_test, y, y_test = data_split(predictor, target, train_size)

X_features_event = train_columns.insert(0, 'case:concept:name')
y_features_event = ['case:concept:name', 'next_activity_encoded']

In [30]:
X_test[X_features_event]

Unnamed: 0,case:concept:name,current_activity_encoded,previous_activity1_encoded,previous_activity2_encoded,previous_activity3_encoded,previous_activity4_encoded,previous_activity5_encoded,Start_from_A,Start_from_O,Start_from_W,...,position_72,position_73,position_74,position_75,position_76,position_77,position_78,position_79,position_80,position_81
211304,205909,9,24,24,22,22,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0
211305,205909,6,9,24,22,22,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0
211306,205909,17,6,9,22,22,22,0,0,1,...,0,0,0,0,0,0,0,0,0,0
211307,205909,17,17,6,8,22,22,0,0,1,...,0,0,0,0,0,0,0,0,0,0
211308,205909,4,17,17,5,8,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
262195,214376,6,9,24,22,22,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0
262196,214376,17,6,9,22,22,22,0,0,1,...,0,0,0,0,0,0,0,0,0,0
262197,214376,17,17,6,8,22,22,0,0,1,...,0,0,0,0,0,0,0,0,0,0
262198,214376,4,17,17,5,8,22,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
models_event_scores = {}
models_time_scores = {}

In [32]:
for model_event_name in event_models:
    
    predicted_next_sequence = predict_sequence(X = X_test, model_event = event_models[model_event_name], features_event = X_features_event)
    accuracy = evaluate_trace_prediction(y = y_test, predicted_next_sequence = predicted_next_sequence, features_event = y_features_event)
    print(f'Average predicted event sequence accuracy: {accuracy}%')
    models_event_scores[f'{model_event_name}'] = accuracy

ValueError: X has 90 features, but RandomForestClassifier is expecting 222 features as input.

# Choosing the best trace prediction model to predict time on it

In [None]:
sorted_event_models = sorted(models_event_scores.items(), key = lambda x: x[1], reverse = True)
model_event_short = {f'{sorted_event_models[0][0]}': event_models[sorted_event_models[0][0]]}

In [None]:
for model_event_name in model_event_short:
    print(f"Model for event prediction: {model_event_name}")
    for model_time_name in time_models:
        print(f'Model for time predction: {model_time_name}')
        predicted_next_sequence, predicted_time_delta = predict_sequence(X_test, 
                                                                         time_models[model_time_name], 
                                                                         event_models[model_event_name],
                                                                         X_features_time, 
                                                                         X_features_event)
        accuracy, rmse = evaluate_trace_prediction(y_test, 
                                                   predicted_next_sequence, 
                                                   predicted_time_delta, 
                                                   y_features_time, 
                                                   y_features_event)
        print(f'Average predicted event sequence accuracy: {accuracy}%')
        print(f'Root mean squared error for time prediction: {rmse} in seconds, {round(rmse/3600,2)} in hours')
        models_event_scores[f'{model_event_name}'] = accuracy
        models_time_scores[f'{model_event_name}/{model_time_name}'] = rmse

Model for event prediction: next_activity_prediction_rfc_1
Model for time predction: next_activity_time_prediction_rfr_1
Average predicted event sequence accuracy: 32.87%
Root mean squared error for time prediction: 20923.73 in seconds, 5.81 in hours
Model for time predction: next_activity_time_prediction_rfr_2
Average predicted event sequence accuracy: 32.87%
Root mean squared error for time prediction: 20965.424 in seconds, 5.82 in hours
Model for time predction: next_activity_time_prediction_rfr_3
Average predicted event sequence accuracy: 32.87%
Root mean squared error for time prediction: 21070.106 in seconds, 5.85 in hours
Model for time predction: next_activity_time_prediction_rfr_4
Average predicted event sequence accuracy: 32.87%
Root mean squared error for time prediction: 21017.154 in seconds, 5.84 in hours
Model for time predction: next_activity_time_prediction_rfr_5
Average predicted event sequence accuracy: 32.87%
Root mean squared error for time prediction: 20940.315 in 

In [None]:
models_scores_event_df = pd.DataFrame.from_dict(models_event_scores, orient = 'index')

In [None]:
models_scores_time_df = pd.DataFrame.from_dict(models_time_scores, orient = 'index')

In [None]:
models_scores_event_df.reset_index(inplace = True)
models_scores_event_df.columns = ['model_name', 'accuracy']
models_scores_event_df['model_type'] = models_scores_event_df['model_name'].apply(lambda x: 'Random Forest' if x.startswith('next_activity_prediction_rfc') else 'XGBoost')

In [None]:
models_scores_time_df.reset_index(inplace = True)
models_scores_time_df.columns = ['model_name', 'MAE']


ValueError: Length mismatch: Expected axis has 4 elements, new values have 2 elements

In [None]:
models_scores_time_df['MAE'] = models_scores_time_df['MAE'].apply(lambda x: x/3600)
models_scores_time_df['model_name'] = models_scores_time_df['model_name'].apply(lambda x: x.split('/')[1])

In [None]:
px.scatter(models_scores_event_df, x = 'model_name', y = 'accuracy', color = 'model_type',
           labels = {
                      'model': 'Trace prediciton Models',
                      'score': 'Accuracy (%)'},
                  title = 'Visualization of trace prediction models and their accuracy',
                  width=1000,
                  height=500)

In [None]:
models_scores_time_df['model_type'] = models_scores_time_df['model_name'].apply(lambda x: 'Random Forest' if 'rfr' in x else 'XGBoost')

In [None]:
models_scores_time_df

Unnamed: 0,model_name,MAE,model_type
0,next_activity_time_prediction_rfr_1,5.812147,Random Forest
1,next_activity_time_prediction_rfr_2,5.823729,Random Forest
2,next_activity_time_prediction_rfr_3,5.852807,Random Forest
3,next_activity_time_prediction_rfr_4,5.838098,Random Forest
4,next_activity_time_prediction_rfr_5,5.816754,Random Forest
5,next_activity_time_prediction_rfr_6,5.832971,Random Forest
6,next_activity_time_prediction_rfr_7,5.850502,Random Forest
7,next_activity_time_prediction_rfr_8,5.830078,Random Forest
8,next_activity_time_prediction_rfr_9,5.8348,Random Forest
9,next_activity_time_prediction_rfr_10,5.824994,Random Forest


In [None]:
px.scatter(models_scores_time_df, x = 'model_name', y = 'MAE', color = 'model_type',
           labels = {
                      'model_name': 'Time prediciton models',
                      'MAE': 'Mean Absolute Error (in hours)'},
                  title = 'Visualization of MAE of time prediction models in hours (higher means worse)',
                  width=1000,
                  height=500)