# Demo version of the process mining

### Library importing

In [4]:
import time
start_time = time.time()

In [5]:
from Split_functions import *

In [6]:
from hyperopt import hp, tpe, Trials, fmin, space_eval, STATUS_OK

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error

import pandas as pd
import numpy as np

import pm4py as pm4

import plotly.express as px
import pickle

### Data loading

In [7]:
df = pd.read_csv('cleaned_data.csv')
df = df[:10000]

### Feature engineering

#### For event prediction

In [11]:
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'], format = 'mixed')
df['hour'] = df['time:timestamp'].dt.hour
df['day'] = df['time:timestamp'].dt.day
df['month'] = df['time:timestamp'].dt.month
df['weekday'] = df['time:timestamp'].dt.strftime("%A")
df['is_holiday'] = 0

In [12]:
work_hours = df.groupby('hour').count()
work_hours['percentage'] = work_hours['concept:name'].apply(lambda x : x/sum(work_hours['concept:name'])*100)
work_hours_list = work_hours[work_hours['percentage']>1].reset_index()['hour'].to_list()

In [13]:
# Determining if it is the working hours or not
df['work_hour'] = df['hour'].apply(lambda x: 1 if x in(work_hours_list) else 0)

In [14]:
# Typical weekends
df.loc[(df['weekday'] == 'Sunday') | (df['weekday'] == 'Saturday'), 'is_holiday'] = 1

# New Year's Day
df.loc[(df['day'] == 1) & (df['month'] == 1), 'is_holiday'] = 1

# Christmas Day 
df.loc[((df['day'].isin([i for i in range(22, 27)]))) & (df['month'] == 1), 'is_holiday'] = 1

# Good Friday, Easter 
df.loc[(df['day'].isin([i for i in range(6,10)])) & (df['month'] == 4), 'is_holiday'] = 1

# King's day (27 April)
df.loc[(df['day'] == 27) & (df['month'] == 4), 'is_holiday'] = 1

# Liberation Day
df.loc[(df['day'] == 5) & (df['month'] == 5), 'is_holiday'] = 1

# Ascension Day 
df.loc[(df['day'].isin([i for i in range(17, 21)])) & (df['month'] == 5), 'is_holiday'] = 1

# Pentecost
df.loc[(df['day'].isin([i for i in range(26, 29)])) & (df['month'] == 5), 'is_holiday'] = 1

In [15]:
df['current_time_delta'] = df.groupby('case:concept:name')['time:timestamp'].diff(-1).dt.total_seconds().abs()
df['logged_current_time_delta'] = np.log(df['current_time_delta'] + 1)

In [16]:
df['next_activity_time'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(-1)
df['lag1'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(1)
df['lag2'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(2)
df['lag3'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(3)
df['lag4'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(4)
df['lag5'] = df.groupby('case:concept:name')['logged_current_time_delta'].shift(5)


df = df.fillna(1e-6)

df['next_activity'] = df.groupby('case:concept:name')['concept:name'].shift(-1)
df['previous_activity1'] = df.groupby('case:concept:name')['concept:name'].shift(1)
df['previous_activity2'] = df.groupby('case:concept:name')['concept:name'].shift(2)
df['previous_activity3'] = df.groupby('case:concept:name')['concept:name'].shift(3)
df['previous_activity4'] = df.groupby('case:concept:name')['concept:name'].shift(4)
df['previous_activity5'] = df.groupby('case:concept:name')['concept:name'].shift(5)


le = LabelEncoder()
df['next_activity_encoded'] = le.fit_transform(df['next_activity'])
df['current_activity_encoded'] = le.fit_transform(df['concept:name'])
df['previous_activity1_encoded'] = le.fit_transform(df['previous_activity1'])
df['previous_activity2_encoded'] = le.fit_transform(df['previous_activity2'])
df['previous_activity3_encoded'] = le.fit_transform(df['previous_activity3'])
df['previous_activity4_encoded'] = le.fit_transform(df['previous_activity4'])
df['previous_activity5_encoded'] = le.fit_transform(df['previous_activity5'])

In [17]:
predictor = df[['current_activity_encoded', 
                'previous_activity1_encoded', 
                'previous_activity2_encoded', 
                'previous_activity3_encoded',
                'previous_activity4_encoded',
                'previous_activity5_encoded',
                'case:concept:name', 
                'time:timestamp']]
target = df[['next_activity_encoded', 'case:concept:name', 'time:timestamp']]
train_size = 0.8

X, X_test, y, y_test = data_split(predictor, target, train_size)

print('+----------------------------------------------------------------+')
print('After cleaning traces!')
print('Training dataset max time:',X['time:timestamp'].max())
print('Testing dataset min time:', X_test['time:timestamp'].min())
print('+----------------------------------------------------------------+')

+----------------------------------------------------------------+
After cleaning traces!
Training dataset max time: 2011-10-10 12:23:58.549000+00:00
Testing dataset min time: 2011-10-10 12:25:15.419000+00:00
+----------------------------------------------------------------+


In [18]:
X_features = ['current_activity_encoded', 
       'previous_activity1_encoded', 
       'previous_activity2_encoded', 
       'previous_activity3_encoded',
       'previous_activity4_encoded',
       'previous_activity5_encoded',
       'case:concept:name']
y_features = ['next_activity_encoded',
              'case:concept:name']

In [19]:
X = X[X_features]
X_test = X_test[X_features]
y = y[y_features]
y_test = y_test[y_features]

In [20]:
X = X.reset_index(drop = True)
y = y.reset_index(drop = True)

In [21]:
# Define the search space for hyperparameters
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 5, stop = 100, num = 60)]),
    'max_depth': hp.choice('max_depth', [5, 6, 7, 9, 10, 12, 13, 15, 16, 17, 19, 20, 22, 23, 25]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [1, 2, 4, 6, 8])
}

# Initialize variables to store results
best_params_rc_list = []
best_scores_list = []

n = 10

start = 0
end = len(X)
step_size = end//n

train_start = 0 
train_end = end - step_size

test_start = train_end
test_end = end

for i in range(n):
    if train_start == test_start:
        train_x = X.loc[test_end+1:]
        train_y = y.loc[test_end+1:]

        test_x = X.loc[test_start:test_end]
        test_y = y.loc[test_start:test_end]

    else:
        if test_end + 1 >= len(X):
            train_x = X.loc[train_start:train_end-1]
            train_y = y.loc[train_start:train_end-1]
        else:
            train_x = pd.concat([X.loc[train_start:train_end-1], X.loc[test_end+1:]])
            train_y = pd.concat([y.loc[train_start:train_end-1], y.loc[test_end+1:]])

        test_x = X.loc[test_start:test_end]
        test_y = y.loc[test_start:test_end]
    
    overlapping_sets = list(set(train_x['case:concept:name'].unique()).intersection(set(test_x['case:concept:name'].unique())))
    # # Clean train
    X_train = train_x[train_x['case:concept:name'].isin([overlapping_sets]) == False]
    y_train = train_y[train_y['case:concept:name'].isin(train_x['case:concept:name'].unique())]
    
    # # Clean test
    X_validation = test_x[test_x['case:concept:name'].isin([overlapping_sets]) == False]
    y_validation = test_y[test_y['case:concept:name'].isin(test_x['case:concept:name'].unique())]

    # # Finalizing the data
    X_train = X_train[X_features[:-1]].values
    X_validation = X_validation[X_features[:-1]].values
    y_train = y_train[y_features[0]].values
    y_validation = y_validation[y_features[0]].values
    

    # Define a function to optimize using Hyperopt
    def objective(params):
        rfc = RandomForestClassifier(**params, n_jobs = -1)
        rfc.fit(X_train, np.ravel(y_train))
        score = rfc.score(X_validation, y_validation)
        return {'loss': -score, 'status': STATUS_OK}
    
    # Define Trials object to store optimization results
    trials = Trials()
    
    # Use Hyperopt to find the best hyperparameters
    best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials, return_argmin=False)
    
    # Store the best parameters and corresponding score
    best_params_rc_list.append(best)
    best_scores_list.append(-trials.best_trial['result']['loss'])  # Convert back to positive
    
    
    test_end = test_start
    train_end -= step_size
    test_start = train_end

#Print the best parameters and average score across all outer folds
print("Best Parameters:")
for params in best_params_rc_list:
    print(params)
print("Average Score:", np.mean(best_scores_list))

 10%|█         | 1/10 [00:00<00:00,  9.37trial/s, best loss: -0.8203592814371258]

100%|██████████| 10/10 [00:00<00:00, 10.93trial/s, best loss: -0.8203592814371258]
100%|██████████| 10/10 [00:00<00:00, 14.51trial/s, best loss: -0.7976190476190477]
100%|██████████| 10/10 [00:00<00:00, 12.72trial/s, best loss: -0.8095238095238095]
100%|██████████| 10/10 [00:00<00:00, 12.30trial/s, best loss: -0.7619047619047619]
100%|██████████| 10/10 [00:00<00:00, 13.82trial/s, best loss: -0.8333333333333334]
100%|██████████| 10/10 [00:00<00:00, 20.04trial/s, best loss: -0.7440476190476191]
100%|██████████| 10/10 [00:00<00:00, 17.60trial/s, best loss: -0.7321428571428571]
100%|██████████| 10/10 [00:00<00:00, 17.51trial/s, best loss: -0.6666666666666666]
100%|██████████| 10/10 [00:00<00:00, 15.80trial/s, best loss: -0.8333333333333334]
100%|██████████| 10/10 [00:00<00:00, 16.60trial/s, best loss: -0.6428571428571429]
Best Parameters:
{'max_depth': 22, 'min_samples_leaf': 8, 'min_samples_split': 2, 'n_estimators': 90}
{'max_depth': 16, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n

In [22]:
count = 1
for i in best_params_rc_list:
    model = RandomForestClassifier(**i)

    model.fit(X[X_features[:-1]].values, np.ravel(y[y_features[0]].values))
    print(model.score(X_test[X_features[:-1]].values, y_test[y_features[0]].values))
    
    pickle.dump(model , open(f'next_activity_prediction_rfc_{count}.pk1' , 'wb'))
    count+=1

0.3595
0.4955
0.5365
0.535
0.3755
0.537
0.458
0.5325
0.423
0.536


#### For time prediciton

In [23]:
predictor = df[['logged_current_time_delta', 
                'current_activity_encoded', 'previous_activity1_encoded', 'previous_activity2_encoded', 
                'previous_activity3_encoded', 'previous_activity4_encoded', 'previous_activity5_encoded',
                'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
                'work_hour', 'is_holiday', 'month', 'case:concept:name', 'time:timestamp']]
target = df[['next_activity_time', 'case:concept:name', 'time:timestamp']]
train_size = 0.8

X, X_test, y, y_test = data_split(predictor, target, train_size)

print('+----------------------------------------------------------------+')
print('After cleaning traces!')
print('Training dataset max time:',X['time:timestamp'].max())
print('Testing dataset min time:', X_test['time:timestamp'].min())
print('+----------------------------------------------------------------+')

+----------------------------------------------------------------+
After cleaning traces!
Training dataset max time: 2011-10-10 12:23:58.549000+00:00
Testing dataset min time: 2011-10-10 12:25:15.419000+00:00
+----------------------------------------------------------------+


In [24]:
X_features = ['logged_current_time_delta', 
                'current_activity_encoded', 'previous_activity1_encoded', 'previous_activity2_encoded', 
                'previous_activity3_encoded', 'previous_activity4_encoded', 'previous_activity5_encoded',
                'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
                'work_hour', 'is_holiday', 'month', 'case:concept:name']

y_features = ['next_activity_time', 'case:concept:name']

In [25]:
X_features_to_train = ['logged_current_time_delta', 
                'current_activity_encoded', 'previous_activity1_encoded', 'previous_activity2_encoded', 
                'previous_activity3_encoded', 'previous_activity4_encoded', 'previous_activity5_encoded',
                'lag1', 'lag2', 'lag3', 'lag4', 'lag5',
                'work_hour', 'is_holiday', 'month']

y_features_to_train = ['next_activity_time']

In [26]:
X = X[X_features]
X_test = X_test[X_features]
y = y[y_features]
y_test = y_test[y_features]

In [27]:
X = X.reset_index(drop = True)
y = y.reset_index(drop = True)

In [28]:
# Define the search space for hyperparameters
space = {
    'n_estimators': hp.choice('n_estimators', [int(x) for x in np.linspace(start = 5, stop = 100, num = 60)]),
    'max_depth': hp.choice('max_depth', [5, 6, 7, 9, 10, 12, 13, 15, 16, 17, 19, 20, 22, 23, 25]),
    'min_samples_split': hp.choice('min_samples_split', [2, 5, 10]),
    'min_samples_leaf': hp.choice('min_samples_leaf', [2, 4, 6, 8])
}

# Initialize variables to store results
best_params_rfr_list = []
best_scores_list = []

n = 10

start = 0
end = len(X)
step_size = end//n

train_start = 0 
train_end = end - step_size

test_start = train_end
test_end = end

for i in range(n):
    if train_start == test_start:
        train_x = X.loc[test_end+1:]
        train_y = y.loc[test_end+1:]

        test_x = X.loc[test_start:test_end]
        test_y = y.loc[test_start:test_end]

    else:
        if test_end + 1 >= len(X):
            train_x = X.loc[train_start:train_end-1]
            train_y = y.loc[train_start:train_end-1]
        else:
            train_x = pd.concat([X.loc[train_start:train_end-1], X.loc[test_end+1:]])
            train_y = pd.concat([y.loc[train_start:train_end-1], y.loc[test_end+1:]])

        test_x = X.loc[test_start:test_end]
        test_y = y.loc[test_start:test_end]
    
    overlapping_sets = list(set(train_x['case:concept:name'].unique()).intersection(set(test_x['case:concept:name'].unique())))
    # # Clean train
    X_train = train_x[train_x['case:concept:name'].isin([overlapping_sets]) == False]
    y_train = train_y[train_y['case:concept:name'].isin(train_x['case:concept:name'].unique())]
    
    # # Clean test
    X_validation = test_x[test_x['case:concept:name'].isin([overlapping_sets]) == False]
    y_validation = test_y[test_y['case:concept:name'].isin(test_x['case:concept:name'].unique())]

    # # Finalizing the data
    X_train = X_train[X_features[:-1]].values
    X_validation = X_validation[X_features[:-1]].values
    y_train = y_train[y_features[0]].values
    y_validation = y_validation[y_features[0]].values
    
    # Define a function to optimize using Hyperopt
    def objective(params):
        xgb = RandomForestRegressor(**params, n_jobs = -1)
        xgb.fit(X_train, np.ravel(y_train))
        score = xgb.score(X_validation, y_validation)
        return {'loss': -score, 'status': STATUS_OK}
    
    # Define Trials object to store optimization results
    trials = Trials()
    
    # Use Hyperopt to find the best hyperparameters
    best = fmin(objective, space, algo=tpe.suggest, max_evals=10, trials=trials, return_argmin=False)
    
    # Store the best parameters and corresponding score
    best_params_rfr_list.append(best)
    best_scores_list.append(-trials.best_trial['result']['loss'])  # Convert back to positive
    
    
    test_end = test_start
    train_end -= step_size
    test_start = train_end

#Print the best parameters and average score across all outer folds
print("Best Parameters:")
for params in best_params_rfr_list:
    print(params)
print("Average Score:", np.mean(best_scores_list))

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

100%|██████████| 10/10 [00:00<00:00, 12.65trial/s, best loss: -0.5692313549284785]
100%|██████████| 10/10 [00:00<00:00, 14.75trial/s, best loss: -0.49294145897107977]
100%|██████████| 10/10 [00:00<00:00, 17.50trial/s, best loss: -0.5762346617810418]
100%|██████████| 10/10 [00:00<00:00, 19.01trial/s, best loss: -0.5130537482788584]
100%|██████████| 10/10 [00:00<00:00, 13.88trial/s, best loss: -0.5037462478687923]
100%|██████████| 10/10 [00:00<00:00, 21.57trial/s, best loss: -0.43756360132395544]
100%|██████████| 10/10 [00:00<00:00, 12.17trial/s, best loss: -0.6680090863689387]
100%|██████████| 10/10 [00:00<00:00, 12.81trial/s, best loss: -0.551741835858802]
100%|██████████| 10/10 [00:00<00:00, 12.11trial/s, best loss: -0.565069222294253]
100%|██████████| 10/10 [00:00<00:00, 15.26trial/s, best loss: -0.3428733203584219]
Best Parameters:
{'max_depth': 10, 'min_samples_leaf': 6, 'min_samples_split': 10, 'n_estimators': 50}
{'max_depth': 6, 'min_samples_leaf': 6, 'min_samples_split': 5, 'n_

In [29]:
count = 1
for i in best_params_rfr_list:
    model = RandomForestRegressor(**i)

    model.fit(X[X_features_to_train].values, np.ravel(y[y_features_to_train].values))
    print(model.score(X_test[X_features_to_train].values, y_test[y_features_to_train].values))
    print(f'MAE: {round(mean_absolute_error(np.exp(y_test[y_features_to_train].values), np.exp(model.predict(X_test[X_features_to_train].values)))/3600,3)}')
    
    pickle.dump(model , open(f'next_activity_time_prediction_rfr_{count}.pk1' , 'wb'))
    count+=1

0.32694435922667764
MAE: 15.652
0.3359182016852431
MAE: 15.657
0.334214563762553
MAE: 15.652
0.34028505000245546
MAE: 15.649
0.3329213491728267
MAE: 15.655
0.31764739410506715
MAE: 15.653
0.326288543949686
MAE: 15.658
0.33392829916365896
MAE: 15.658
0.340935354425964
MAE: 15.655
0.3214849264830396
MAE: 15.657


## The whole trace prediction

In [30]:
predictor = df[['logged_current_time_delta', 
                'current_activity_encoded',
                'previous_activity1_encoded',
                'previous_activity2_encoded', 
                'previous_activity3_encoded', 
                'previous_activity4_encoded', 
                'previous_activity5_encoded',
                'lag1', 
                'lag2', 
                'lag3', 
                'lag4', 
                'lag5',
                'work_hour',
                'is_holiday', 
                'month',
                'case:concept:name', 
                'time:timestamp']]

target = df[['next_activity_encoded', 
             'next_activity_time',
             'case:concept:name', 
             'time:timestamp']]
train_size = 0.8

X, X_test, y, y_test = data_split(predictor, target, train_size)

In [31]:
X_features_time = ['case:concept:name',
                   'current_activity_encoded', 
                   'previous_activity1_encoded', 
                   'previous_activity2_encoded', 
                   'previous_activity3_encoded', 
                   'previous_activity4_encoded', 
                   'previous_activity5_encoded',
                   'logged_current_time_delta',
                   'lag1', 
                   'lag2', 
                   'lag3', 
                   'lag4', 
                   'lag5',
                   'work_hour', 
                   'is_holiday', 
                   'month']
y_features_time = ['case:concept:name', 'next_activity_encoded', 'next_activity_time']

In [32]:
X_features_event = ['case:concept:name', 
                    'current_activity_encoded', 
                    'previous_activity1_encoded', 
                    'previous_activity2_encoded', 
                    'previous_activity3_encoded',
                    'previous_activity4_encoded',
                    'previous_activity5_encoded']
y_features_event = ['case:concept:name', 'next_activity_encoded']

In [33]:
X = X[X_features_time]
X_test = X_test[X_features_time]
y = y[y_features_time]
y_test = y_test[y_features_time]

In [34]:
from Split_functions import import_models

In [35]:
event_models = import_models('next_activity_prediction_rfc', n)

In [36]:
time_models = import_models('next_activity_time_prediction_rfr',n)

In [37]:
models_event_scores = {}
models_time_scores = {}

In [38]:
models_event_scores = test_event_models_for_trace_pred(event_models, X_test, y_test, X_features_event, y_features_event)

Average predicted event sequence accuracy: 43.69%
Average predicted event sequence accuracy: 40.32%
Average predicted event sequence accuracy: 43.2%
Average predicted event sequence accuracy: 40.41%
Average predicted event sequence accuracy: 43.2%
Average predicted event sequence accuracy: 41.22%
Average predicted event sequence accuracy: 43.07%
Average predicted event sequence accuracy: 41.33%
Average predicted event sequence accuracy: 44.59%
Average predicted event sequence accuracy: 40.02%


In [39]:
sorted_event_models = sorted(models_event_scores.items(), key = lambda x: x[1], reverse = True)
model_event_short = {f'{sorted_event_models[0][0]}': event_models[sorted_event_models[0][0]]}

In [40]:
models_time_scores = test_suffix_with_time_pred(model_event_short, models_time_scores, time_models, event_models, X_test, y_test, X_features_event, X_features_time, y_features_event, y_features_time)

Model for event prediction: next_activity_prediction_rfc_9.pk1
Model for time predction: next_activity_time_prediction_rfr_1.pk1
Average predicted event sequence accuracy: 44.59%
Root mean squared error for time prediction: 33436.931 in seconds, 9.29 in hours
Model for time predction: next_activity_time_prediction_rfr_2.pk1
Average predicted event sequence accuracy: 44.59%
Root mean squared error for time prediction: 33434.821 in seconds, 9.29 in hours
Model for time predction: next_activity_time_prediction_rfr_3.pk1
Average predicted event sequence accuracy: 44.59%
Root mean squared error for time prediction: 33439.32 in seconds, 9.29 in hours
Model for time predction: next_activity_time_prediction_rfr_4.pk1
Average predicted event sequence accuracy: 44.59%
Root mean squared error for time prediction: 33455.664 in seconds, 9.29 in hours
Model for time predction: next_activity_time_prediction_rfr_5.pk1
Average predicted event sequence accuracy: 44.59%
Root mean squared error for time p

In [41]:
models_scores_event_df = pd.DataFrame.from_dict(models_event_scores, orient = 'index')

In [42]:
models_scores_time_df = pd.DataFrame.from_dict(models_time_scores, orient = 'index')

In [43]:
models_scores_event_df.reset_index(inplace = True)
models_scores_event_df.columns = ['model_name', 'accuracy']
models_scores_event_df['model_type'] = models_scores_event_df['model_name'].apply(lambda x: 'Random Forest' if x.startswith('next_activity_prediction_rfc') else 'XGBoost')

In [44]:
models_scores_time_df.reset_index(inplace = True)
models_scores_time_df.columns = ['model_name', 'MAE']


In [45]:
models_scores_time_df['MAE'] = models_scores_time_df['MAE'].apply(lambda x: x/3600)
models_scores_time_df['model_name'] = models_scores_time_df['model_name'].apply(lambda x: x.split('/')[1])

In [46]:
px.scatter(models_scores_event_df, x = 'model_name', y = 'accuracy', color = 'model_type',
           labels = {
                      'model': 'Trace prediciton Models',
                      'score': 'Accuracy (%)'},
                  title = 'Visualization of trace prediction models and their accuracy',
                  width=1000,
                  height=500)

In [47]:
px.scatter(models_scores_time_df, x = 'model_name', y = 'MAE',
           labels = {
                      'model_name': 'Time prediciton models',
                      'MAE': 'Mean Absolute Error (in hours)'},
                  title = 'Visualization of RMSE of time prediction models (based on different trace predcitions) in seconds',
                  width=1000,
                  height=500)

In [48]:
print("--- %s seconds ---" % (time.time() - start_time))

--- 88.13192582130432 seconds ---
