In [None]:
# pip install hypopt 

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from hypopt import GridSearch
from sklearn.linear_model import ElasticNet
import copy

In [2]:
train = pd.read_csv('preprocessed_train.csv')
test = pd.read_csv('preprocessed_test.csv')

# Removing the last event from each case to reduce noise in the models

In [3]:
train.drop(train.groupby('case concept:name').tail(1).index, axis=0, inplace=True)
test.drop(test.groupby('case concept:name').tail(1).index, axis=0, inplace=True)

# Evaluation


In [4]:
def time_evaluation(y_test, y_pred, model: str):
    print('\n')
    print(f"Error metrics (measured in hours) for the {model} when predicting the next event's Unix timestamp")
    
    print('Mean Absolute Error:', round(mean_absolute_error(y_test, y_pred)/3600,2))
    print('Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_test, y_pred)/3600),2))
#     print('R2 score:', round(r2_score(y_test, y_pred),2))
    
    
def event_evaluation(y_test, y_pred, model: str, avg="weighted"):

    precision = precision_score(y_test, y_pred, average=avg, zero_division=0)
    recall = recall_score(y_test, y_pred, average=avg, zero_division=0)
    F1_score = f1_score(y_test, y_pred, average=avg, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    print('\n')
    print(f'Error metrics for the {model} when predicting the next event')
    
    print(f'Accuracy: {round(accuracy,2)}')
    print(f'Precision: {round(precision,2)}')
    print(f'Recall: {round(recall,2)}')
    print(f'f1-score: {round(F1_score,2)}')
#     print(confusion_matrix(y_test, y_pred))

# Data splitting and encoding

In [5]:
train_objs_num = train.shape[0]

dataset = pd.concat(objs=[train, test], axis=0)

dataset = dataset.replace({'event lifecycle:transition': {'SCHEDULE': 0, 'START': 1, 'COMPLETE': 2}})
dataset = pd.get_dummies(dataset, prefix = ['current', 'prev', '2prev'], 
                           columns = ['event concept:name', 'prev_event', '2prev_event'])

train_OHE = copy.copy(dataset[:train_objs_num])
test_OHE = copy.copy(dataset[train_objs_num:])



val_OHE = train_OHE.loc[147054:]
train_OHE = train_OHE.loc[:147054]

first_val_event_timestamp = val_OHE['event time:timestamp'].min()
train_OHE = train_OHE.groupby('case concept:name').filter(lambda g: g['event time:timestamp'].min() < first_val_event_timestamp and g['event time:timestamp'].max() < first_val_event_timestamp)

first_test_event_timestamp = test_OHE['event time:timestamp'].min()
val_OHE = val_OHE.groupby('case concept:name').filter(lambda g: g['event time:timestamp'].min() < first_test_event_timestamp and g['event time:timestamp'].max() < first_test_event_timestamp)


In [6]:
train_OHE.shape, val_OHE.shape, test_OHE.shape

((114600, 95), (33794, 95), (38911, 95))

In [7]:
lst = []
for element in list(train_OHE.columns):
    if element not in list(test_OHE.columns):
        lst.append(element)
print('event type that happen in the train but not the test set: ' + str(lst))

event type that happen in the train but not the test set: []


In [8]:
features_time = ['timestamp_finish', 'seconds_prev', 'active cases', 'day_week', 'time_of_day', 
                 'case AMOUNT_REQ', 'event lifecycle:transition', 
                 'current_A_ACCEPTED', 'current_A_ACTIVATED', 'current_A_APPROVED', 'current_A_CANCELLED', 'current_A_DECLINED', 'current_A_FINALIZED', 'current_A_PARTLYSUBMITTED', 'current_A_PREACCEPTED', 'current_A_REGISTERED', 'current_A_SUBMITTED', 'current_O_ACCEPTED','current_O_CANCELLED', 'current_O_CREATED', 'current_O_DECLINED','current_O_SELECTED', 'current_O_SENT', 'current_O_SENT_BACK','current_W_Afhandelen leads', 'current_W_Beoordelen fraude','current_W_Completeren aanvraag','current_W_Nabellen incomplete dossiers', 'current_W_Nabellen offertes','current_W_Valideren aanvraag', 'current_W_Wijzigen contractgegevens']

# features_time_test = ['timestamp_finish', 'seconds_prev', 'active cases', 'day_week', 'time_of_day', 
#                  'case AMOUNT_REQ', 'event lifecycle:transition', 
#            'current_A_ACCEPTED', 'current_A_ACTIVATED', 'current_A_APPROVED','current_A_CANCELLED', 'current_A_DECLINED', 'current_A_FINALIZED','current_A_PARTLYSUBMITTED', 'current_A_PREACCEPTED','current_A_REGISTERED', 'current_A_SUBMITTED', 'current_O_ACCEPTED', 'current_O_CANCELLED', 'current_O_CREATED', 'current_O_DECLINED','current_O_SELECTED', 'current_O_SENT', 'current_O_SENT_BACK','current_W_Afhandelen leads', 'current_W_Beoordelen fraude','current_W_Completeren aanvraag','current_W_Nabellen incomplete dossiers', 'current_W_Nabellen offertes', 'current_W_Valideren aanvraag']
features_time_test = features_time.copy()

target_time = 'seconds_next'

features_event = ['active cases', 'day_week', 'time_of_day',  
                  'event lifecycle:transition', 'case AMOUNT_REQ', 
       'current_A_ACCEPTED', 'current_A_ACTIVATED', 'current_A_APPROVED','current_A_CANCELLED', 'current_A_DECLINED', 'current_A_FINALIZED','current_A_PARTLYSUBMITTED', 'current_A_PREACCEPTED','current_A_REGISTERED', 'current_A_SUBMITTED', 'current_O_ACCEPTED','current_O_CANCELLED', 'current_O_CREATED', 'current_O_DECLINED','current_O_SELECTED', 'current_O_SENT', 'current_O_SENT_BACK','current_W_Afhandelen leads', 'current_W_Beoordelen fraude','current_W_Completeren aanvraag','current_W_Nabellen incomplete dossiers', 'current_W_Nabellen offertes','current_W_Valideren aanvraag', 'current_W_Wijzigen contractgegevens','prev_A_ACCEPTED', 'prev_A_ACTIVATED', 'prev_A_APPROVED','prev_A_CANCELLED', 'prev_A_DECLINED', 'prev_A_FINALIZED','prev_A_PARTLYSUBMITTED', 'prev_A_PREACCEPTED', 'prev_A_REGISTERED','prev_A_SUBMITTED', 'prev_FIRST EVENT', 'prev_O_ACCEPTED', 'prev_O_CANCELLED', 'prev_O_CREATED', 'prev_O_DECLINED','prev_O_SELECTED', 'prev_O_SENT', 'prev_O_SENT_BACK','prev_W_Afhandelen leads', 'prev_W_Beoordelen fraude','prev_W_Completeren aanvraag', 'prev_W_Nabellen incomplete dossiers','prev_W_Nabellen offertes', 'prev_W_Valideren aanvraag','prev_W_Wijzigen contractgegevens', '2prev_A_ACCEPTED','2prev_A_ACTIVATED', '2prev_A_APPROVED', '2prev_A_CANCELLED','2prev_A_FINALIZED', '2prev_A_PARTLYSUBMITTED', '2prev_A_PREACCEPTED','2prev_A_REGISTERED', '2prev_A_SUBMITTED', '2prev_FIRST EVENT','2prev_O_ACCEPTED', '2prev_O_CANCELLED', '2prev_O_CREATED','2prev_O_SELECTED', '2prev_O_SENT', '2prev_O_SENT_BACK','2prev_W_Afhandelen leads', '2prev_W_Beoordelen fraude','2prev_W_Completeren aanvraag', '2prev_W_Nabellen incomplete dossiers','2prev_W_Nabellen offertes', '2prev_W_Valideren aanvraag','2prev_W_Wijzigen contractgegevens']

# features_event_test = ['active cases', 'day_week', 'time_of_day',  
#                   'event lifecycle:transition', 'case AMOUNT_REQ', 
#                        'current_A_ACCEPTED', 'current_A_ACTIVATED', 'current_A_APPROVED','current_A_CANCELLED', 'current_A_DECLINED', 'current_A_FINALIZED','current_A_PARTLYSUBMITTED', 'current_A_PREACCEPTED','current_A_REGISTERED', 'current_A_SUBMITTED', 'current_O_ACCEPTED','current_O_CANCELLED', 'current_O_CREATED', 'current_O_DECLINED','current_O_SELECTED', 'current_O_SENT', 'current_O_SENT_BACK','current_W_Afhandelen leads', 'current_W_Beoordelen fraude','current_W_Completeren aanvraag','current_W_Nabellen incomplete dossiers', 'current_W_Nabellen offertes','current_W_Valideren aanvraag','prev_A_ACCEPTED', 'prev_A_ACTIVATED', 'prev_A_APPROVED','prev_A_CANCELLED', 'prev_A_DECLINED', 'prev_A_FINALIZED','prev_A_PARTLYSUBMITTED', 'prev_A_PREACCEPTED', 'prev_A_REGISTERED','prev_A_SUBMITTED', 'prev_FIRST EVENT', 'prev_O_ACCEPTED','prev_O_CANCELLED', 'prev_O_CREATED', 'prev_O_DECLINED','prev_O_SELECTED', 'prev_O_SENT', 'prev_O_SENT_BACK','prev_W_Afhandelen leads', 'prev_W_Beoordelen fraude','prev_W_Completeren aanvraag', 'prev_W_Nabellen incomplete dossiers','prev_W_Nabellen offertes', 'prev_W_Valideren aanvraag','2prev_A_ACCEPTED', '2prev_A_ACTIVATED', '2prev_A_APPROVED','2prev_A_FINALIZED', '2prev_A_PARTLYSUBMITTED', '2prev_A_PREACCEPTED','2prev_A_REGISTERED', '2prev_A_SUBMITTED', '2prev_FIRST EVENT','2prev_O_ACCEPTED', '2prev_O_CANCELLED', '2prev_O_CREATED','2prev_O_SELECTED', '2prev_O_SENT', '2prev_O_SENT_BACK','2prev_W_Afhandelen leads', '2prev_W_Beoordelen fraude','2prev_W_Completeren aanvraag', '2prev_W_Nabellen incomplete dossiers','2prev_W_Nabellen offertes', '2prev_W_Valideren aanvraag']
features_event_test = features_event.copy()

target_event = 'next_event'


# TIME
X_train_time = train_OHE[features_time]
y_train_time = train_OHE[target_time]

X_val_time = val_OHE[features_time]
y_val_time = val_OHE[target_time]

X_test_time = test_OHE[features_time_test].copy()
y_test_time = test_OHE[target_time]


# EVENT
X_train_event = train_OHE[features_event]
y_train_event = train_OHE[target_event]

X_val_event = val_OHE[features_event]
y_val_event = val_OHE[target_event]

X_test_event = test_OHE[features_event_test].copy()
y_test_event = test_OHE[target_event]



# X_test_time['current_W_Wijzigen contractgegevens']=-1
# X_test_event[['current_W_Wijzigen contractgegevens', 'prev_W_Wijzigen contractgegevens', '2prev_A_CANCELLED', '2prev_W_Wijzigen contractgegevens']]=-1

In [9]:
X_train_event.shape, X_train_time.shape, X_test_event.shape, X_test_time.shape

((114600, 77), (114600, 31), (38911, 77), (38911, 31))

In [10]:
# ohe = OneHotEncoder().fit(train['event concept:name'].to_numpy().reshape(-1, 1))

# transformed = ohe.transform(train['event concept:name'].to_numpy().reshape(-1, 1))
# train_OHE = pd.DataFrame(transformed, columns=jobs_encoder.get_feature_names())

# Event prediction

In [11]:
def RandomForestEvents(X_train, y_train, X_val, y_val):
    
    
    params={'max_depth': [None],
     'n_estimators': [500, 800, 1200, 1800]}

    
    forest_clf = RandomForestClassifier(bootstrap=True, criterion='gini', max_features='sqrt', random_state=42)    
    
    grid = GridSearch(model=forest_clf, param_grid=params, parallelize=False)

    grid.fit(X_train, y_train, X_val, y_val)
    
    return grid.best_estimator_


In [12]:
rf_event = RandomForestEvents(X_train_event, y_train_event, X_val_event, y_val_event)

In [13]:
rf_event_train = rf_event.predict(X_train_event)
rf_event_val = rf_event.predict(X_val_event)
rf_event_test = rf_event.predict(X_test_event)

In [14]:
print(rf_event.get_params())

event_evaluation(y_train_event, rf_event_train, 'RF EVENT TRAIN')
event_evaluation(y_val_event, rf_event_val, 'RF EVENT VAL')
event_evaluation(y_test_event, rf_event_test, 'RF EVENT TEST')

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


Error metrics for the RF EVENT TRAIN when predicting the next event
Accuracy: 0.92
Precision: 0.92
Recall: 0.92
f1-score: 0.92


Error metrics for the RF EVENT VAL when predicting the next event
Accuracy: 0.79
Precision: 0.78
Recall: 0.79
f1-score: 0.78


Error metrics for the RF EVENT TEST when predicting the next event
Accuracy: 0.81
Precision: 0.8
Recall: 0.81
f1-score: 0.81


In [43]:
rf_event_features = dict(zip(features_event, rf_event.feature_importances_))

rf_event_features

# some categories of current, prev, 2prev event are very useful for next event -> current, prev, 2prev are good features

{'active cases': 0.00011947709507541567,
 'day_week': 0.024820746962539556,
 'time_of_day': 0.06013903563604539,
 'event lifecycle:transition': 0.04281649298793064,
 'case AMOUNT_REQ': 0.10432306916647882,
 'current_A_ACCEPTED': 0.011782490573607453,
 'current_A_ACTIVATED': 0.002367069304498333,
 'current_A_APPROVED': 0.0031558481783923696,
 'current_A_CANCELLED': 0.006086892423604019,
 'current_A_DECLINED': 0.008451545976532951,
 'current_A_FINALIZED': 0.007416561640944685,
 'current_A_PARTLYSUBMITTED': 0.007940892124245962,
 'current_A_PREACCEPTED': 0.02275893288682289,
 'current_A_REGISTERED': 0.002380425211632206,
 'current_A_SUBMITTED': 0.029118400629182254,
 'current_O_ACCEPTED': 0.003550584544147582,
 'current_O_CANCELLED': 0.006287051757252049,
 'current_O_CREATED': 0.02542212414435273,
 'current_O_DECLINED': 0.0024770008808909076,
 'current_O_SELECTED': 0.014389295749890492,
 'current_O_SENT': 0.01373686089232643,
 'current_O_SENT_BACK': 0.012462086406648578,
 'current_W_Afhan

# Time prediction

In [16]:
def RandomForestTime(X_train, y_train, X_val, y_val):
    
    
    params={'max_depth': [None],
     'n_estimators': [500, 800, 1200, 1800]}

    forest_reg = RandomForestRegressor(bootstrap = True, max_features='sqrt', random_state=42)
    
    grid = GridSearch(model=forest_reg, param_grid=params, parallelize=False)

    grid.fit(X_train, y_train, X_val, y_val)
    
    return grid.best_estimator_


In [17]:
rf_time = RandomForestTime(X_train_time, y_train_time, X_val_time, y_val_time)

In [18]:
rf_time_train = rf_time.predict(X_train_time)
rf_time_val = rf_time.predict(X_val_time)
rf_time_test = rf_time.predict(X_test_time)

In [19]:
print(rf_time.get_params())

time_evaluation(y_train_time, rf_time_train, 'RF TIME TRAIN')
time_evaluation(y_val_time, rf_time_val, 'RF TIME VAL')
time_evaluation(y_test_time, rf_time_test, 'RF TIME TEST')

{'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 50, 'n_jobs': None, 'oob_score': False, 'random_state': 0, 'verbose': 0, 'warm_start': False}


Error metrics (measured in hours) for the RF TIME TRAIN when predicting the next event's Unix timestamp
Mean Absolute Error: 3.25
Root Mean Squared Error: 709.76


Error metrics (measured in hours) for the RF TIME VAL when predicting the next event's Unix timestamp
Mean Absolute Error: 7.6
Root Mean Squared Error: 1748.99


Error metrics (measured in hours) for the RF TIME TEST when predicting the next event's Unix timestamp
Mean Absolute Error: 9.05
Root Mean Squared Error: 1897.07


In [45]:
rf_time_features = dict(zip(features_time, rf_time.feature_importances_))

rf_time_features

# current event, current and prev time, note that active cases is relatively unimportant here too, mention cyclical time too

{'timestamp_finish': 0.2243940517374157,
 'seconds_prev': 0.30303430245697843,
 'active cases': 0.00011317513440475242,
 'day_week': 0.04041619836782698,
 'time_of_day': 0.06835446834755127,
 'case AMOUNT_REQ': 0.12532327283483394,
 'event lifecycle:transition': 0.09193534964528795,
 'current_A_ACCEPTED': 0.002350422029149343,
 'current_A_ACTIVATED': 6.925558321589428e-05,
 'current_A_APPROVED': 9.573732137159489e-05,
 'current_A_CANCELLED': 0.0007559085204182952,
 'current_A_DECLINED': 0.002593765622419306,
 'current_A_FINALIZED': 0.0009758217528619943,
 'current_A_PARTLYSUBMITTED': 0.0031856179999272174,
 'current_A_PREACCEPTED': 0.003162301158400756,
 'current_A_REGISTERED': 0.00010216527914579641,
 'current_A_SUBMITTED': 0.0022362473858487304,
 'current_O_ACCEPTED': 0.0001472687827287863,
 'current_O_CANCELLED': 0.0007565113007666169,
 'current_O_CREATED': 0.0036181219705798727,
 'current_O_DECLINED': 6.972259651017639e-05,
 'current_O_SELECTED': 0.0015403150006980016,
 'current_O_

# Decision Tree event

In [21]:
def DecisionTreeEvent(X_train, y_train, X_val, y_val):


    params={'max_depth': [8, 16, 32, 64, 128, None]}

    tree_clf = DecisionTreeClassifier(random_state=42)
    
    grid = GridSearch(model=tree_clf, param_grid=params, parallelize=False)

    grid.fit(X_train, y_train, X_val, y_val)
    
    return grid.best_estimator_



In [22]:
dt_event = DecisionTreeEvent(X_train_event, y_train_event, X_val_event, y_val_event)

In [23]:
dt_event_train = dt_event.predict(X_train_event)
dt_event_val = dt_event.predict(X_val_event)
dt_event_test = dt_event.predict(X_test_event)

In [24]:
print(dt_event.get_params())

event_evaluation(y_train_event, dt_event_train, 'DT EVENT TRAIN')
event_evaluation(y_val_event, dt_event_val, 'DT EVENT VAL')
event_evaluation(y_test_event, dt_event_test, 'DT EVENT TEST')

{'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}


Error metrics for the DT EVENT TRAIN when predicting the next event
Accuracy: 0.92
Precision: 0.92
Recall: 0.92
f1-score: 0.92


Error metrics for the DT EVENT VAL when predicting the next event
Accuracy: 0.77
Precision: 0.78
Recall: 0.77
f1-score: 0.77


Error metrics for the DT EVENT TEST when predicting the next event
Accuracy: 0.79
Precision: 0.8
Recall: 0.79
f1-score: 0.8


In [46]:
dt_event_features = dict(zip(features_event, dt_event.feature_importances_))

dt_event_features

{'active cases': 0.00010871844903737872,
 'day_week': 0.03477333290806445,
 'time_of_day': 0.05001588810171122,
 'event lifecycle:transition': 0.031735095489763006,
 'case AMOUNT_REQ': 0.0747109996641813,
 'current_A_ACCEPTED': 0.003722387433179566,
 'current_A_ACTIVATED': 0.0028321635818129467,
 'current_A_APPROVED': 0.005153060018560649,
 'current_A_CANCELLED': 0.012628546753077013,
 'current_A_DECLINED': 0.009664028203280392,
 'current_A_FINALIZED': 2.0618446285064007e-05,
 'current_A_PARTLYSUBMITTED': 0.0,
 'current_A_PREACCEPTED': 0.031348989497806845,
 'current_A_REGISTERED': 0.0028197117630874803,
 'current_A_SUBMITTED': 0.07461430843886468,
 'current_O_ACCEPTED': 0.0057046432689246114,
 'current_O_CANCELLED': 0.010651903918637679,
 'current_O_CREATED': 0.035241001379647016,
 'current_O_DECLINED': 0.003864991157721397,
 'current_O_SELECTED': 0.02364483333639318,
 'current_O_SENT': 0.03067079519712621,
 'current_O_SENT_BACK': 0.026510106896761382,
 'current_W_Afhandelen leads': 1

In [26]:
dt_event.get_depth()

48

# Decision tree time

In [27]:
def DecisionTreeTime(X_train, y_train, X_val, y_val):

    params={'max_depth': [8, 16, 32, 64, 128, None]}

    tree_reg = DecisionTreeRegressor(random_state=42)
    
    grid = GridSearch(model=tree_reg, param_grid=params, parallelize=False)

    grid.fit(X_train, y_train, X_val, y_val)
    
    return grid.best_estimator_



In [28]:
dt_time = DecisionTreeTime(X_train_time, y_train_time, X_val_time, y_val_time)

In [29]:
dt_time_train = dt_time.predict(X_train_time)
dt_time_val = dt_time.predict(X_val_time)
dt_time_test = dt_time.predict(X_test_time)

In [30]:
print(dt_time.get_params())

time_evaluation(y_train_time, dt_time_train, 'DT TIME TRAIN')
time_evaluation(y_val_time, dt_time_val, 'DT TIME VAL')
time_evaluation(y_test_time, dt_time_test, 'DT TIME TEST')

{'ccp_alpha': 0.0, 'criterion': 'mse', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 0, 'splitter': 'best'}


Error metrics (measured in hours) for the DT TIME TRAIN when predicting the next event's Unix timestamp
Mean Absolute Error: 0.0
Root Mean Squared Error: 0.0


Error metrics (measured in hours) for the DT TIME VAL when predicting the next event's Unix timestamp
Mean Absolute Error: 8.24
Root Mean Squared Error: 2093.13


Error metrics (measured in hours) for the DT TIME TEST when predicting the next event's Unix timestamp
Mean Absolute Error: 10.24
Root Mean Squared Error: 2404.93


In [50]:
dt_time_features = dict(zip(features_time, dt_time.feature_importances_))

dt_time_features

# active cases is stronger here

{'timestamp_finish': 0.24003062575618003,
 'seconds_prev': 0.2706353103027644,
 'active cases': 2.716685486968852e-05,
 'day_week': 0.04584551769449599,
 'time_of_day': 0.06246247037062935,
 'case AMOUNT_REQ': 0.10685930044380547,
 'event lifecycle:transition': 0.19110169545905775,
 'current_A_ACCEPTED': 4.526081227121611e-08,
 'current_A_ACTIVATED': 9.910098331613159e-13,
 'current_A_APPROVED': 9.29382532881499e-10,
 'current_A_CANCELLED': 1.0483483238714222e-08,
 'current_A_DECLINED': 6.793258404359787e-12,
 'current_A_FINALIZED': 3.531226823559454e-12,
 'current_A_PARTLYSUBMITTED': 3.1007354571091215e-09,
 'current_A_PREACCEPTED': 1.2960836668159157e-13,
 'current_A_REGISTERED': 6.76732531746585e-13,
 'current_A_SUBMITTED': 9.690105958707387e-12,
 'current_O_ACCEPTED': 2.1430668258002979e-13,
 'current_O_CANCELLED': 1.07214095320976e-12,
 'current_O_CREATED': 1.8823487801523044e-16,
 'current_O_DECLINED': 4.950656997995056e-13,
 'current_O_SELECTED': 2.7691368619948167e-12,
 'curren

In [32]:
dt_time.get_depth()

53

## Time, Memory and CPU usage

### Memory and CPU usage

In [None]:
import psutil
import time

# start measuring CPU and memory usage
process = psutil.Process()

# start point
start_time = time.time()

forest_event, y_pred_event = RandomForestEvents(X_train, X_test, y_train, y_test)
event_evaluation(y_test['next_event'], y_pred_event, 'Random forest classifier')
# forest_time, y_pred_time = RandomForestTime(X_train, X_test, y_train, y_test)
# time_evaluation(y_test['next_timestamp'], y_pred_time, 'Random forest regressor')
# end point
end_time = time.time()

# calculate time taken
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# measure CPU and memory usage
cpu_usage = process.cpu_percent()
memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
print(f"CPU usage: {cpu_usage:.2f}%")
print(f"Memory usage: {memory_usage:.2f} MB")

### Time usage

In [None]:
start_time = time.time()

# RandomForestEvents function
forest_event, y_pred_event = RandomForestEvents(X_train, X_test, y_train, y_test)

# event_evaluation function
event_evaluation(y_test['next_event'], y_pred_event, 'Random forest classifier')

print(f"Time taken by RandomForestEvents: {time.time() - start_time} seconds")

start_time = time.time()

# RandomForestTime function
forest_time, y_pred_time = RandomForestTime(X_train, X_test, y_train, y_test)

# # time_evaluation function
# time_evaluation(y_test['next_timestamp'], y_pred_time, 'Random forest regressor')

print(f"Time taken by RandomForestTime: {time.time() - start_time} seconds")

# Visualizations

### Confusion matrix for Random Forest estimator

In [None]:
cm = confusion_matrix(y_test['next_event'], y_pred_event, normalize='true')

# plot confusion matrix as a heatmap
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10,8), facecolor='w')  # add facecolor attribute
sns.heatmap(cm, fmt='g', cmap='RdBu_r', ax=ax, annot_kws={"size": 14})
ax.set_xlabel('Predicted labels', fontsize=14)
ax.set_ylabel('True labels', fontsize=14)
ax.set_title('Confusion matrix for Random Forest estimator', fontsize=16)
plt.show()

### Naive and Random Forest estimators accuracy and precision

In [None]:
# plot the naive and random forest f1, acc, pre, recall
metrics_df = pd.DataFrame({
    ' ': ['F1 Score', 'Accuracy', 'Precision', 'Recall'],
    'Naive': ['0.266', '0.346', '0.297', '0.346'],
    'Random Forest': ['0.748', '0.785', '0.772', '0.785']    
})

metrics_df.to_string(index=False)
metrics_df

### Bar chart for feature importance of Random Forest event estimator

In [None]:
import matplotlib.pyplot as plt

# get feature importances and their names
importances = forest_event.feature_importances_
# feature_names = X_train.columns.values
feature_names = ['Current event', 'Previous event', '2nd previous event']

# sort the features by importance in descending order
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]

plt.figure(figsize=(10,6))
plt.bar(feature_names, sorted_importances, color=['brown', 'green', 'blue'])
plt.title('Feature Importances of Random Forest event estimator')
plt.xlabel('Features')
plt.xticks(rotation=0)
plt.ylabel('Importance')
plt.show()


### Bar chart feature importance Random Forest time estimator

In [None]:
# get feature importances and their names
importances = forest_time.feature_importances_
feature_names = ['Current event time','Prev event time', 'Prev event', '2nd prev event', 'Current event']

# sort the features by importance in descending order
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]

plt.figure(figsize=(10,6))
plt.bar(feature_names, sorted_importances, color=['brown', 'green', 'blue', 'yellow'])
plt.title('Feature Importances of Random Forest time estimator')
plt.xlabel('Features')
plt.xticks(rotation=0)
plt.ylabel('Importance')
plt.show()

In [None]:
residual = pd.concat([y_test, pd.Series(y_pred_time)], axis="columns", ignore_index=True)
residual.columns = ['none','real','pred']
residual['error']= residual['real']-residual['pred']

sns.residplot(x='real', y='error', data=residual)
