In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, OrdinalEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.linear_model import ElasticNet
from sklearn.datasets import make_regression

In [2]:
train = pd.read_csv('preprocessed_train.csv')
test = pd.read_csv('preprocessed_test.csv')

# remove last event for each case in order to not break the model when accessing the next event time + make predictions on seconds_next + do tuning (grid search, feature selection) and cv


# Removing the last event from each case to reduce noise in the models

In [3]:
last_events = train.groupby('case concept:name').tail(1)

train.drop(last_events.index, axis=0, inplace=True)

# Evaluation


In [4]:
def time_evaluation(y_test, y_pred, model: str):
 
    print(f"Error metrics (measured in hours) for the {model} when predicting the next event's Unix timestamp")
    print('\n')
    print('Mean Absolute Error:', round(mean_absolute_error(y_test, y_pred)/3600,3))
    print('Root Mean Squared Error:', round(np.sqrt(mean_squared_error(y_test, y_pred)/3600),3))
    print('R2 score:', round(r2_score(y_test, y_pred),3))
    
    
def event_evaluation(y_test, y_pred, model: str, avg="weighted"):

    precision = precision_score(y_test, y_pred, average=avg, zero_division=0)
    recall = recall_score(y_test, y_pred, average=avg, zero_division=0)
    F1_score = f1_score(y_test, y_pred, average=avg, zero_division=0)
    accuracy = accuracy_score(y_test, y_pred)
    
    print(f'Error metrics for the {model} when predicting the next event')
    print('\n')
    print(f'Accuracy: {round(accuracy,3)}.')
    print(f'Precision: {round(precision,3)}')
    print(f'Recall: {round(recall,3)}')
    print(f'f1-score: {round(F1_score,3)}')
#     print(confusion_matrix(y_test, y_pred))

In [29]:
train.columns

Index(['Unnamed: 0', 'index', 'eventID ', 'case concept:name', 'case REG_DATE',
       'case AMOUNT_REQ', 'event org:resource', 'event concept:name',
       'event lifecycle:transition', 'event time:timestamp', 'timestamp_start',
       'timestamp_finish', 'day_week', 'time_of_day', 'position', 'prev_event',
       '2prev_event', 'next_event', 'prev_time', 'next_time', 'seconds_next',
       'seconds_prev', 'position inverse', 'case start count',
       'case end count', 'active cases', 'next position'],
      dtype='object')

# Data splitting and encoding

In [None]:
train = pd.read_csv('preprocessed_train.csv')
test = pd.read_csv('preprocessed_test.csv')

df_train_LE = df_train.copy()
df_train_LE = df_train_LE.replace({'event lifecycle:transition': {'SCHEDULE': 0, 'START': 1, 'COMPLETE': 2}})

train_OHE = pd.get_dummies(df_train_LE, prefix=['type'], columns = ['event concept:name'])
test_OHE = pd.get_dummies(df_train_LE, prefix=['type'], columns = ['event concept:name'])

def make_val_set(dataframe):
    """make a validation set from the dataframe"""
    
    #set seed for reproducibility
    np.random.seed(69)
    
    #extract all unique case IDs
    unique_ids = dataframe['case concept:name'].unique()
    
    #select 10% of the unique IDs and use them to create a validation set
    samples = np.random.choice(unique_ids, size=int(len(unique_ids)*0.1), replace=False)
    val_set = dataframe[dataframe['case concept:name'].isin(samples)]
    
    train = dataframe[~dataframe['case concept:name'].isin(samples)]
    
    return val_set, train

val_OHE, train_OHE = make_val_set(train_OHE)

In [None]:
features_time = ['timestamp_finish', 'seconds_prev', 'active cases', 'day_week', 'time_of_day', 
                 'case AMOUNT_REQ', 'event lifecycle:transition', 
 'type_A_ACCEPTED', 'type_A_ACTIVATED', 'type_A_APPROVED',
 'type_A_CANCELLED', 'type_A_DECLINED', 'type_A_FINALIZED','type_A_PARTLYSUBMITTED', 'type_A_PREACCEPTED', 
 'type_A_REGISTERED','type_A_SUBMITTED', 'type_O_ACCEPTED', 'type_O_CANCELLED','type_O_CREATED', 'type_O_DECLINED', 
 'type_O_SELECTED', 'type_O_SENT','type_O_SENT_BACK', 'type_W_Afhandelen leads','type_W_Beoordelen fraude', 
 'type_W_Completeren aanvraag','type_W_Nabellen incomplete dossiers', 'type_W_Nabellen offertes',
 'type_W_Valideren aanvraag', 'type_W_Wijzigen contractgegevens']

target_time = 'seconds_next'

features_event = ['event concept:name','prev_event', '2prev_event', 'active cases', 'day_week', 'time_of_day',  
                  'event lifecycle:transition', 'case AMOUNT_REQ', 
 'type_A_ACCEPTED', 'type_A_ACTIVATED', 'type_A_APPROVED',
 'type_A_CANCELLED', 'type_A_DECLINED', 'type_A_FINALIZED','type_A_PARTLYSUBMITTED', 'type_A_PREACCEPTED', 
 'type_A_REGISTERED','type_A_SUBMITTED', 'type_O_ACCEPTED', 'type_O_CANCELLED','type_O_CREATED', 'type_O_DECLINED', 
 'type_O_SELECTED', 'type_O_SENT','type_O_SENT_BACK', 'type_W_Afhandelen leads','type_W_Beoordelen fraude', 
 'type_W_Completeren aanvraag','type_W_Nabellen incomplete dossiers', 'type_W_Nabellen offertes',
 'type_W_Valideren aanvraag', 'type_W_Wijzigen contractgegevens']

target_event = 'next_event'





X_train_time = train_OHE[features_time]
y_train_time = train_OHE[target_time]

X_val_time = val_OHE[features_time]
y_val_time = val_OHE[target_time]

X_test_time = test_OHE[features_time]
y_test_time = test_OHE[target_time]



X_train_event = train_OHE[features_event]
y_train_event = train_OHE[target_event]

X_val_event = val_OHE[features_event]
y_val_event = val_OHE[target_event]

X_test_event = test_OHE[features_event]
y_test_event = test_OHE[target_event]



# Event prediction

In [9]:
def RandomForestEvents(X_train, X_test, y_train, y_test):
    '''no grid search, no cv, or feature selection, do for final model
    '''
    
    X_train_event = X_train[['event concept:name','prev_event', '2prev_event']]
    y_train_event = y_train['next_event']

    X_test_event = X_test[['event concept:name','prev_event', '2prev_event']]
    y_test_event = y_test['next_event']
    
    forest_clf = RandomForestClassifier(n_estimators=10, bootstrap = False, criterion = 'gini', random_state=42)
    forest_clf.fit(X_train_event, y_train_event)
    
    y_pred = forest_clf.predict(X_test_event)

    return forest_clf, y_pred

In [10]:
forest_event, y_pred_event = RandomForestEvents(X_train, X_test, y_train, y_test)

In [11]:
event_evaluation(y_test['next_event'], y_pred_event, 'Random forest classifier')

Error metrics for the Random forest classifier when predicting the next event


Accuracy: 0.845.
Precision: 0.803
Recall: 0.845
f1-score: 0.818


In [12]:
forest_event.feature_importances_
# current + prev 2 events are all important

array([0.38658421, 0.32330209, 0.2901137 ])

# Time prediction

In [13]:
def RandomForestTime(X_train, X_test, y_train, y_test):
    '''no grid search, no cv, or feature selection, do for final model
    '''
    
    X_train_time = X_train[['event concept:name','prev_event', '2prev_event', 'seconds_prev', 'timestamp_finish']]
    y_train_time = y_train['seconds_next']



    X_test_time = X_test[['event concept:name','prev_event', '2prev_event', 'seconds_prev', 'timestamp_finish']]
    y_test_time = y_test['seconds_next']
    
    forest_reg = RandomForestRegressor(n_estimators=10, bootstrap = False, random_state=42)
    forest_reg.fit(X_train_time, y_train_time)
    
    y_pred = forest_reg.predict(X_test_time)

    return forest_reg, y_pred

In [14]:
forest_time, y_pred_time = RandomForestTime(X_train, X_test, y_train, y_test)

In [15]:
time_evaluation(y_test=st['seconds_next'], y_pred_time, 'Random forest regressor')

SyntaxError: positional argument follows keyword argument (3863106997.py, line 1)

In [None]:
forest_time.feature_importances_
# only the time of prev event seems to be important, great r2 score

# Decision tree event prediction

In [None]:
def DecisionTreeEvents(X_train, X_test, y_train, y_test):
    '''no grid search, no cv, or feature selection, do for final model
    '''
    
    X_train_event = X_train[['event concept:name','prev_event', '2prev_event']]
    y_train_event = y_train['next_event']

    X_test_event = X_test[['event concept:name','prev_event', '2prev_event']]
    y_test_event = y_test['next_event']
    
    dt_clf = DecisionTreeClassifier(random_state=42)
    dt_clf.fit(X_train_event, y_train_event)
    
    y_pred = dt_clf.predict(X_test_event)

    return dt_clf, y_pred

dt_event, y_pred_dt_event = DecisionTreeEvents(X_train, X_test, y_train, y_test)
event_evaluation(y_test['next_event'], y_pred_dt_event, 'Decision tree classifier')

# calculate feature importances
feature_importances_dt_event = dt_event.feature_importances_

feature_names_dt_event = ['event concept:name', 'prev_event', '2prev_event']
for feature_name_dt_event, importance_dt_event in zip(feature_names_dt_event, feature_importances_dt_event):
    print(f"{feature_name_dt_event}: {importance_dt_event:.4f}")

In [None]:
def feature_dt_event(importances_dt_event, feature_names_dt_event, title='Feature Importances DT event prediction'):
    """
    Plot a bar chart of feature importances for the decision tree model for event prediction.
    """

    plt.figure(figsize=(8, 6))
    plt.bar(feature_names_dt_event, importances_dt_event, color=['brown', 'green', 'blue', 'yellow'])
    plt.xlabel('Features')
    plt.ylabel('Importance')
    plt.title(title)
    plt.show()

# Plot the feature importances for the Decision Tree model
feature_dt_event(feature_importances_dt_event, feature_names_dt_event, title='Decision Tree Feature Importances')


In [None]:
cm_dt = confusion_matrix(y_test['next_event'], y_pred_dt_event, normalize='true')

# plot confusion matrix as a heatmap
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10, 8), facecolor='w')
sns.heatmap(cm_dt, fmt='.2f', cmap='RdBu_r', ax=ax, annot_kws={"size": 14}, annot=False)
ax.set_xlabel('Predicted labels', fontsize=14)
ax.set_ylabel('True labels', fontsize=14)
ax.set_title('Confusion matrix for Decision Tree estimator', fontsize=16)
plt.show()


# Decision Tree time prediction

In [None]:
from sklearn.tree import DecisionTreeRegressor

def DecisionTreeTime(X_train, X_test, y_train, y_test):
    '''no grid search, no cv, or feature selection, do for final model
    '''
    
    X_train_time = X_train[['event concept:name','prev_event', '2prev_event', 'seconds_prev', 'timestamp_finish']]
    y_train_time = y_train['seconds_next']

    X_test_time = X_test[['event concept:name','prev_event', '2prev_event', 'seconds_prev', 'timestamp_finish']]
    y_test_time = y_test['seconds_next']
    
    dt_reg = DecisionTreeRegressor(random_state=42)
    dt_reg.fit(X_train_time, y_train_time)
    
    y_pred = dt_reg.predict(X_test_time)

    return dt_reg, y_pred

dt_time, y_pred_dt_time = DecisionTreeTime(X_train, X_test, y_train, y_test)
time_evaluation(y_test['seconds_next'], y_pred_dt_time, 'Decision tree regressor')


In [None]:
# get feature importances and their names
feature_importances_dt_time = dt_time.feature_importances_
feature_names_dt_time = ['seconds_prev', 'timestamp_finish', 'prev_event', 'event concept:name', '2prev_event']

# sort the features by importance in descending order
inx = np.argsort(feature_importances_dt_time)[::-1]
sorted_imp_dt_time = feature_importances_dt_time[inx]

plt.figure(figsize=(10, 8))
plt.bar(feature_names_dt_time, sorted_imp_dt_time, color=['brown', 'green', 'blue', 'yellow'])
plt.xticks(rotation=10)
plt.title('Feature Importances of Decision tree time estimator')
plt.xlabel('Features')
plt.ylabel('Importance')
plt.show()

## Time, Memory and CPU usage

### Memory and CPU usage

In [None]:
import psutil
import time

# start measuring CPU and memory usage
process = psutil.Process()

# start point
start_time = time.time()

forest_event, y_pred_event = RandomForestEvents(X_train, X_test, y_train, y_test)
event_evaluation(y_test['next_event'], y_pred_event, 'Random forest classifier')
# forest_time, y_pred_time = RandomForestTime(X_train, X_test, y_train, y_test)
# time_evaluation(y_test['next_timestamp'], y_pred_time, 'Random forest regressor')
# end point
end_time = time.time()

# calculate time taken
elapsed_time = end_time - start_time
print(f"Time taken: {elapsed_time:.2f} seconds")

# measure CPU and memory usage
cpu_usage = process.cpu_percent()
memory_usage = process.memory_info().rss / 1024 / 1024  # in MB
print(f"CPU usage: {cpu_usage:.2f}%")
print(f"Memory usage: {memory_usage:.2f} MB")

### Time usage

In [None]:
start_time = time.time()

# RandomForestEvents function
forest_event, y_pred_event = RandomForestEvents(X_train, X_test, y_train, y_test)

# event_evaluation function
event_evaluation(y_test['next_event'], y_pred_event, 'Random forest classifier')

print(f"Time taken by RandomForestEvents: {time.time() - start_time} seconds")

start_time = time.time()

# RandomForestTime function
forest_time, y_pred_time = RandomForestTime(X_train, X_test, y_train, y_test)

# # time_evaluation function
# time_evaluation(y_test['next_timestamp'], y_pred_time, 'Random forest regressor')

print(f"Time taken by RandomForestTime: {time.time() - start_time} seconds")

# Visualizations

### Confusion matrix for Random Forest estimator

In [None]:
cm = confusion_matrix(y_test['next_event'], y_pred_event, normalize='true')

# plot confusion matrix as a heatmap
sns.set(font_scale=1.2)
fig, ax = plt.subplots(figsize=(10,8), facecolor='w')  # add facecolor attribute
sns.heatmap(cm, fmt='g', cmap='RdBu_r', ax=ax, annot_kws={"size": 14})
ax.set_xlabel('Predicted labels', fontsize=14)
ax.set_ylabel('True labels', fontsize=14)
ax.set_title('Confusion matrix for Random Forest estimator', fontsize=16)
plt.show()

### Naive and Random Forest estimators accuracy and precision

In [None]:
# plot the naive and random forest f1, acc, pre, recall
metrics_df = pd.DataFrame({
    ' ': ['F1 Score', 'Accuracy', 'Precision', 'Recall'],
    'Naive': ['0.266', '0.346', '0.297', '0.346'],
    'Random Forest': ['0.748', '0.785', '0.772', '0.785']    
})

metrics_df.to_string(index=False)
metrics_df

### Bar chart for feature importance of Random Forest event estimator

In [None]:
import matplotlib.pyplot as plt

# get feature importances and their names
importances = forest_event.feature_importances_
# feature_names = X_train.columns.values
feature_names = ['Current event', 'Previous event', '2nd previous event']

# sort the features by importance in descending order
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]

plt.figure(figsize=(10,6))
plt.bar(feature_names, sorted_importances, color=['brown', 'green', 'blue'])
plt.title('Feature Importances of Random Forest event estimator')
plt.xlabel('Features')
plt.xticks(rotation=0)
plt.ylabel('Importance')
plt.show()


### Bar chart feature importance Random Forest time estimator

In [None]:
# get feature importances and their names
importances = forest_time.feature_importances_
feature_names = ['Current event time','Prev event time', 'Prev event', '2nd prev event', 'Current event']

# sort the features by importance in descending order
indices = np.argsort(importances)[::-1]
sorted_importances = importances[indices]

plt.figure(figsize=(10,6))
plt.bar(feature_names, sorted_importances, color=['brown', 'green', 'blue', 'yellow'])
plt.title('Feature Importances of Random Forest time estimator')
plt.xlabel('Features')
plt.xticks(rotation=0)
plt.ylabel('Importance')
plt.show()

In [None]:
residual = pd.concat([y_test, pd.Series(y_pred_time)], axis="columns", ignore_index=True)
residual.columns = ['none','real','pred']
residual['error']= residual['real']-residual['pred']

sns.residplot(x='real', y='error', data=residual)
