In [1]:
import os
import wget
import argparse
import pm4py
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn import metrics 

from processtransformer import constants
from processtransformer.models import transformer
from processtransformer.data.loader import LogsDataLoader
from processtransformer.data.processor import LogsDataProcessor


In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
data_dir = "./datasets/"
if not os.path.exists(data_dir): 
  os.mkdir(data_dir)
_ = wget.download("https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9")


input_file_path= wget.download("https://data.4tu.nl/file/1987a2a6-9f5b-4b14-8d26-ab7056b17929/8b99119d-9525-452e-bc8f-236ac76fa9c9")
output_file_path='./datasets/BPIC13.csv'

#Write to Pandas Dataframe
log = pm4py.read_xes(input_file_path) #Input Filename
df = pm4py.convert_to_dataframe(log)
df.to_csv(output_file_path)

parsing log, completed traces ::   0%|          | 0/1487 [00:00<?, ?it/s]

In [4]:
df

Unnamed: 0,org:group,resource country,organization country,org:resource,organization involved,org:role,concept:name,impact,product,lifecycle:transition,time:timestamp,case:concept:name
0,Org line A2,INDIA,se,Minnie,J11 2nd,A2_2,Queued,High,PROD191,Awaiting Assignment,2006-01-11 14:49:42+00:00,1-109135791
1,Org line A2,INDIA,se,Minnie,J11 2nd,A2_2,Accepted,High,PROD191,In Progress,2012-03-15 10:53:52+00:00,1-109135791
2,Org line A2,INDIA,se,Minnie,J11 2nd,A2_2,Accepted,High,PROD191,Assigned,2012-03-15 10:56:17+00:00,1-109135791
3,Org line A2,INDIA,se,Minnie,J11 2nd,A2_2,Accepted,High,PROD191,In Progress,2012-03-15 11:09:05+00:00,1-109135791
4,Org line A2,INDIA,se,Minnie,J11 2nd,A2_2,Completed,High,PROD191,Closed,2012-03-15 11:11:33+00:00,1-109135791
...,...,...,...,...,...,...,...,...,...,...,...,...
6655,Org line C,Sweden,se,Karl,G161 2nd,E_7,Completed,Medium,PROD671,Closed,2012-05-30 13:04:30+00:00,1-752134249
6656,Org line A2,Sweden,se,Niklas,U6 2nd,A2_3,Accepted,Major,PROD831,In Progress,2012-05-31 09:58:45+00:00,1-752600115
6657,Org line A2,Sweden,se,Niklas,U6 2nd,A2_3,Completed,Major,PROD831,Closed,2012-05-31 10:01:24+00:00,1-752600115
6658,Org line G3,POLAND,us,Ewa,G199 3rd,,Accepted,High,PROD97,In Progress,2012-05-31 18:07:59+00:00,1-752835764


In [5]:
data_processor = LogsDataProcessor(name='BPIC13', filepath="datasets/BPIC13.csv",  
                                    columns = ["case:concept:name", "concept:name", "time:timestamp"], #specify the columns name containing case_id, activity name and timestamp 
                                    dir_path='datasets', pool = 4)
data_processor.process_logs(task=constants.Task.NEXT_ACTIVITY, sort_temporally= False)

In [6]:
# Load data
data_loader = LogsDataLoader(name = 'BPIC13')

(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_ACTIVITY)

# Prepare training examples for next activity prediction task
train_token_x, train_token_y = data_loader.prepare_data_next_activity(train_df, 
    x_word_dict, y_word_dict, max_case_length)

In [7]:
learning_rate = 0.001
batch_size = 12
epochs = 10

In [8]:
# Create and train a transformer model
transformer_model = transformer.get_next_activity_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size,
    output_dim=num_output)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

transformer_model.fit(train_token_x, train_token_y, 
    epochs=epochs, batch_size=batch_size)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x27ff2ea44c0>

In [9]:
# Evaluate over all the prefixes (k) and save the results
k, accuracies,fscores, precisions, recalls = [],[],[],[],[]
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_token_y = data_loader.prepare_data_next_activity(test_data_subset, 
            x_word_dict, y_word_dict, max_case_length)   
        y_pred = np.argmax(transformer_model.predict(test_token_x), axis=1)
        accuracy = metrics.accuracy_score(test_token_y, y_pred)
        precision, recall, fscore, _ = metrics.precision_recall_fscore_support(
            test_token_y, y_pred, average="weighted")
        k.append(i)
        accuracies.append(accuracy)
        fscores.append(fscore)
        precisions.append(precision)
        recalls.append(recall)

k.append(i + 1)
accuracies.append(np.mean(accuracy))
fscores.append(np.mean(fscores))
precisions.append(np.mean(precisions))
recalls.append(np.mean(recalls))



In [10]:
print('Average accuracy across all prefixes:', np.mean(accuracies))
print('Average f-score across all prefixes:', np.mean(fscores))
print('Average precision across all prefixes:', np.mean(precisions))
print('Average recall across all prefixes:', np.mean(recalls))

Average accuracy across all prefixes: 0.49384244883225253
Average f-score across all prefixes: 0.40164260847114513
Average precision across all prefixes: 0.3861076260796069
Average recall across all prefixes: 0.5212781404340443


In [11]:
data_processor = LogsDataProcessor(name='BPIC13', filepath="datasets/BPIC13.csv",  
                                    columns = ["case:concept:name", "concept:name", "time:timestamp"],  #specify the columns name containing case_id, activity name and timestamp 
                                    dir_path='datasets', pool = 4)
data_processor.process_logs(task=constants.Task.NEXT_TIME, sort_temporally= False)

In [12]:
# Load data
data_loader = LogsDataLoader(name = 'BPIC13')

(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.NEXT_TIME)

# Prepare training examples for next time prediction task
(train_token_x, train_time_x, train_token_y, time_scaler, y_scaler) = \
                                    data_loader.prepare_data_next_time(train_df, x_word_dict, max_case_length)

In [13]:
learning_rate = 0.001
batch_size = 12
epochs = 1

In [14]:
# Create and train a transformer model
transformer_model = transformer.get_next_time_model(
        max_case_length=max_case_length, 
        vocab_size=vocab_size)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
        loss=tf.keras.losses.LogCosh())

transformer_model.fit([train_token_x, train_time_x], train_token_y, 
        epochs=epochs, batch_size=batch_size, verbose=2)

466/466 - 20s - loss: 0.2309 - 20s/epoch - 43ms/step


<keras.callbacks.History at 0x27ff79cd730>

In [15]:
# Evaluate over all the prefixes (k) and save the results
k, maes, mses, rmses = [],[],[],[]
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_next_time(
            test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

        y_pred = transformer_model.predict([test_token_x, test_time_x])
        _test_y = y_scaler.inverse_transform(test_y)
        _y_pred = y_scaler.inverse_transform(y_pred)

        k.append(i)
        maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
        mses.append(metrics.mean_squared_error(_test_y, _y_pred))
        rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

k.append(i + 1)
maes.append(np.mean(maes))
mses.append(np.mean(mses))
rmses.append(np.mean(rmses))  



In [16]:
print('Average MAE across all prefixes:', np.mean(maes))
print('Average MSE across all prefixes:', np.mean(mses))
print('Average RMSE across all prefixes:', np.mean(rmses))

Average MAE across all prefixes: 19.971987
Average MSE across all prefixes: 435.67032
Average RMSE across all prefixes: 20.518627


In [17]:
data_processor = LogsDataProcessor(name='BPIC13', filepath="datasets/BPIC13.csv",  
                                    columns = ["case:concept:name", "concept:name", "time:timestamp"],  #specify the columns name containing case_id, activity name and timestamp 
                                    dir_path='datasets', pool = 4)
data_processor.process_logs(task=constants.Task.REMAINING_TIME, sort_temporally= False)

In [18]:
# Load data
data_loader = LogsDataLoader(name = 'BPIC13')

(train_df, test_df, x_word_dict, y_word_dict, max_case_length, 
    vocab_size, num_output) = data_loader.load_data(constants.Task.REMAINING_TIME)

# Prepare training examples for next time prediction task
(train_token_x, train_time_x, 
    train_token_y, time_scaler, y_scaler) = data_loader.prepare_data_remaining_time(train_df, 
    x_word_dict, max_case_length)

In [19]:
learning_rate = 0.001
batch_size = 12
epochs = 1

In [20]:
# Create and train a transformer model
transformer_model = transformer.get_remaining_time_model(
    max_case_length=max_case_length, 
    vocab_size=vocab_size)

transformer_model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate),
    loss=tf.keras.losses.LogCosh())

transformer_model.fit([train_token_x, train_time_x], train_token_y, 
        epochs=epochs, batch_size=batch_size, verbose=2)

466/466 - 22s - loss: 0.3121 - 22s/epoch - 46ms/step


<keras.callbacks.History at 0x27ff7dbd940>

In [21]:
# Evaluate over all the prefixes (k) and save the results
k, maes, mses, rmses = [],[],[],[]
for i in range(max_case_length):
    test_data_subset = test_df[test_df["k"]==i]
    if len(test_data_subset) > 0:
        test_token_x, test_time_x, test_y, _, _ = data_loader.prepare_data_remaining_time(
            test_data_subset, x_word_dict, max_case_length, time_scaler, y_scaler, False)   

        y_pred = transformer_model.predict([test_token_x, test_time_x])
        _test_y = y_scaler.inverse_transform(test_y)
        _y_pred = y_scaler.inverse_transform(y_pred)

        k.append(i)
        maes.append(metrics.mean_absolute_error(_test_y, _y_pred))
        mses.append(metrics.mean_squared_error(_test_y, _y_pred))
        rmses.append(np.sqrt(metrics.mean_squared_error(_test_y, _y_pred)))

k.append(i + 1)
maes.append(np.mean(maes))
mses.append(np.mean(mses))
rmses.append(np.mean(rmses))  



In [22]:
print('Average MAE across all prefixes:', np.mean(maes))
print('Average MSE across all prefixes:', np.mean(mses))
print('Average RMSE across all prefixes:', np.mean(rmses))

Average MAE across all prefixes: 117.24571
Average MSE across all prefixes: 14280.811
Average RMSE across all prefixes: 118.38883
