In [31]:
import glob
import io
import logging
import warnings

import pandas as pd
from pandas.errors import ParserWarning

from definitions import RESULTS_DIR

In [32]:
# Layer-freezing combinations of the pre-trained models. The position in this
# list is used to label the sections of a pre-trained training statistics file.
LAYER_FREEZING_COMBINATIONS = [
    ['0', '1', '2', '3'],
    ['0', '1', '2'],
    ['1', '2', '3'],
    ['0', '2', '3'],
    ['0', '1', '3'],
    ['0', '1'],
    ['0'],
]

# Every event log for which training statistics are collected.
ALL_LOGS = [
    'BPI_Challenge_2013_closed_problems.xes.gz',
    'BPI_Challenge_2012.xes.gz',
    'BPI_Challenge_2013_incidents.xes.gz',
    'BPI_Challenge_2013_open_problems.xes.gz',
    'BPI%20Challenge%202017.xes.gz',
    'BPIC15_1.xes',
    'Road_Traffic_Fine_Management_Process.xes.gz',
    'Sepsis%20Cases%20-%20Event%20Log.xes.gz',
    'helpdesk.csv',
]

# Presumably the logs the pre-trained models were originally trained on
# (settings 'RTFMP' and 'helpdesk') -- TODO confirm.
BASE_LOGS = [
    'Road_Traffic_Fine_Management_Process.xes.gz',
    'helpdesk.csv',
]

# Model architectures; the names double as sub-folder names of the results directory.
ARCHITECTURES = ['rnn', 'GPT']

# Training

## Collection of the training statistics
Because training the models takes a long time, we did not train all of them in a single run. The training statistics of each run are stored in `*training_figures_<timestamp>.csv` files in subfolders of `results/rnn` and `results/GPT`. The following code collects the training statistics of the models that have been trained. 

In [33]:
# Timestamps of all training runs. Each timestamp is part of the file name of
# the training statistics written by that run.

# Base models (no pre-training)
rnn_base_training_timestamps = ['202312090718', '202312120826']
gpt_base_training_timestamps = ['202312131309', '202312121708']

# RNN models in the pre-trained settings 'RTFMP' and 'helpdesk'
rnn_pre_trained_RTFMP_timestamps = ['202312151300', '202312161146']
rnn_pre_trained_helpdesk_timestamps = ['202312161442']

# GPT models in the pre-trained settings 'RTFMP' and 'helpdesk'
gpt_pre_trained_RTFMP_timestamps = ['202312171427', '202312171342']
gpt_pre_trained_helpdesk_timestamps = ['202312181009', '202312182038']

In [34]:
# Next we define a function that returns the path to the training statistics file for a given model and log based on the timestamps.
# The function raises an error if no file is found or multiple files are found.
def get_training_stat_file(architecture, log, training_timestamps, results_dir=None):
    """Return the path of the single training statistics file of a model.

    For every timestamp, files matching
    ``<results_dir>/<architecture>/*/*training_figures_<timestamp>.csv`` are
    collected. Of those, only the files whose path contains ``log`` are kept.

    Parameters
    ----------
    architecture : str
        Name of the architecture sub-folder below the results directory.
    log : str
        Name of the event log. It is matched as a plain substring of the full
        file path, so the results directory itself should not contain it.
    training_timestamps : iterable of str
        Timestamps of the training runs to search.
    results_dir : str, optional
        Directory to search. Defaults to ``RESULTS_DIR``.

    Returns
    -------
    str
        Path of the one matching training statistics file.

    Raises
    ------
    ValueError
        If no file or more than one file matches.
    """
    if results_dir is None:
        results_dir = RESULTS_DIR

    # Materialise once so the timestamps can be searched AND reported in errors.
    training_timestamps = list(training_timestamps)

    candidate_files = []
    for training_timestamp in training_timestamps:
        candidate_files += glob.glob(
            f'{results_dir}/{architecture}/*/*training_figures_{training_timestamp}.csv'
        )

    matching_files = [f for f in candidate_files if log in f]

    if len(matching_files) == 0:
        # Report every timestamp that was searched, not just the last one.
        raise ValueError(
            f'No training stat file found for {architecture} and {log} '
            f'for timestamps {training_timestamps}'
        )
    elif len(matching_files) > 1:
        raise ValueError(
            f'Multiple training stat files found for {architecture} and {log} '
            f'for timestamps {training_timestamps}: {matching_files}'
        )
    else:
        return matching_files[0]

In [39]:
# Build the lookup training_stat_files[log][architecture][setting] -> path of the
# training statistics file, for all models and logs that have been trained.

# One entry per pre-trained setting:
# (setting, log that is skipped for this setting, rnn timestamps, GPT timestamps)
pre_trained_settings = (
    ('RTFMP', 'Road_Traffic_Fine_Management_Process.xes.gz',
     rnn_pre_trained_RTFMP_timestamps, gpt_pre_trained_RTFMP_timestamps),
    ('helpdesk', 'helpdesk.csv',
     rnn_pre_trained_helpdesk_timestamps, gpt_pre_trained_helpdesk_timestamps),
)

training_stat_files = {}
for log in ALL_LOGS:
    files_by_architecture = {'rnn': {}, 'GPT': {}}
    training_stat_files[log] = files_by_architecture

    # Base models exist for every log and both architectures.
    files_by_architecture['rnn']['base'] = get_training_stat_file('rnn', log, rnn_base_training_timestamps)
    files_by_architecture['GPT']['base'] = get_training_stat_file('GPT', log, gpt_base_training_timestamps)

    # Only the base statistics are collected for the BPI 2017 log.
    if log == 'BPI%20Challenge%202017.xes.gz':
        continue

    # No GPT pre-trained statistics are collected for the BPI 2012 log.
    collect_gpt_pre_trained = log != 'BPI_Challenge_2012.xes.gz'

    for setting, skipped_log, rnn_timestamps, gpt_timestamps in pre_trained_settings:
        if log == skipped_log:
            continue
        files_by_architecture['rnn'][setting] = get_training_stat_file('rnn', log, rnn_timestamps)
        if collect_gpt_pre_trained:
            files_by_architecture['GPT'][setting] = get_training_stat_file('GPT', log, gpt_timestamps)

## Extraction of the training results
In the following we extract the training results from the training statistics files. The results are stored in a pandas DataFrame.
The available information included in the training statistics files is:
- `epoch`: The epoch number
- `elapsed_seconds`: The time in seconds training took for this epoch
- `training_loss_activity`: The training loss for the activity prediction
- `training_loss_time`: The training loss for the time prediction
- `training_loss`: The training loss for the combined activity and time prediction
- `validation_loss_activity`: The validation loss for the activity prediction
- `validation_loss_time`: The validation loss for the time prediction
- `validation_loss`: The validation loss for the combined activity and time prediction

We combine this information with the log, the architecture, the setting (base, RTFMP or helpdesk) and, for the pre-trained models only, the frozen layers (column `freezed_layers`) in the DataFrame.

In [43]:
# First we define a function that extracts the training results from a training statistics file
# (`ParserWarning` is imported together with the other imports at the top of the notebook.)


# Extract interesting information from training stat files
def extract_training_results(training_stats_file, pre_trained):
    """Load the training statistics of one model from a CSV file.

    Statistics files of pre-trained models contain several CSV sections in a
    row, one per layer freezing combination, each starting with its own header
    line (recognised by the word ``epoch``). Files of base models contain a
    single CSV table.

    Parameters
    ----------
    training_stats_file : str
        Path of the training statistics file.
    pre_trained : bool
        Whether the file belongs to a pre-trained model (multi-section file).

    Returns
    -------
    pandas.DataFrame or dict[int, pandas.DataFrame]
        For base models the single table. For pre-trained models a dict that
        maps the 0-based position of each section in the file to its table.
    """
    # The files contain columns that do not have a header. This leads to a warning when loading the file.
    # We ignore this warning.
    with warnings.catch_warnings():
        warnings.simplefilter(action='ignore', category=ParserWarning)

        if not pre_trained:
            return pd.read_csv(training_stats_file, index_col=False)

        # Load the file once and determine the line number of each section header.
        with open(training_stats_file, 'r') as f:
            lines = f.readlines()
        header_line_numbers = [i for i, line in enumerate(lines) if 'epoch' in line]

        # 7 is the number of layer freezing combinations; fewer or more
        # sections hint at an interrupted or unexpected training run.
        if len(header_line_numbers) != 7:
            logging.warning(f'{training_stats_file} may be incomplete!')

        # Every section ends where the next header starts; the last one at EOF.
        section_ends = header_line_numbers[1:] + [len(lines)]

        results = {}
        for i, (start, end) in enumerate(zip(header_line_numbers, section_ends)):
            # Parse exactly the lines of this section. Deriving `nrows` from
            # line numbers is wrong when blank lines are present, because
            # read_csv skips them and would read on into the next header.
            section_text = ''.join(lines[start:end])
            results[i] = pd.read_csv(io.StringIO(section_text), index_col=False)

    return results

In [44]:
# Walk over every collected statistics file and gather one DataFrame per training run,
# labelled with its log, architecture and setting (plus the freezed layers for pre-trained models).
training_result_dfs = []
for log in ALL_LOGS:
    for architecture in ARCHITECTURES:
        for setting, stats_file in training_stat_files[log][architecture].items():
            is_pre_trained = setting != 'base'
            extracted = extract_training_results(stats_file, is_pre_trained)
            run_labels = {'log': log, 'architecture': architecture, 'setting': setting}

            if not is_pre_trained:
                # Base models: a single table and no 'freezed_layers' column.
                training_result_dfs.append(extracted.assign(**run_labels))
                continue

            # Pre-trained models: one table per section; the section position
            # selects the layer freezing combination.
            for section_index, section_df in extracted.items():
                freezed_layers = str(LAYER_FREEZING_COMBINATIONS[section_index])
                training_result_dfs.append(
                    section_df.assign(**run_labels, freezed_layers=freezed_layers)
                )



In [47]:
# Stack the per-run tables into one long table with a fresh 0..n-1 index.
training_results_df = pd.concat(objs=training_result_dfs, axis=0, ignore_index=True)

# Persist the combined table; the index is written as the first, unnamed column.
training_results_df.to_csv(path_or_buf='all_training_results.csv')

training_results_df

Unnamed: 0,datetime,epoch,training_loss_activity,training_loss_time,training_loss,validation_loss_activity,validation_loss_time,validation_loss,elapsed_seconds,log,architecture,setting,freezed_layers
0,202312090718,0,1.9248,0.0013,1.9260,1.8914,0.0011,1.8925,0.565,BPI_Challenge_2013_closed_problems.xes.gz,rnn,base,
1,202312090718,1,1.8428,0.0014,1.8442,1.7063,0.0020,1.7083,0.212,BPI_Challenge_2013_closed_problems.xes.gz,rnn,base,
2,202312090718,2,1.6531,0.0018,1.6549,1.5158,0.0012,1.5170,0.208,BPI_Challenge_2013_closed_problems.xes.gz,rnn,base,
3,202312090718,3,1.5426,0.0020,1.5446,1.4599,0.0012,1.4611,0.205,BPI_Challenge_2013_closed_problems.xes.gz,rnn,base,
4,202312090718,4,1.5979,0.0016,1.5995,1.4324,0.0010,1.4335,0.210,BPI_Challenge_2013_closed_problems.xes.gz,rnn,base,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
39498,202312171427,195,0.6147,0.0042,0.6188,0.1525,0.0032,0.1557,3.698,helpdesk.csv,GPT,RTFMP,['0']
39499,202312171427,196,0.6054,0.0041,0.6095,0.1525,0.0032,0.1557,3.699,helpdesk.csv,GPT,RTFMP,['0']
39500,202312171427,197,0.6082,0.0041,0.6123,0.1523,0.0032,0.1555,3.664,helpdesk.csv,GPT,RTFMP,['0']
39501,202312171427,198,0.6092,0.0041,0.6133,0.1521,0.0032,0.1553,3.671,helpdesk.csv,GPT,RTFMP,['0']


In [46]:
# One group per training run. dropna=False keeps the base models, whose
# 'freezed_layers' value is missing. last() takes the last non-null value of
# every column within each group.
run_keys = ['log', 'architecture', 'setting', 'freezed_layers']
grouped = training_results_df.groupby(by=run_keys, dropna=False).last()
grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,datetime,epoch,training_loss_activity,training_loss_time,training_loss,validation_loss_activity,validation_loss_time,validation_loss,elapsed_seconds
log,architecture,setting,freezed_layers,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
BPI%20Challenge%202017.xes.gz,GPT,base,,202312131309,399,0.7520,0.0001,0.7521,0.0767,0.0001,0.0767,171.848
BPI%20Challenge%202017.xes.gz,rnn,base,,202312090718,200,0.9627,0.0003,0.9630,0.3848,0.0002,0.3850,196.965
BPIC15_1.xes,GPT,RTFMP,"['0', '1', '2', '3']",202312171427,199,2.9385,0.0053,2.9438,2.0594,0.0005,2.0599,0.857
BPIC15_1.xes,GPT,RTFMP,"['0', '1', '2']",202312171427,199,2.9993,0.0050,3.0043,2.0760,0.0007,2.0767,0.856
BPIC15_1.xes,GPT,RTFMP,"['0', '1', '3']",202312171427,199,2.9502,0.0080,2.9582,2.0833,0.0007,2.0840,0.855
...,...,...,...,...,...,...,...,...,...,...,...,...
helpdesk.csv,rnn,RTFMP,"['0', '1']",202312161146,199,0.7651,0.0079,0.7731,0.6038,0.0053,0.6091,0.186
helpdesk.csv,rnn,RTFMP,"['0', '2', '3']",202312161146,199,0.8036,0.0078,0.8115,0.5924,0.0049,0.5973,0.178
helpdesk.csv,rnn,RTFMP,['0'],202312161146,199,0.8090,0.0075,0.8165,0.5972,0.0054,0.6026,0.185
helpdesk.csv,rnn,RTFMP,"['1', '2', '3']",202312161146,199,0.8265,0.0079,0.8344,0.6241,0.0047,0.6289,0.186
