In [160]:
# list the folders in the results directory
import os
import pandas as pd
import re

# Root folder holding one sub-directory per miner, each with a results.csv.
RESULTS_DIR = 'RealLifeLogs/results_final_final'

# Every sub-directory name is treated as a miner name.
miners = [f.name for f in os.scandir(RESULTS_DIR) if f.is_dir()]

# Collect one frame per miner and concatenate once after the loop; calling
# pd.concat inside the loop re-copies all previously loaded rows each time.
miner_frames = []

for miner in miners:
  try:
    df = pd.read_csv(os.path.join(RESULTS_DIR, miner, 'results.csv'))
    df['miner'] = miner
    # Strip the '<miner>_' prefix and the '.xes.pnml' suffix from the model name.
    df['log_seed'] = df['model'].apply(lambda x: x.replace(f'{miner}_', '').replace('.xes.pnml', ''))
    # Drop the last '_'-separated token (presumably the seed -- TODO confirm).
    df['log'] = df['log_seed'].apply(lambda x: '_'.join(x.split('_')[:-1]))
    miner_frames.append(df)
  except Exception as err:
    # Still best-effort (a broken miner folder is skipped), but report why it
    # failed and let KeyboardInterrupt / SystemExit propagate.
    print(f"Error reading {miner}: {err!r}")

res_df = pd.concat(miner_frames) if miner_frames else pd.DataFrame()

# Print the unique miners
print(res_df['miner'].unique())
# Print the number of entries per miner
print(res_df['miner'].value_counts())

print(res_df.head())

['Heuristic_Miner', 'OptIMIISt_filter_new', 'ILP_Miner_0', 'IMf', 'AIM']
['Heuristic_Miner' 'OptIMIISt_filter_new' 'ILP_Miner_0' 'IMf' 'AIM']
miner
Heuristic_Miner         50
OptIMIISt_filter_new    50
ILP_Miner_0             50
IMf                     50
AIM                     50
Name: count, dtype: int64
                                               model  size  activities  \
0  Heuristic_Miner_BPI_Challenge_2013_closed_prob...    57           4   
1  Heuristic_Miner_BPI_Challenge_2013_closed_prob...    57           4   
2  Heuristic_Miner_BPI_Challenge_2020_Prepaid_Tra...   409          29   
3  Heuristic_Miner_BPI_Challenge_2020_Prepaid_Tra...   405          29   
4  Heuristic_Miner_Sepsis_Cases_-_Event_Log_23022...   275          16   

   is_sound  alignments_fitness  alignments_precision            miner  \
0      True            0.986963              0.868998  Heuristic_Miner   
1      True            0.986865              0.869518  Heuristic_Miner   
2     False            0

In [161]:
# Collapse the individual entries into one averaged row per (log, miner) pair.
column_aggregations = {
    'size': 'mean',
    'activities': 'mean',
    'alignments_fitness': 'mean',
    'alignments_precision': 'mean',
    'is_sound': lambda flags: flags.mean() * 100,  # percentage of True values
    'model': 'count',  # number of entries behind each average
}
res_df_avg = res_df.groupby(['log', 'miner']).agg(column_aggregations).reset_index()


def _short_miner_name(name):
    """Return the short table name for a raw miner name.

    The rules are applied one after another, in this order: an exact match
    for "ILP_Miner_0", then substring matches for the Heuristic Miner and the
    two OptIMIISt filter variants. Names that match no rule are returned
    unchanged.
    """
    if name == 'ILP_Miner_0':
        name = 'ILP_Miner'
    if 'Heuristic_Miner' in name:
        name = 'Heuristic'
    if 'OptIMIISt_Filter' in name:
        name = 'OptIMIIst'
    if 'OptIMIISt_filter_new' in name:
        name = 'OptIMIIst'
    return name


res_df_avg['miner'] = res_df_avg['miner'].map(_short_miner_name)

# Make the log names LaTeX friendly: abbreviate "BPI_Challenge" to "BPIC",
# then turn only the first remaining underscore into a space.
res_df_avg['log'] = res_df_avg['log'].map(
    lambda name: name.replace('BPI_Challenge', 'BPIC').replace('_', ' ', 1)
)

# Function to convert the strings
def convert_string(log):
    r"""Shorten a log name of the form "<prefix> <year>_<description>".

    The description is reduced to the upper-cased initials of its
    underscore-separated words and attached to the year as a LaTeX subscript,
    e.g. "BPIC 2013_closed_problems" -> "BPIC 2013$_{CP}$".

    Args:
        log: Log name in which the separator before the four-digit year is a
            space and the year is followed by an underscore.

    Returns:
        The shortened label, or ``log`` unchanged when it does not contain
        " <4 digits>_". When the description holds no words, the label is
        just "<prefix> <year>" without a subscript.
    """
    match = re.search(r' (\d{4})_(.*?)$', log)
    if not match:
        return log  # Not in the expected format: leave the name untouched

    # Keep whatever precedes the year instead of assuming it is always "BPIC".
    prefix = log[:match.start()]
    year, description = match.groups()

    # Skip empty words (doubled or trailing underscores) so word[0] cannot
    # raise an IndexError.
    initials = ''.join(word[0].upper() for word in description.split('_') if word)

    label = f"{prefix} {year}"
    if initials:
        label += f"$_{{{initials}}}$"
    return label

# Shorten the log names, then give the Sepsis log its plain table label
# (exact match only).
res_df_avg['log'] = (
    res_df_avg['log']
    .apply(convert_string)
    .replace({'Sepsis Cases_-_Event_Log': 'Sepsis'})
)

# Make the miner names LaTeX friendly
res_df_avg['miner'] = res_df_avg['miner'].str.replace('_', ' ', regex=False)

# Fitness and precision get 2 decimals, size and activities get 1
res_df_avg = res_df_avg.round({
    'alignments_fitness': 2,
    'alignments_precision': 2,
    'size': 1,
    'activities': 1,
})

# Switch to the table headers in a single rename. The raw string keeps the
# backslash of the LaTeX-escaped percent sign without an invalid "\%" escape.
res_df_avg = res_df_avg.rename(columns={
    'log': 'Event Log',
    'miner': 'Algorithm',
    'size': 'Size',
    'activities': 'Activities',
    'alignments_fitness': 'Fitness',
    'alignments_precision': 'Precision',
    'is_sound': r'Sound(\%)',
    'model': 'count',
})

# Sort by Event Log, then by Algorithm with OptIMIIst on top. Algorithms that
# are not in the preferred order are appended alphabetically; leaving them out
# of the categories would silently turn their names into NaN.
algorithm_order = ['OptIMIIst', 'AIM', 'Heuristic', 'ILP Miner', 'IMf']
unlisted_algorithms = sorted(set(res_df_avg['Algorithm']) - set(algorithm_order))
res_df_avg['Algorithm'] = pd.Categorical(
    res_df_avg['Algorithm'],
    categories=algorithm_order + unlisted_algorithms,
    ordered=True,
)
res_df_avg = res_df_avg.sort_values(['Event Log', 'Algorithm'])

print(res_df_avg.head())

   Event Log  Algorithm   Size  Activities  Fitness  Precision  Sound(\%)  \
4  BPIC 2012  OptIMIIst  214.4        21.4     0.70       0.33      100.0   
0  BPIC 2012        AIM  161.2        21.0     0.59       0.72      100.0   
1  BPIC 2012  Heuristic  288.4        23.8      NaN        NaN        0.0   
2  BPIC 2012  ILP Miner  296.0        24.0     1.00       0.11        0.0   
3  BPIC 2012        IMf  242.6        23.8     0.99       0.12      100.0   

   count  
4      5  
0      5  
1      5  
2      5  
3      5  


  'is_sound': 'Sound(\%)'


In [162]:
# Show the aggregated table as this cell's output for a visual check.
res_df_avg

Unnamed: 0,Event Log,Algorithm,Size,Activities,Fitness,Precision,Sound(\%),count
4,BPIC 2012,OptIMIIst,214.4,21.4,0.7,0.33,100.0,5
0,BPIC 2012,AIM,161.2,21.0,0.59,0.72,100.0,5
1,BPIC 2012,Heuristic,288.4,23.8,,,0.0,5
2,BPIC 2012,ILP Miner,296.0,24.0,1.0,0.11,0.0,5
3,BPIC 2012,IMf,242.6,23.8,0.99,0.12,100.0,5
14,BPIC 2013$_{CP}$,OptIMIIst,45.8,4.0,0.99,0.84,100.0,5
10,BPIC 2013$_{CP}$,AIM,17.4,3.0,0.79,1.0,100.0,5
11,BPIC 2013$_{CP}$,Heuristic,57.4,4.0,0.99,0.87,100.0,5
12,BPIC 2013$_{CP}$,ILP Miner,41.6,4.0,1.0,0.79,100.0,5
13,BPIC 2013$_{CP}$,IMf,50.6,3.8,0.99,0.95,100.0,5


In [163]:
# Remove the count column. Reassigning with errors='ignore' (instead of an
# in-place drop) keeps this cell re-runnable: a second run no longer raises a
# KeyError because 'count' is already gone.
res_df_avg = res_df_avg.drop(columns=['count'], errors='ignore')

# Create a LaTeX table
latex = res_df_avg.to_latex(index=False, float_format='%.2f')
print(latex)

\begin{tabular}{llrrrrr}
\toprule
Event Log & Algorithm & Size & Activities & Fitness & Precision & Sound(\%) \\
\midrule
BPIC 2012 & OptIMIIst & 214.40 & 21.40 & 0.70 & 0.33 & 100.00 \\
BPIC 2012 & AIM & 161.20 & 21.00 & 0.59 & 0.72 & 100.00 \\
BPIC 2012 & Heuristic & 288.40 & 23.80 & NaN & NaN & 0.00 \\
BPIC 2012 & ILP Miner & 296.00 & 24.00 & 1.00 & 0.11 & 0.00 \\
BPIC 2012 & IMf & 242.60 & 23.80 & 0.99 & 0.12 & 100.00 \\
BPIC 2013$_{CP}$ & OptIMIIst & 45.80 & 4.00 & 0.99 & 0.84 & 100.00 \\
BPIC 2013$_{CP}$ & AIM & 17.40 & 3.00 & 0.79 & 1.00 & 100.00 \\
BPIC 2013$_{CP}$ & Heuristic & 57.40 & 4.00 & 0.99 & 0.87 & 100.00 \\
BPIC 2013$_{CP}$ & ILP Miner & 41.60 & 4.00 & 1.00 & 0.79 & 100.00 \\
BPIC 2013$_{CP}$ & IMf & 50.60 & 3.80 & 0.99 & 0.95 & 100.00 \\
BPIC 2013$_{I}$ & OptIMIIst & 48.00 & 4.00 & 0.99 & 0.85 & 100.00 \\
BPIC 2013$_{I}$ & AIM & 37.00 & 3.00 & 0.91 & 0.99 & 100.00 \\
BPIC 2013$_{I}$ & Heuristic & 53.00 & 4.00 & 0.99 & 0.89 & 100.00 \\
BPIC 2013$_{I}$ & ILP Miner & 39

In [164]:
# State used while walking through the LaTeX string line by line:
#   latex2      - the rewritten table, built up one line at a time
#   table_start - True while the current line belongs to the data rows
#   latest_log  - first cell of the most recently handled data row
latex2, table_start, latest_log = "", False, None

def handleLine(line):
    r"""Append one data row to ``latex2``, grouping rows by their first cell.

    A row whose first cell differs from ``latest_log`` starts a new group and
    is written in full. Every group except the first is preceded by a
    ``\midrule``; the first one is not, because the rule below the table
    header already comes right before it. A row that repeats the previous
    first cell is written with that cell left empty.

    Args:
        line: One table row, with cells separated by ``&``.

    Side effects:
        Updates the module-level ``latex2`` and ``latest_log``.
    """
    global latex2
    global latest_log
    cells = line.split('&')
    # Text before the first '&' is the event log cell
    log = cells[0].strip()

    if latest_log != log:
        # latest_log is None only before the first row has been handled
        if latest_log is not None:
            latex2 += '\\midrule\n'
        latest_log = log
        latex2 += line + '\n'
    else:
        latex2 += '&' + '&'.join(cells[1:]) + '\n'
        

# Copy everything outside the data rows unchanged and pass each data row to
# handleLine. The data rows are the lines between the header's \midrule and
# \bottomrule; keying on the rule rather than on a 'BPIC' prefix keeps the
# grouping correct when the first event log has a different name.
for line in latex.split('\n'):
    if not table_start:
        latex2 += line + '\n'
        if line.startswith('\\midrule'):
            table_start = True
    elif line.startswith('\\bottomrule'):
        table_start = False
        latex2 += line + '\n'
    else:
        handleLine(line)

print(latex2)

\begin{tabular}{llrrrrr}
\toprule
Event Log & Algorithm & Size & Activities & Fitness & Precision & Sound(\%) \\
\midrule
\midrule
BPIC 2012 & OptIMIIst & 214.40 & 21.40 & 0.70 & 0.33 & 100.00 \\
& AIM & 161.20 & 21.00 & 0.59 & 0.72 & 100.00 \\
& Heuristic & 288.40 & 23.80 & NaN & NaN & 0.00 \\
& ILP Miner & 296.00 & 24.00 & 1.00 & 0.11 & 0.00 \\
& IMf & 242.60 & 23.80 & 0.99 & 0.12 & 100.00 \\
\midrule
BPIC 2013$_{CP}$ & OptIMIIst & 45.80 & 4.00 & 0.99 & 0.84 & 100.00 \\
& AIM & 17.40 & 3.00 & 0.79 & 1.00 & 100.00 \\
& Heuristic & 57.40 & 4.00 & 0.99 & 0.87 & 100.00 \\
& ILP Miner & 41.60 & 4.00 & 1.00 & 0.79 & 100.00 \\
& IMf & 50.60 & 3.80 & 0.99 & 0.95 & 100.00 \\
\midrule
BPIC 2013$_{I}$ & OptIMIIst & 48.00 & 4.00 & 0.99 & 0.85 & 100.00 \\
& AIM & 37.00 & 3.00 & 0.91 & 0.99 & 100.00 \\
& Heuristic & 53.00 & 4.00 & 0.99 & 0.89 & 100.00 \\
& ILP Miner & 39.00 & 4.00 & 1.00 & 0.63 & 100.00 \\
& IMf & 48.00 & 4.00 & 0.96 & 0.72 & 100.00 \\
\midrule
BPIC 2017 & OptIMIIst & 219.60 & 25.