# Sprint 4 — Naive predictor

*Our naive "baseline" predictor model, based on trace position.*

For the type prediction, the naive predictor finds the most common events at every position of the trace. When predicting an event type at position $i$, it returns the most common event for that position.

For the time prediction, there are two interpretations of the baseline description:
1. For every event type, it determines the most likely next event type, and returns the average time between events of those two types.
2. It looks up the most common event type at the next trace position, and returns the average time between events of those two types.

In [1]:
# Setup
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display

# Display options: never truncate columns/rows when a frame is shown, and
# silence pandas' chained-assignment warning.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('mode.chained_assignment', None)

# Config variables
training_testing_data_path = 'https://raw.githubusercontent.com/NickSot/process_mining/main/BPI_2012.csv'

# Loading and cleaning the dataset.
# The frame is built in one chain (no in-place mutation) and the index is reset
# after dropna() so that row labels are a gap-free 0..n-1 range; code further
# down steps from one row to "the next row" and must not trip over missing labels.
df = (
    pd.read_csv(training_testing_data_path)
    .rename(columns={'event': 'concept:name', 'case': 'case:concept:name'})
    .dropna()
    .reset_index(drop=True)
)
df['startTime'] = pd.to_datetime(df['startTime'])


In [2]:
# Train/test split by row position: the first `train_fraction` of the rows is
# used for training, the remainder for testing.
# NOTE(review): this is only a split "on date" if the CSV rows are already in
# chronological order; nothing in this notebook sorts on startTime — confirm.
train_fraction = 0.3
slices = int(len(df) * train_fraction)

# Explicit copies: later cells add prediction columns to dftest, which must not
# be an ambiguous view/copy of df.
dftrain = df.iloc[:slices].copy()
dftest = df.iloc[slices:].copy()

dftrain.head()

Unnamed: 0,case:concept:name,concept:name,startTime,completeTime,AMOUNT_REQ,REG_DATE,org:resource
0,173688,A_SUBMITTED,2011-10-01 00:38:44.546,2011/10/01 00:38:44.546,20000,2011/10/01 00:38:44.546,112
1,173688,A_PARTLYSUBMITTED,2011-10-01 00:38:44.880,2011/10/01 00:38:44.880,20000,2011/10/01 00:38:44.546,112
2,173688,A_PREACCEPTED,2011-10-01 00:39:37.906,2011/10/01 00:39:37.906,20000,2011/10/01 00:38:44.546,112
3,173688,A_PREACCEPTED,2011-10-01 00:39:37.906,2011/10/01 00:39:37.906,20000,2011/10/01 00:38:44.546,112
4,173688,W_Completeren aanvraag,2011-10-01 11:36:46.437,2011/10/01 11:45:13.917,20000,2011/10/01 00:38:44.546,112


# 1. Defining training functions

In [3]:
# Get the most common next event type for the given ev_type
# This function replaces block 10 til 15 (10: "def nth_most_common(w, n):", 15: "next_common_train")
def get_most_common_next_type(df, ev_type):
    """Return the event type that most often directly follows ``ev_type`` within a case.

    Rows are compared with the row that comes physically next in ``df``
    (by position, independent of the index labels). A successor is only
    counted when both rows have the same 'case:concept:name'.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'concept:name' and 'case:concept:name'.
    ev_type : object
        Event type (a value of 'concept:name') to find the successor for.

    Returns
    -------
    The most frequent successor event type, or ``None`` when ``ev_type`` never
    has a same-case successor in ``df``. Ties are resolved in favour of the
    successor that appears first in ``df``, so the result is deterministic.
    """
    events = df['concept:name'].to_numpy()
    cases = df['case:concept:name'].to_numpy()
    is_target = (df['concept:name'] == ev_type).to_numpy()

    # same_case_as_next[i] is True when row i and row i + 1 belong to the same
    # case; the last row has no next row and stays False.
    same_case_as_next = np.zeros(len(df), dtype=bool)
    same_case_as_next[:-1] = cases[:-1] == cases[1:]

    # Event types of the rows that directly follow a matching row.
    successors = events[1:][(is_target & same_case_as_next)[:-1]]

    # Count in first-seen order; max() over a dict returns the first key that
    # reaches the maximum, which gives the deterministic tie-break.
    counts = {}
    for successor in successors:
        counts[successor] = counts.get(successor, 0) + 1

    return max(counts, key=counts.get) if counts else None


# Get the time between events and their most common next event type
def get_most_common_next_time(df, ev_type, next_type):
    """Return the mean time in seconds between ``ev_type`` and a directly following ``next_type``.

    A pair is counted when a row of type ``ev_type`` is immediately followed
    (by position, independent of the index labels) by a row of type
    ``next_type`` that belongs to the same 'case:concept:name'. Pairs that span
    two different cases are ignored.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'concept:name', 'case:concept:name' and a
        datetime column 'startTime'.
    ev_type : object
        Event type of the first event of the pair.
    next_type : object
        Event type of the second event of the pair. ``None`` matches nothing.

    Returns
    -------
    float
        Mean of ``startTime`` differences in seconds, or ``nan`` when no such
        pair exists in ``df``.
    """
    events = df['concept:name']
    cases = df['case:concept:name'].to_numpy()

    # same_case_as_next[i] is True when row i and row i + 1 belong to the same
    # case; the last row has no next row and stays False.
    same_case_as_next = np.zeros(len(df), dtype=bool)
    same_case_as_next[:-1] = cases[:-1] == cases[1:]

    is_pair = (
        (events == ev_type).to_numpy()
        & (events.shift(-1) == next_type).to_numpy()
        & same_case_as_next
    )

    # Return nan explicitly instead of letting np.mean warn on an empty input.
    if not is_pair.any():
        return np.nan

    # Seconds from each row to the row directly after it.
    gaps = (df['startTime'].shift(-1) - df['startTime']).dt.total_seconds().to_numpy()
    return np.mean(gaps[is_pair])


In [22]:
from collections import Counter

def nth_most_common(w, n):
    """Return, as a string, the n-th most frequent item of ``w`` (n=1 is the most frequent).

    Items are ranked by ascending count with a stable sort, so among items with
    equal counts the one first seen later in ``w`` ranks higher.
    """
    tally = Counter(w)
    # Keys ordered from least to most frequent; ties keep first-seen order.
    ranked = sorted(tally, key=tally.get)
    return str(ranked[-n])


def count_freq(my_list):
    """Count how often each item occurs in ``my_list``.

    Returns a dict mapping item -> number of occurrences, with keys in the
    order in which the items are first encountered.
    """
    freq = {}
    for item in my_list:
        # Start at 0 for unseen items, then add this occurrence.
        freq[item] = freq.get(item, 0) + 1
    return freq

def get_max(df, task):
    """Return the event type that most often directly follows ``task`` within a case.

    Walks over consecutive row pairs of ``df`` (in row order) and collects the
    'concept:name' of every row whose predecessor has event type ``task`` and
    the same 'case:concept:name'. The collected frequencies and the winner are
    printed as a side effect.

    Parameters
    ----------
    df : pandas.DataFrame
        Event log with the columns 'concept:name' and 'case:concept:name'.
    task : object
        Event type to find the most common successor for.

    Returns
    -------
    str or None
        The most common successor as returned by ``nth_most_common`` (a string),
        or ``None`` when ``task`` never has a same-case successor in ``df``.
    """
    events = df['concept:name'].tolist()
    cases = df['case:concept:name'].tolist()

    # Pair every row with the row after it; no sentinel "previous" values are
    # needed, so no real event/case name can collide with a placeholder.
    successors = [
        next_event
        for cur_event, cur_case, next_event, next_case
        in zip(events, cases, events[1:], cases[1:])
        if cur_event == task and cur_case == next_case
    ]

    freq_list = count_freq(successors)

    # nth_most_common raises IndexError on an empty list, so guard it.
    top = nth_most_common(successors, 1) if successors else None
    print(f'List of {task}: {freq_list}')
    print(f'Top of {task}: {top}')
    return top


# Despite the name, this holds the distinct event types ('concept:name' values), not case ids.
old_caselist = [*df['concept:name'].unique()]


# 2. Training model

In [5]:
# Event types are collected from the full log, so types that only occur in the
# test part still get an entry (whatever get_most_common_next_type returns for them).
event_types = df['concept:name'].unique()

# "Training": for every event type, look up its most common successor in the training data.
most_common_next_types = {
    ev_type: get_most_common_next_type(df=dftrain, ev_type=ev_type)
    for ev_type in event_types
}


In [13]:
# Inspect the learned mapping: event type -> most common next event type
most_common_next_types

{'A_SUBMITTED': 'A_PARTLYSUBMITTED',
 'A_PARTLYSUBMITTED': 'A_PREACCEPTED',
 'A_PREACCEPTED': 'A_PREACCEPTED',
 'W_Completeren aanvraag': 'W_Completeren aanvraag',
 'A_ACCEPTED': 'O_SELECTED',
 'O_SELECTED': 'A_FINALIZED',
 'A_FINALIZED': 'O_CREATED',
 'O_CREATED': 'O_SENT',
 'O_SENT': 'O_SENT',
 'W_Nabellen offertes': 'W_Nabellen offertes',
 'O_SENT_BACK': 'O_SENT_BACK',
 'W_Valideren aanvraag': 'W_Valideren aanvraag',
 'A_REGISTERED': 'A_ACTIVATED',
 'A_APPROVED': 'A_ACTIVATED',
 'O_ACCEPTED': 'A_REGISTERED',
 'A_ACTIVATED': 'A_REGISTERED',
 'O_CANCELLED': 'O_CREATED',
 'A_DECLINED': 'O_DECLINED',
 'A_CANCELLED': 'O_CANCELLED',
 'W_Afhandelen leads': 'A_PREACCEPTED',
 'O_DECLINED': 'A_DECLINED',
 'W_Nabellen incomplete dossiers': 'W_Nabellen incomplete dossiers',
 'W_Beoordelen fraude': 'W_Beoordelen fraude'}

In [5]:
#most_common_next_times = {}
#for ev_type in event_types:
#    most_common_next_times[ev_type] = get_most_common_next_time(df=dftrain, ev_type=ev_type, next_type=most_common_next_types[ev_type])


In [23]:
# Same "training" step with the older get_max implementation, kept to compare
# against most_common_next_types. get_max prints the successor frequencies of
# every event type as it goes.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so
# the dict was only partially filled when the later cells were run — confirm
# before relying on the comparison below.
next_common_train_old = {}
for value in old_caselist:
    next_common_train_old[value] = get_max(task=value, df=dftrain)

# Show the resulting mapping
next_common_train_old

List of A_SUBMITTED: {'A_PARTLYSUBMITTED': 3811}
Top of A_SUBMITTED: A_PARTLYSUBMITTED
List of A_PARTLYSUBMITTED: {'A_PREACCEPTED': 1490, 'A_DECLINED': 940, 'A_PARTLYSUBMITTED': 1381, 'W_Afhandelen leads': 1361, 'W_Beoordelen fraude': 20}
Top of A_PARTLYSUBMITTED: A_PREACCEPTED
List of A_PREACCEPTED: {'A_PREACCEPTED': 2194, 'W_Completeren aanvraag': 2194}
Top of A_PREACCEPTED: W_Completeren aanvraag
List of W_Completeren aanvraag: {'A_ACCEPTED': 1561, 'W_Completeren aanvraag': 4713, 'A_CANCELLED': 315, 'A_DECLINED': 323, 'W_Beoordelen fraude': 2, 'O_SELECTED': 8, 'A_FINALIZED': 5}
Top of W_Completeren aanvraag: W_Completeren aanvraag
List of A_ACCEPTED: {'O_SELECTED': 924, 'A_FINALIZED': 598, 'W_Completeren aanvraag': 20, 'A_CANCELLED': 15, 'A_DECLINED': 4}
Top of A_ACCEPTED: O_SELECTED
List of O_SELECTED: {'A_FINALIZED': 932, 'O_CREATED': 869, 'O_CANCELLED': 273}
Top of O_SELECTED: A_FINALIZED


KeyboardInterrupt: 

# 3. Making predictions

In [7]:
from datetime import timedelta

# Predict the next event type for every test row by looking up the row's own
# event type in each trained mapping (new and old implementation side by side).
current_event = dftest['concept:name']
dftest = dftest.assign(
    pred_next_event=current_event.map(most_common_next_types),
    pred_next_event_old=current_event.map(next_common_train_old),
)
#dftest['pred_time_until_next'] = dftest['concept:name'].map(most_common_next_times)
#dftest['pred_time_next'] = dftest['startTime'] + pd.to_timedelta(dftest['pred_time_until_next'], unit='s')

In [24]:
# Rows where the new and the old implementation predict a different next event
differs = dftest['pred_next_event'].ne(dftest['pred_next_event_old'])
dfdiff = dftest.loc[differs, ['pred_next_event']]

# Which new-implementation predictions are involved in a disagreement
dfdiff['pred_next_event'].unique()

array(['A_PREACCEPTED'], dtype=object)

# 4. Checking accuracy metrics

In [8]:
# Ground truth: the event type and case id of the row that follows each test row
dftest = dftest.assign(
    next_event=dftest['concept:name'].shift(-1),
    next_case=dftest['case:concept:name'].shift(-1),
)

# Only rows whose following row is in the same case have a next event to predict
has_successor = dftest['case:concept:name'].eq(dftest['next_case'])
df_same_cases = dftest.loc[has_successor]

# Of those, keep the rows where the prediction matches the actual next event
is_hit = df_same_cases['pred_next_event'].eq(df_same_cases['next_event'])
df_correct = df_same_cases.loc[is_hit]

n_correct = len(df_correct)
n_total = len(df_same_cases)
accuracy = n_correct / n_total * 100
print(f"Event type prediction accuracy: {n_correct} correct / {n_total} total = {accuracy}%")


Event type prediction accuracy: 75228 correct / 124302 total = 60.52034560988561%


In [9]:
# Finding the correct next event for accuracy calculation purposes
#dftest['start_time_next'] = dftest['startTime'].shift(-1)

#df_time_diff = pd.DataFrame()
#df_time_diff['diff_delta'] = pd.to_datetime(dftest['start_time_next']) - dftest['pred_time_until_next'] # x - timedelta.total_seconds()
#df_time_diff['diff_s'] = dftest['diff_delta'].total_seconds()

#df_time_diff.head()

# Select correctly predicted rows
#df_same_cases = df_test[df_test['case'] == df_test['next_case']]
#df_correct = df_same_cases[df_same_cases['pred_next_event'] == df_same_cases['next_event']]

#accuracy = len(df_correct) / len(df_same_cases) * 100
#print(f"Event time prediction RMSE: {len(df_correct)} / {len(df_same_cases)} = {accuracy}")


# 5. Visualizations