In [1]:
import pandas as pd
import numpy as np

import statistics
import time, datetime

#### This estimator considers the current event and the 2 previous events (if they exist) to predict the next event. It is currently too inefficient to be applied to the entire dataset.

In [2]:
# Load the preprocessed train and test event logs (train first, then test);
# both CSV files are expected in the notebook's working directory.
df_train, test = map(pd.read_csv, ('preprocessed_train.csv', 'preprocessed_test.csv'))

In [7]:
def complex_event_estimator(df, current_event, position, prev_event=None, second_prev_event=None):
    """Predict the next event from the current event and up to two preceding events.

    Finds every row of ``df`` whose event equals ``current_event`` and whose
    preceding row(s) match the supplied history, collects the event found on
    the row directly after each match (only when that row belongs to the same
    case), and returns the most frequent of those follow-up events.

    The function compares neighbouring rows, so ``df`` must be ordered such
    that the events of one case are adjacent and in sequence order.

    Args:
        df (pd.DataFrame): preprocessed event log with the columns
            'event concept:name', 'case concept:name' and 'position'.
        current_event (str): the name of the current event.
        position (int): position of the current event within its case,
            counted from 1. It selects how much history is used:
            ``>= 3`` matches on ``prev_event`` and ``second_prev_event``,
            ``2`` matches on ``prev_event`` only, and ``1`` uses no history.
            Any other value matches nothing.
        prev_event (str, optional): the event 1 position before the current event. Defaults to None.
        second_prev_event (str, optional): the event 2 positions before the current event. Defaults to None.

    Returns:
        str: the most common event observed directly after the matched pattern.

    Raises:
        statistics.StatisticsError: if the pattern is never followed by
            another event of the same case in ``df``.
    """
    events = df['event concept:name']
    cases = df['case concept:name']
    row_positions = df['position']

    # shift(-1) lines every row up with its successor. The last row of the
    # frame gets a missing value there, so it can never count as "followed by
    # an event of the same case" (the row-wise lookup used to raise IndexError).
    next_events = events.shift(-1)
    matches = (events == current_event) & (cases == cases.shift(-1))

    if position >= 3:
        # Two events of history: the data row must itself have two predecessors
        # and they must match in order. shift(1)/shift(2) are purely positional,
        # so this works for any index and never wraps around to the frame's end.
        matches = (
            matches
            & (row_positions >= 3)
            & (events.shift(1) == prev_event)
            & (events.shift(2) == second_prev_event)
        )
    elif position == 2:
        # One event of history.
        matches = matches & (row_positions >= 2) & (events.shift(1) == prev_event)
    elif position != 1:
        # Positions other than 1, 2 or >= 3 are not handled and match nothing.
        matches = matches & False

    next_list = next_events[matches.to_numpy()].tolist()

    if not next_list:
        raise statistics.StatisticsError(
            "no following event found in the data for the given event pattern"
        )

    return statistics.mode(next_list)

In [4]:
# test['complex_event']= None
# dct={}

# for idx, row in test.iterrows():
    
#         next_event = complex_event_estimator(test, row['event concept:name'], 
#                                              row['position'], row['prev_event'], row['2prev_event'])
        
#         test['complex_event'].iloc[idx] = next_event[0]

In [9]:
# Example prediction on the training log. With position=1 the estimator uses
# no history, so the two previous-event arguments do not influence the result.
event_estimator = complex_event_estimator(
    df_train,
    current_event='W_Completeren aanvraag',
    position=1,
    prev_event='A_PREACCEPTED',
    second_prev_event='A_PARTLYSUBMITTED',
)

In [10]:
# Display the predicted next event computed for the example call.
event_estimator

'W_Completeren aanvraag'