# Preprocessing file

In [None]:
import pandas as pd
import pm4py as pm4
import numpy as np
import plotly.express as px
from sklearn.preprocessing import LabelEncoder

from scipy import stats

In [None]:
# Read the XES event log from disk and convert it to a pandas DataFrame.
xes_path = 'BPI_Challenge_2012.xes.gz'
event_log = pm4.read.read_xes(xes_path)
df_old = pm4.convert_to_dataframe(event_log)

# Last expression of the cell: shows the first rows in the notebook output.
df_old.head()

# Goals of the current preprocessing step
- Delete unfinished traces from the dataset
- Delete traces which are too long by number of events (address that)
- Delete traces which are too long by time taken (address events which took too long)
- Delete duplicates
- Delete intersecting events for train and test

In [None]:
# All possible values of `concept:name`, listed alphabetically and grouped by
# their name prefix (A_, O_, W_).
concept_names = [
    # A_ prefix
    'A_ACCEPTED',
    'A_ACTIVATED',
    'A_APPROVED',
    'A_CANCELLED',
    'A_DECLINED',
    'A_FINALIZED',
    'A_PREACCEPTED',
    'A_REGISTERED',
    # O_ prefix
    'O_ACCEPTED',
    'O_CANCELLED',
    'O_CREATED',
    'O_DECLINED',
    'O_SELECTED',
    'O_SENT',
    'O_SENT_BACK',
    # W_ prefix
    'W_Afhandelen leads',
    'W_Beoordelen fraude',
    'W_Nabellen incomplete dossiers',
    'W_Nabellen offertes',
    'W_Valideren aanvraag',
    'W_Wijzigen contractgegevens',
]

# Remove duplicates

In [None]:
# Keep the first occurrence of every fully identical row.
# ignore_index=True renumbers the rows 0..n-1 afterwards: without it the
# dropped rows leave gaps in the index, and later cells address rows with
# `range(len(df_old))` + `.loc[...]`, which requires a contiguous index.
df_old = df_old.drop_duplicates(keep='first', ignore_index=True)

# Removing unfinished entries

In [None]:
# Goal: for every (case, activity) pair that has a COMPLETE lifecycle
# transition, drop the rows of that same pair whose transition is something
# other than COMPLETE. Rows of pairs that never complete are kept.
#
# Everything below is computed on whole columns and is independent of the
# DataFrame's index labels. The earlier row-by-row version used
# `range(len(df_old))` with label-based `.loc`, which breaks as soon as the
# index has gaps (e.g. after `drop_duplicates`).

is_complete = df_old['lifecycle:transition'] == 'COMPLETE'

# Per case, the activities that have a COMPLETE transition (in row order,
# repeats kept). Not needed for the mask below; kept so that any later cell
# relying on `completed_dict` still finds it.
completed_dict = (
    df_old.loc[is_complete]
    .groupby('case:concept:name', sort=False)['concept:name']
    .agg(list)
    .to_dict()
)

# One (case, activity) key per row; a row "has a completion" when its key
# also occurs among the COMPLETE rows.
case_activity = pd.MultiIndex.from_frame(df_old[['case:concept:name', 'concept:name']])
has_complete = case_activity.isin(case_activity[is_complete.to_numpy()])

# 1 = row is a non-COMPLETE transition of an activity that does get completed.
df_old['remove'] = (~is_complete.to_numpy() & has_complete).astype(int)

# Keep the remaining rows. reset_index() (without drop) stores the old row
# labels in an 'index' column, which the following cells use for counting.
df = df_old.loc[df_old['remove'] == 0].reset_index()
df = df.drop(columns='remove')
print(f'We lose {len(df_old) - len(df)} out of {len(df_old)} lines')

### Keeping only trace lengths that each account for at least 0.5% of the cases (losing less than 5% of the data)

In [None]:
# Distribution of trace lengths.
# Step 1: per case, count the non-null values of every column; the 'index'
#         column's count is used as the trace length.
# Step 2: group those per-case counts by trace length and count again, so each
#         row says how many cases have that length.
# The pipeline is evaluated once here; it used to be re-run inside the lambda
# for every row.
events_per_case = df.groupby('case:concept:name').count().sort_values(by='index')
percent_cases_length = events_per_case.groupby('index').count()

# Share (in %) of all cases that have each trace length, rounded to 2 decimals.
cases_per_length = percent_cases_length['concept:name']
total_cases = sum(cases_per_length)
percent_cases_length['percent'] = cases_per_length.apply(
    lambda x: round(100 * x / total_cases, 2)
)

# Summed share of the trace lengths that individually fall below 0.5%.
# NOTE(review): this is a share of cases, and the later cut-off is a maximum
# length, so the number printed here may differ from what is actually dropped.
rare_lengths = percent_cases_length['percent'] < 0.5
print('Percent of data we loose:', round(percent_cases_length[rare_lengths]['percent'].sum(), 1))

# Keep only the trace lengths that reach the 0.5% threshold.
percent_cases_length = percent_cases_length[percent_cases_length['percent'] >= 0.5]
percent_cases_length.head()

In [None]:
# Per-case counts, ordered by the count of the 'index' column (trace length).
cases_length = df.groupby('case:concept:name').count().sort_values(by='index')

# Largest trace length that survived the 0.5% filter (last row of the table).
longest_kept_length = percent_cases_length.index[-1]

# Ids of all cases whose trace length does not exceed that limit.
within_limit = cases_length['index'] <= longest_kept_length
work_id = cases_length.loc[within_limit].index.to_list()

In [None]:
# Keep only the events that belong to one of the selected cases.
belongs_to_kept_case = df['case:concept:name'].isin(work_id)
df = df.loc[belongs_to_kept_case]

In [None]:
# Columns that are not carried over into the cleaned data set.
columns_to_drop = [
    'index',
    'org:resource',
    'lifecycle:transition',
    'case:REG_DATE',
    'case:AMOUNT_REQ',
]
df = df.drop(columns=columns_to_drop)

### Saving cleaned up version of the data

In [None]:
# Write the cleaned events to CSV; the DataFrame index is not written.
df.to_csv(path_or_buf='cleaned_data.csv', index=False)