In [31]:
import pandas as pd
import pm4py

## Importing the data

In [32]:
# Load the edited BPI Challenge 2019 event log (semicolon-separated) and parse
# the event timestamp column into datetimes while reading.
# NOTE(review): many timestamps in the displayed outputs fall on the 1st of a
# month (e.g. 2018-02-01, 2018-03-01, 2018-04-01 within a single case) —
# possibly day and month are swapped during parsing; confirm the CSV's date
# format and whether `dayfirst=True` / an explicit format is required.
data = pd.read_csv(
    "event_logs/BPI_Challenge_2019_edited.csv",
    sep=";",
    parse_dates=["event time:timestamp"],
)

In [33]:
# Preview the first five rows of the raw event log.
data.head(n=5)

Unnamed: 0,eventID,case Spend area text,case Company,case Document Type,case Sub spend area text,case Purchasing Document,case Purch. Doc. Category name,case Vendor,case Item Type,case Item Category,...,case Name,case GR-Based Inv. Verif.,case Item,case concept:name,case Goods Receipt,event User,event org:resource,event concept:name,event Cumulative net worth (EUR),event time:timestamp
0,0,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,Purchase order,vendorID_0000,Standard,"3-way match, invoice before GR",...,vendor_0000,False,1,2000000000_00001,True,batch_00,batch_00,SRM: Created,298.0,2018-02-01 13:53:00
1,6,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,Purchase order,vendorID_0000,Standard,"3-way match, invoice before GR",...,vendor_0000,False,1,2000000000_00001,True,batch_00,batch_00,SRM: Change was Transmitted,298.0,2018-02-01 14:53:00
2,3,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,Purchase order,vendorID_0000,Standard,"3-way match, invoice before GR",...,vendor_0000,False,1,2000000000_00001,True,batch_00,batch_00,SRM: Document Completed,298.0,2018-02-01 14:53:00
3,2,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,Purchase order,vendorID_0000,Standard,"3-way match, invoice before GR",...,vendor_0000,False,1,2000000000_00001,True,batch_00,batch_00,SRM: Awaiting Approval,298.0,2018-02-01 14:53:00
4,1,CAPEX & SOCS,companyID_0000,EC Purchase order,Facility Management,2000000000,Purchase order,vendorID_0000,Standard,"3-way match, invoice before GR",...,vendor_0000,False,1,2000000000_00001,True,batch_00,batch_00,SRM: Complete,298.0,2018-02-01 14:53:00


## Looking at contents of columns in the original event log

There are many unnecessary features that contain very little information.

In [34]:
# Number of rows recorded for each company ID.
data["case Company"].value_counts()

companyID_0000    997358
companyID_0003      2617
companyID_0001         7
Name: case Company, dtype: int64

In [35]:
# Cross-tabulate company against document type to see which combinations
# actually occur in the log.
pd.crosstab(
    index=data["case Company"],
    columns=data["case Document Type"],
)

case Document Type,EC Purchase order,Framework order,Standard PO
case Company,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
companyID_0000,14309,16271,966778
companyID_0001,0,0,7
companyID_0003,0,2617,0


In [36]:
# Number of rows per item category (i.e. per matching sub-process).
data["case Item Category"].value_counts()

3-way match, invoice before GR    758324
3-way match, invoice after GR     216177
Consignment                        22864
2-way match                         2617
Name: case Item Category, dtype: int64

The earliest timestamp in the event log is from 1948, which is clearly a data error.

In [37]:
# Earliest event timestamp in the raw log.
data["event time:timestamp"].min()

Timestamp('1948-01-26 23:59:00')

In [38]:
# Latest event timestamp in the raw log.
data["event time:timestamp"].max()

Timestamp('2018-12-08 23:59:00')

In [39]:
# Count the rows stamped before 2018. Summing the boolean mask yields the
# count directly; the original built and sorted a filtered copy of the frame
# only to take its length, and sorting cannot change the number of rows.
int((data["event time:timestamp"] < "2018-01-01").sum())

318

In [40]:
# For the rows stamped before 2018, count the distinct values in every column.
data.loc[data["event time:timestamp"] < "2018-01-01"].nunique()

eventID                             318
case Spend area text                 13
case Company                          2
case Document Type                    3
case Sub spend area text             38
case Purchasing Document             97
case Purch. Doc. Category name        1
case Vendor                          81
case Item Type                        5
case Item Category                    4
case Spend classification text        3
case Source                           1
case Name                            80
case GR-Based Inv. Verif.             2
case Item                            76
case concept:name                   264
case Goods Receipt                    2
event User                            4
event org:resource                    4
event concept:name                    3
event Cumulative net worth (EUR)    190
event time:timestamp                 93
dtype: int64

## Filtering the data, removing and renaming columns

In [41]:
# Keep a single homogeneous sub-process: a row is retained only when its item
# category is "3-way match, invoice before GR", its document type is
# "Standard PO" AND its item type is "Standard".
data_filtered = data[
    (data["case Item Category"] == "3-way match, invoice before GR")
    & (data["case Document Type"] == "Standard PO")
    & (data["case Item Type"] == "Standard")
]

In [42]:
# Retain only the columns used downstream, in the order expected there.
# NOTE(review): "eventID " is spelled with a trailing space — presumably this
# matches the header in the CSV; confirm before "fixing" it.
data_filtered = data_filtered.loc[
    :,
    [
        "case concept:name",
        "case Purchasing Document",
        "eventID ",
        "event concept:name",
        "event time:timestamp",
        "event org:resource",
        "event Cumulative net worth (EUR)",
        "case Vendor",
        "case Spend classification text",
        "case Spend area text",
        "case Sub spend area text",
    ],
]

In [43]:
# Rename the columns to short snake_case names.
# An explicit old -> new mapping is used instead of assigning a positional list
# to `.columns`: the result no longer depends on column order, each new name is
# visibly tied to its source column, and re-running the cell is a no-op.
data_filtered = data_filtered.rename(
    columns={
        "case concept:name": "case_id",
        "case Purchasing Document": "case_PO_id",
        "eventID ": "event_id",  # source header carries a trailing space
        "event concept:name": "event_name",
        "event time:timestamp": "event_timestamp",
        "event org:resource": "event_resource",
        "event Cumulative net worth (EUR)": "event_value_EUR",
        "case Vendor": "case_vendor",
        "case Spend classification text": "case_PR_NPR",
        "case Spend area text": "case_product_type",
        "case Sub spend area text": "case_product_specific",
    }
)

In [44]:
# Keep only events stamped in 2018 or later, discarding the erroneous early
# timestamps. The comparison is inclusive (>=) so that an event at exactly
# 2018-01-01 00:00:00 is kept; the pre-2018 check earlier in this notebook uses
# `< "2018-01-01"`, and a strict `>` here would silently drop that boundary.
# Note: this filters individual events, not whole cases — a case that has some
# events before 2018 keeps its remaining events.
data_filtered = data_filtered[data_filtered["event_timestamp"] >= "2018-01-01"]

In [45]:
# Renumber the rows 0..n-1 after filtering; the old index is discarded.
data_filtered = data_filtered.reset_index(drop=True)

In [46]:
# Display the filtered and renamed event log.
data_filtered

Unnamed: 0,case_id,case_PO_id,event_id,event_name,event_timestamp,event_resource,event_value_EUR,case_vendor,case_PR_NPR,case_product_type,case_product_specific
0,4507000221_00010,4507000221,1430224109568,Create Purchase Order Item,2018-01-01 03:42:00,batch_03,208.0,vendorID_0103,PR,Packaging,Labels
1,4507000221_00010,4507000221,1430224109569,Receive Order Confirmation,2018-02-01 11:04:00,user_029,208.0,vendorID_0103,PR,Packaging,Labels
2,4507000221_00010,4507000221,1430224109570,Record Goods Receipt,2018-05-01 13:36:00,user_030,208.0,vendorID_0103,PR,Packaging,Labels
3,4507000221_00010,4507000221,1430224109571,Record Invoice Receipt,2018-05-01 17:35:00,user_024,208.0,vendorID_0103,PR,Packaging,Labels
4,4507000221_00010,4507000221,1430224109572,Vendor creates invoice,2018-05-01 23:59:00,NONE,208.0,vendorID_0103,PR,Packaging,Labels
...,...,...,...,...,...,...,...,...,...,...,...
721256,4508070768_00010,4508070768,1001285725716480,Vendor creates invoice,2018-08-23 23:59:00,NONE,5415.0,vendorID_0267,PR,Latex & Monomers,Styrene Acrylics
721257,4508072736_00010,4508072736,1029847258234880,Vendor creates invoice,2018-12-03 23:59:00,NONE,866.0,vendorID_0259,OTHER,Others,Raw Material
721258,4508072736_00020,4508072736,1029851553202176,Vendor creates invoice,2018-07-17 23:59:00,NONE,606.0,vendorID_0259,OTHER,Others,Raw Material
721259,4508075167_00010,4508075167,1066019472801792,Vendor creates invoice,2018-06-14 23:59:00,NONE,797.0,vendorID_0314,OTHER,Others,Raw Material


## Importing the complete-case filtered log from ProM to save as .csv

The filtered event log was then filtered further in ProM to keep only completed cases, i.e. cases whose last event is either "Clear Invoice" or "Delete Purchase Order Item".

A second dataset was also created, containing only the completed cases that end with "Clear Invoice".

In [47]:
# Read the ProM-filtered XES log (completed cases only) and convert it to a
# pandas DataFrame for further processing.
log = pm4py.read_xes('event_logs/filtered_event_log_complete.xes')
complete_cases = pm4py.convert_to_dataframe(log)

parsing log, completed traces ::   0%|          | 0/29941 [00:00<?, ?it/s]

In [48]:
# Preview the first five events of the imported log.
complete_cases.head(n=5)

Unnamed: 0,event_id,concept:name,case_PO_id,case_product_type,case_product_specific,event_value_EUR,event_resource,lifecycle:transition,time:timestamp,case_vendor,case_PR_NPR,case:concept:name
0,2280627634176,Create Purchase Order Item,4507000266,Packaging,Labels,134.0,user_054,start,2018-02-01 08:53:00+00:00,vendorID_0103,PR,4507000266_00010
1,2280627634177,Receive Order Confirmation,4507000266,Packaging,Labels,134.0,user_029,start,2018-02-01 10:04:00+00:00,vendorID_0103,PR,4507000266_00010
2,2280627634178,Record Goods Receipt,4507000266,Packaging,Labels,134.0,user_055,start,2018-03-01 09:20:00+00:00,vendorID_0103,PR,4507000266_00010
3,2280627634179,Vendor creates invoice,4507000266,Packaging,Labels,134.0,NONE,start,2018-03-01 22:59:00+00:00,vendorID_0103,PR,4507000266_00010
4,2280627634180,Record Invoice Receipt,4507000266,Packaging,Labels,134.0,user_012,start,2018-04-01 13:15:00+00:00,vendorID_0103,PR,4507000266_00010


In [49]:
# Map the XES attribute names back to the notebook's column names.
# Renaming by name (rather than assigning a positional list to `.columns`)
# keeps the result correct even if the importer returns the attributes in a
# different order, and makes re-running this cell a no-op. Columns not listed
# here already carry their final names.
complete_cases = complete_cases.rename(
    columns={
        "concept:name": "event_name",
        "lifecycle:transition": "remove",  # not needed; dropped by the column selection
        "time:timestamp": "event_timestamp",
        "case:concept:name": "case_id",
    }
)

In [50]:
# Put the columns into the standard order used in this notebook; the helper
# column named "remove" is left out and therefore dropped.
complete_cases = complete_cases.loc[
    :,
    [
        "case_id",
        "case_PO_id",
        "event_id",
        "event_name",
        "event_timestamp",
        "event_resource",
        "event_value_EUR",
        "case_vendor",
        "case_PR_NPR",
        "case_product_type",
        "case_product_specific",
    ],
]

In [51]:
# Number of distinct values per column in the completed-cases log.
complete_cases.nunique()

case_id                   29941
case_PO_id                 8919
event_id                 155209
event_name                   26
event_timestamp           28836
event_resource              302
event_value_EUR            6069
case_vendor                 853
case_PR_NPR                   3
case_product_type            16
case_product_specific        98
dtype: int64

In [52]:
# Distinct event names found in the first row of each case.
# NOTE(review): this is the "start event" only if rows are ordered
# chronologically within each case — confirm.
complete_cases.groupby("case_id")["event_name"].nth(0).unique()

array(['Create Purchase Order Item'], dtype=object)

In [53]:
# Distinct event names found in the last row of each case.
# NOTE(review): this is the "end event" only if rows are ordered
# chronologically within each case — confirm.
complete_cases.groupby("case_id")["event_name"].nth(-1).unique()

array(['Clear Invoice', 'Delete Purchase Order Item'], dtype=object)

---
Creating a log that contains only the cases ending with "Clear Invoice" (i.e. excluding those ending with "Delete Purchase Order Item"):

In [54]:
# Read the second ProM-filtered XES log and convert it to a pandas DataFrame.
# Note that `log` is re-bound here and no longer refers to the previous log.
log = pm4py.read_xes('event_logs/filtered_event_log_complete_clear_invoice.xes')
complete_cases_clear_invoice = pm4py.convert_to_dataframe(log)

parsing log, completed traces ::   0%|          | 0/25877 [00:00<?, ?it/s]

In [55]:
# Map the XES attribute names back to the notebook's column names.
# Renaming by name (rather than assigning a positional list to `.columns`)
# keeps the result correct even if the importer returns the attributes in a
# different order, and makes re-running this cell a no-op.
# Assumes this log exposes the same attribute names as
# filtered_event_log_complete.xes — TODO confirm.
complete_cases_clear_invoice = complete_cases_clear_invoice.rename(
    columns={
        "concept:name": "event_name",
        "lifecycle:transition": "remove",  # not needed; dropped by the column selection
        "time:timestamp": "event_timestamp",
        "case:concept:name": "case_id",
    }
)

In [56]:
# Put the columns into the standard order used in this notebook; the helper
# column named "remove" is left out and therefore dropped.
complete_cases_clear_invoice = complete_cases_clear_invoice.loc[
    :,
    [
        "case_id",
        "case_PO_id",
        "event_id",
        "event_name",
        "event_timestamp",
        "event_resource",
        "event_value_EUR",
        "case_vendor",
        "case_PR_NPR",
        "case_product_type",
        "case_product_specific",
    ],
]

In [57]:
# Number of distinct values per column in the "Clear Invoice" log.
complete_cases_clear_invoice.nunique()

case_id                   25877
case_PO_id                 7447
event_id                 146273
event_name                   26
event_timestamp           25992
event_resource              286
event_value_EUR            5485
case_vendor                 783
case_PR_NPR                   3
case_product_type            16
case_product_specific        96
dtype: int64

In [58]:
# Distinct event names found in the last row of each case in this log.
complete_cases_clear_invoice.groupby("case_id")["event_name"].nth(-1).unique()

array(['Clear Invoice'], dtype=object)