In [1]:
import pandas as pd
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter

# Exploration of the BPIC 2017 Event Log

This notebook provides an exploration of the BPIC2017 event log.
The goal is to develop an understanding of the event structure, the case composition,
and the variability of the process.

In [3]:
# Parse the gzipped XES event log, then convert the pm4py log object
# into a pandas DataFrame for the exploration below.
xes_path = "../data/BPIC2017.xes.gz"
log = xes_importer.apply(xes_path)
df = log_converter.apply(
    log,
    variant=log_converter.Variants.TO_DATA_FRAME,
)

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

### Columns

In [5]:
# List every column name, then preview the first five events.
column_names = list(df.columns)
print(column_names)
df.head(5)

['Action', 'org:resource', 'concept:name', 'EventOrigin', 'EventID', 'lifecycle:transition', 'time:timestamp', 'case:LoanGoal', 'case:ApplicationType', 'case:concept:name', 'case:RequestedAmount', 'FirstWithdrawalAmount', 'NumberOfTerms', 'Accepted', 'MonthlyCost', 'Selected', 'CreditScore', 'OfferedAmount', 'OfferID']


Unnamed: 0,Action,org:resource,concept:name,EventOrigin,EventID,lifecycle:transition,time:timestamp,case:LoanGoal,case:ApplicationType,case:concept:name,case:RequestedAmount,FirstWithdrawalAmount,NumberOfTerms,Accepted,MonthlyCost,Selected,CreditScore,OfferedAmount,OfferID
0,Created,User_1,A_Create Application,Application,Application_652823628,complete,2016-01-01 09:51:15.304000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
1,statechange,User_1,A_Submitted,Application,ApplState_1582051990,complete,2016-01-01 09:51:15.352000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
2,Created,User_1,W_Handle leads,Workflow,Workitem_1298499574,schedule,2016-01-01 09:51:15.774000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
3,Deleted,User_1,W_Handle leads,Workflow,Workitem_1673366067,withdraw,2016-01-01 09:52:36.392000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,
4,Created,User_1,W_Complete application,Workflow,Workitem_1493664571,schedule,2016-01-01 09:52:36.403000+00:00,Existing loan takeover,New credit,Application_652823628,20000.0,,,,,,,,


### Inspecting a Typical Case (Random)


In [6]:
# Sample uniformly over distinct cases. Sampling straight from the event-level
# column would pick a random *event*, which makes cases with many events
# proportionally more likely to be selected and skews the "typical" case
# towards long ones. Intentionally unseeded so each run inspects another case.
case_ids = df['case:concept:name'].drop_duplicates()
one_case_id = case_ids.sample(1).iloc[0]
print("Random case ID:", one_case_id)

Random case ID: Application_754734698


A random case is shown below, sorted by timestamp and displaying only the activity, lifecycle transition, resource, and event time.

In [7]:
# Select every event of the sampled case and order the events chronologically.
is_selected_case = df['case:concept:name'] == one_case_id
case_df = df.loc[is_selected_case].sort_values(by='time:timestamp')

# Events per case (non-null activity names), then look up the sampled case.
case_lengths = df.groupby('case:concept:name')['concept:name'].count()
one_case_id_length = case_lengths.at[one_case_id]

print("Number of events in this case:", one_case_id_length)

# Preview at most the first 20 events, restricted to four descriptive columns.
preview_columns = ['time:timestamp', 'concept:name', 'lifecycle:transition', 'org:resource']
case_df[preview_columns].head(20)


Number of events in this case: 31


Unnamed: 0,time:timestamp,concept:name,lifecycle:transition,org:resource
805579,2016-09-06 09:34:13.979000+00:00,A_Create Application,complete,User_1
805580,2016-09-06 09:34:14.011000+00:00,A_Submitted,complete,User_1
805581,2016-09-06 09:34:14.179000+00:00,W_Handle leads,schedule,User_1
805582,2016-09-06 09:35:25.003000+00:00,W_Handle leads,withdraw,User_1
805583,2016-09-06 09:35:25.009000+00:00,W_Complete application,schedule,User_1
805584,2016-09-06 09:35:25.013000+00:00,A_Concept,complete,User_1
805585,2016-09-08 11:54:03.317000+00:00,W_Complete application,start,User_49
805586,2016-09-08 11:57:18.567000+00:00,A_Accepted,complete,User_49
805587,2016-09-08 11:58:39.185000+00:00,O_Create Offer,complete,User_49
805588,2016-09-08 11:58:39.726000+00:00,O_Created,complete,User_49


Re-running the random case selection typically yields applications with different event counts, which illustrates how variable the process is.

## Shortest and Longest Cases

To illustrate the variability within the event log,
the cases with the minimum and maximum number of events are analyzed.

- The **shortest case** shows the minimal path through the process
  and typically represents loan applications that were processed quickly
  with few interactions or interruptions.

- The **longest case** shows a highly complex process instance
  involving numerous work-item transitions or repeated activity cycles,
  often indicating delays, rework, or intensive human interaction.

Inspecting both extremes provides an intuitive understanding of
the range of process behaviors present in the event log.

### Shortest Case

The shortest case is displayed below, ordered by timestamp, to show how a minimal loan application unfolds.


In [7]:
# Number of events per case (counting non-null activity names).
case_lengths = df.groupby('case:concept:name')['concept:name'].count()

# idxmin returns the label of the first case with the fewest events.
short_case_id = case_lengths.idxmin()
short_case_event_count = case_lengths.loc[short_case_id]

# This cell reports the *shortest* case, so the label must say so.
print("Shortest case ID:", short_case_id)
print("Number of events in this case:", short_case_event_count)

# Show the complete trace of the shortest case in chronological order.
short_case = df[df['case:concept:name'] == short_case_id].sort_values('time:timestamp')
short_case[['time:timestamp', 'concept:name', 'lifecycle:transition', 'org:resource']]

Longest case ID: Application_1045162022
Number of events in this case: 10


Unnamed: 0,time:timestamp,concept:name,lifecycle:transition,org:resource
925437,2016-10-08 10:33:07.161000+00:00,A_Create Application,complete,User_37
925438,2016-10-08 10:33:07.163000+00:00,A_Concept,complete,User_37
925439,2016-10-08 10:33:07.168000+00:00,W_Complete application,schedule,User_37
925440,2016-10-08 10:33:07.169000+00:00,W_Complete application,start,User_37
925441,2016-10-08 10:33:45.088000+00:00,A_Accepted,complete,User_37
925442,2016-10-08 10:36:51.448000+00:00,O_Create Offer,complete,User_37
925443,2016-10-08 10:36:52.015000+00:00,O_Created,complete,User_37
925444,2016-10-08 10:40:42.577000+00:00,A_Cancelled,complete,User_37
925445,2016-10-08 10:40:42.594000+00:00,O_Cancelled,complete,User_37
925446,2016-10-08 10:40:42.601000+00:00,W_Complete application,complete,User_37


### Longest Case

The longest case is shown below, sorted by timestamp; only the first 40 events are displayed.




In [11]:
# Row count per case, i.e. the number of events recorded for each application.
case_lengths = df.groupby("case:concept:name").size()

# Identify the case with the most events and look up its event count.
longest_case_id = case_lengths.idxmax()
longest_case_length = case_lengths.loc[longest_case_id]

print("Longest case ID:", longest_case_id)
print("Number of events:", longest_case_length)



Longest case ID: Application_1219772874
Number of events: 180
