In [4]:
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as log_converter
import pandas as pd
import numpy as np

In [5]:
# Read the gzipped XES event log from disk.
xes_path = "../data/BPIC2017.xes.gz"
log = xes_importer.apply(xes_path)

# Convert the imported log into a pandas DataFrame; the cells below treat
# each row of this frame as one event.
dataframe_variant = log_converter.Variants.TO_DATA_FRAME
df = log_converter.apply(log, variant=dataframe_variant)

parsing log, completed traces ::   0%|          | 0/31509 [00:00<?, ?it/s]

## 3.2 Basic Analysis

This section collects a few basic statistics about the BPIC2017 event log.
These values give a simple overview of the size and structure of the event log.

The following metrics are included:
- number of cases
- number of events
- number of process variants
- number of case and event labels
- mean and standard deviation of case length
- mean and standard deviation of case duration (days, minutes, seconds)
- number of categorical event attributes

In [6]:
from pm4py.statistics.traces.generic.log import case_statistics

# The case identifier column is needed for two of the metrics below.
case_ids = df['case:concept:name']

# Number of cases: distinct case identifiers in the log.
num_cases = case_ids.nunique()

# Number of events: every row of the DataFrame is one event.
num_events = df.shape[0]

# Number of process variants: length of the variant statistics computed by pm4py.
variant_stats = case_statistics.get_variant_statistics(log)
num_variants = len(variant_stats)

# Number of case labels (distinct case identifiers) and
# number of event labels (distinct activity names).
num_case_labels = case_ids.nunique()
num_event_labels = df['concept:name'].nunique()

(num_cases, num_events, num_variants, num_case_labels, num_event_labels)


(31509, 1202267, 15930, 31509, 26)

### Case Length

In [7]:
# Case length = number of events recorded for a case.
events_by_case = df.groupby('case:concept:name')
case_lengths = events_by_case['concept:name'].count()

# Mean and (sample) standard deviation across all cases.
case_length_mean, case_length_std = case_lengths.mean(), case_lengths.std()

(case_length_mean, case_length_std)

(np.float64(38.15630454790695), np.float64(16.71530805656438))

### Case Duration

In [16]:
# Case duration = time elapsed between the first and the last event of a case.
#
# The timestamp column is converted to datetime in place, exactly as before,
# so any later cell reading df['time:timestamp'] sees the same column.
df['time:timestamp'] = pd.to_datetime(df['time:timestamp'])

# Build the grouped timestamp column once and reuse it for both boundaries.
timestamps_by_case = df.groupby('case:concept:name')['time:timestamp']
case_start = timestamps_by_case.min()
case_end = timestamps_by_case.max()
case_duration = case_end - case_start

# Convert the timedeltas to seconds a single time. Mean and standard deviation
# are computed once in seconds; the minute and day figures are the same two
# aggregates divided by the unit size, which is the arithmetic the cell
# previously repeated for every unit.
# (numpy is not used here and is already imported in the notebook's first
# cell, so the duplicate import was dropped.)
case_duration_seconds = case_duration.dt.total_seconds()
duration_seconds_mean = case_duration_seconds.mean()
duration_seconds_std = case_duration_seconds.std()

duration_minutes_mean = duration_seconds_mean / 60
duration_minutes_std = duration_seconds_std / 60

duration_days_mean = duration_seconds_mean / (60*60*24)
duration_days_std = duration_seconds_std / (60*60*24)

# Display order: days, minutes, seconds (mean followed by std for each unit).
(duration_days_mean, duration_days_std,
 duration_minutes_mean, duration_minutes_std,
 duration_seconds_mean, duration_seconds_std)


(np.float64(21.899605591087365),
 np.float64(13.1692325332843),
 np.float64(31535.432051165804),
 np.float64(18963.694847929393),
 np.float64(1892125.9230699483),
 np.float64(1137821.6908757635))

### Categorical Event Attributes

In [5]:
# Columns stored with pandas' object dtype are treated as categorical attributes.
# NOTE(review): this selection is not restricted to event-level columns; an
# object-typed case column such as 'case:concept:name' would be counted too.
# Confirm that this matches the intended "event attributes" metric.
object_columns_frame = df.select_dtypes(include=['object'])
categorical_cols = object_columns_frame.columns

num_categorical = len(categorical_cols)
num_categorical

12

### Attribute Distributions
To get an idea of how the process behaves, the frequency of activities and resources
is shown below. This helps to see which steps occur most often and how the workload
is distributed across users.

Only the most common values are displayed for readability.

#### Activity Frequency

In [7]:
# Frequency of each activity name, most frequent first.
activity_names = df['concept:name']
activity_counts = activity_names.value_counts()

# Show only the 20 most frequent activities for readability.
activity_counts.head(n=20)

concept:name
W_Validate application      209496
W_Call after offers         191092
W_Call incomplete files     168529
W_Complete application      148900
W_Handle leads               47264
O_Create Offer               42995
O_Created                    42995
O_Sent (mail and online)     39707
A_Validating                 38816
A_Create Application         31509
A_Concept                    31509
A_Accepted                   31509
A_Complete                   31362
O_Returned                   23305
A_Incomplete                 23055
O_Cancelled                  20898
A_Submitted                  20423
O_Accepted                   17228
A_Pending                    17228
A_Cancelled                  10431
Name: count, dtype: int64

#### Resource Frequency

In [8]:
# Number of events recorded per resource, most active first.
resources = df['org:resource']
resource_counts = resources.value_counts()

# Show only the 20 most active resources for readability.
resource_counts.head(n=20)

org:resource
User_1      148404
User_3       26342
User_5       22900
User_87      22498
User_30      21272
User_49      21134
User_123     20909
User_29      20860
User_100     20651
User_2       19134
User_27      18806
User_121     18726
User_28      18412
User_68      17581
User_116     17423
User_10      16365
User_113     16151
User_118     16005
User_75      15955
User_41      15481
Name: count, dtype: int64

### Additional Metric 1: Average Number of Offer Events per Case

In [19]:
# Offer-related events are the ones whose activity name starts with 'O_'.
is_offer_event = df['concept:name'].str.startswith('O_')
offer_events = df.loc[is_offer_event]

# Count offer events per case; only cases with at least one such event
# appear in the grouped result.
offer_events_by_case = offer_events.groupby('case:concept:name')
offers_per_case = offer_events_by_case['concept:name'].count()

mean_offers, median_offers = offers_per_case.mean(), offers_per_case.median()

print("Average number of offer events per case:", round(mean_offers, 2))
print("Median number of offer events per case:", median_offers)


Average number of offer events per case: 6.15
Median number of offer events per case: 5.0


### Additional Metric 2: Share of A, O, and W Events (in %)

In [20]:
# Count events per activity family, identified by the activity-name prefix
# ('A_', 'O_', 'W_').
activity_names = df['concept:name']
a_count, o_count, w_count = (
    activity_names.str.startswith(prefix).sum()
    for prefix in ('A_', 'O_', 'W_')
)

total_events = len(df)

# Share of each family among all events, expressed as a percentage.
a_ratio, o_ratio, w_ratio = (
    family_count / total_events * 100
    for family_count in (a_count, o_count, w_count)
)

(a_ratio, o_ratio, w_ratio)

(np.float64(19.92860155023801),
 np.float64(16.123623121985382),
 np.float64(63.94777532777661))