# Event Log Statistics

- Ratio of the number of variants to the number of traces (nb variants / nb traces)


In [1]:
import pandas as pd
import numpy as np
import pm4py

In [18]:
def _get_variant_stats(df, cluster_col, trace_col):
    ratio = []
    variants = []
    for o, df_ in df.groupby(cluster_col):
        vars_per_origin = list(pm4py.statistics.variants.log.get.get_variants(df_).keys())
        variants.append((o, len(vars_per_origin)))
        n_traces_per_origin = len(df_[trace_col].unique())
        ratio_vars_traces = len(vars_per_origin) / n_traces_per_origin
        ratio.append(
            (o, ratio_vars_traces)
        )
    return ratio, variants

In [19]:
def analyze_dataset(
        df,
        trace_col="case:concept:name",  # 'trace_id',
        event_col="concept:name",  # 'event_id',
        timestamp_col='time:timestamp',
        cluster_col="origin",  # 'cluster',
    ):
    """Collect descriptive statistics for an event log stored in a DataFrame.

    Args:
        df: DataFrame holding the event log; must contain ``trace_col``,
            ``event_col`` and ``cluster_col``.
        trace_col: Column identifying the trace each row belongs to.
        event_col: Column holding the event name of each row.
        timestamp_col: Column holding the event timestamp. Currently unused;
            kept so existing callers keep working and for the planned
            timestamp statistics (see TODO in the body).
        cluster_col: Column holding the cluster label of each row.

    Returns:
        A dict with these keys, in insertion order:

        - ``dataset_size``: number of rows in ``df``.
        - ``num_traces``: number of distinct values in ``trace_col``.
        - ``num_clusters``: number of distinct values in ``cluster_col``.
        - ``cluster_distribution``: Series of distinct traces per cluster.
        - ``trace_lengths``: ``describe()`` summary of the per-trace count
          of ``event_col`` values.
        - ``event_frequency``: Series of distinct traces per event name.
        - ``len_vocabulary``: number of distinct event names.
        - ``variant_ratio_per_cluster``: list of ``(cluster, ratio)`` tuples
          from ``_get_variant_stats``.
        - ``num_variants_per_cluster``: list of ``(cluster, count)`` tuples
          from ``_get_variant_stats``.
        - ``cluster_size_distribution``: same values as
          ``cluster_distribution`` (independent copy).
    """
    stats = {}

    stats['dataset_size'] = len(df)
    stats['num_traces'] = df[trace_col].nunique()
    stats['num_clusters'] = df[cluster_col].nunique()

    # Distinct traces per cluster. Computed once and reported under two keys
    # ('cluster_distribution' and 'cluster_size_distribution').
    traces_per_cluster = df.groupby(cluster_col)[trace_col].nunique()
    stats['cluster_distribution'] = traces_per_cluster

    trace_lengths = df.groupby(trace_col)[event_col].count()
    stats['trace_lengths'] = trace_lengths.describe()

    # For each event name: in how many distinct traces does it occur?
    event_freq = df.groupby(event_col)[trace_col].nunique()
    stats['event_frequency'] = event_freq
    stats['len_vocabulary'] = len(event_freq)

    ratios, variants = _get_variant_stats(df, cluster_col, trace_col)
    stats["variant_ratio_per_cluster"] = ratios
    stats["num_variants_per_cluster"] = variants

    # TODO: timestamp statistics ('avg_timestamp' as the mean and
    # 'timestamp_variance' as the variance of pd.to_datetime(df[timestamp_col]))
    # were attempted here but did not work; probably not that important.

    # Copy so the two entries stay independent objects, as they were when the
    # groupby was evaluated twice.
    stats['cluster_size_distribution'] = traces_per_cluster.copy()

    return stats


In [20]:
# Load the synthetic smart-parking event log from its XES file.
# NOTE(review): analyze_dataset is later called on this object and uses
# DataFrame operations (groupby, nunique) -- presumably read_xes returns a
# pandas DataFrame in the installed pm4py version; confirm.
log = pm4py.read_xes("./datasets/synthetic-smart-parking-log.xes")

In [21]:
# The smart-parking log keeps its cluster labels in the "label" column.
dataset_stats = analyze_dataset(log, cluster_col="label")

# Dump every statistic, each one followed by a separator line.
for stat, value in dataset_stats.items():
    print(f"{stat}: {value}", "------", sep="\n")


dataset_size: 134887
------
num_traces: 30000
------
num_clusters: 3
------
cluster_distribution: label
0    10000
1    10000
2    10000
Name: case:concept:name, dtype: int64
------
trace_lengths: count    30000.000000
mean         4.496233
std          2.059497
min          2.000000
25%          2.000000
50%          5.000000
75%          6.000000
max         17.000000
Name: concept:name, dtype: float64
------
event_frequency: concept:name
 Chat/Answer FAQs                                          5062
Book a space                                              10053
Call or online chat/FAQs                                   5039
Find their location                                       10053
Load FSM App                                              10053
Park in parking space                                     10053
Pay by My Wallet (online)                                 10053
Provide availability information                           9896
Receive requirements from motorists        

In [73]:
# Load the downsampled BPI 2013 and the downsampled, reduced BPI 2015 event
# logs from their XES files (file names as given in the paths below).
bpi2013 = pm4py.read_xes("./datasets/bpi2013-downsampled.xes")
bpi2015 = pm4py.read_xes("./datasets/bpi2015-downsampled-reduced.xes")

In [74]:
# BPI 2013 uses the default column names of analyze_dataset.
dataset_stats = analyze_dataset(bpi2013)

# Dump every statistic, each one followed by a separator line.
for stat, value in dataset_stats.items():
    print(f"{stat}: {value}", "------", sep="\n")


dataset_size: 21859
------
num_traces: 3817
------
num_clusters: 3
------
cluster_distribution: origin
closed.xes       1487
incidents.xes    1511
open.xes          819
Name: case:concept:name, dtype: int64
------
trace_lengths: count    3817.000000
mean        5.726749
std         5.694162
min         1.000000
25%         2.000000
50%         4.000000
75%         7.000000
max       116.000000
Name: concept:name, dtype: float64
------
event_frequency: concept:name
Accepted     3797
Completed    3359
Queued       1690
Unmatched      11
Name: case:concept:name, dtype: int64
------
variant_ratio_per_cluster: [(0, 0.1230665770006725), (1, 0.2753143613500993), (2, 0.13186813186813187)]
------
cluster_size_distribution: origin
closed.xes       1487
incidents.xes    1511
open.xes          819
Name: case:concept:name, dtype: int64
------


In [75]:
# BPI 2015 uses the default column names of analyze_dataset.
dataset_stats = analyze_dataset(bpi2015)

# Dump every statistic, each one followed by a separator line.
for stat, value in dataset_stats.items():
    print(f"{stat}: {value}", "------", sep="\n")


dataset_size: 49516
------
num_traces: 1059
------
num_clusters: 5
------
cluster_distribution: origin
mun_1.xes    231
mun_2.xes    156
mun_3.xes    233
mun_4.xes    210
mun_5.xes    229
Name: case:concept:name, dtype: int64
------
trace_lengths: count    1059.000000
mean       46.757318
std        17.023297
min         3.000000
25%        40.000000
50%        45.000000
75%        55.000000
max       132.000000
Name: concept:name, dtype: float64
------
event_frequency: concept:name
01_BB_540       396
01_BB_545         6
01_BB_546         5
01_BB_550         4
01_BB_550_1       2
               ... 
15_NGV_010       71
16_LGSD_010     176
16_LGSV_010     218
99_NOCODE_01      2
99_NOCODE_02     17
Name: case:concept:name, Length: 420, dtype: int64
------
variant_ratio_per_cluster: [(0, 1.0), (1, 1.0), (2, 0.9871244635193133), (3, 1.0), (4, 1.0)]
------
cluster_size_distribution: origin
mun_1.xes    231
mun_2.xes    156
mun_3.xes    233
mun_4.xes    210
mun_5.xes    229
Name: case:conc

In [54]:
# Compute the "language" of the BPI 2013 log via pm4py.
# NOTE(review): judging by the displayed cell output, this is a dict mapping
# each variant (tuple of activity names) to a float, presumably the variant's
# relative frequency among all traces -- confirm against the pm4py docs.
lang2013 = pm4py.statistics.variants.log.get.get_language(bpi2013)

In [None]:
# Collect the distinct variants of the BPI 2013 log; iterating the mapping
# returned by get_variants yields its keys.
variants = list(pm4py.statistics.variants.log.get.get_variants(bpi2013))

In [55]:
# Display the BPI 2013 variant language as the notebook cell output.
lang2013

{('Queued',
  'Accepted',
  'Accepted',
  'Accepted',
  'Completed'): 0.0007859575582918522,
 ('Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Completed'): 0.005501702908042965,
 ('Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Completed'): 0.020434896515588157,
 ('Accepted',
  'Accepted',
  'Queued',
  'Accepted',
  'Accepted',
  'Accepted',
  'Completed'): 0.0034058160859313596,
 ('Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Accepted',
  'Completed'): 0.004715745349751114,
 ('Accepted',
  'Accepted',
  'Accepted',
  'Queued',
  'Accepted',
  'Queued',
  'Accepted',
  'Completed'): 0.0007859575582918522,
 ('Accepted', 'Accepted', 'Completed'): 0.16190725700812156,
 ('Accepted',
  'Queued',
  'Accepted',
  'Queued',
  'Accepted',
  'Accepted',
  'Queued',
  'Accepted',
  'Completed'): 0.00026198585276395077,
 ('Accepted', 'Unmatched', 'Completed'): 0.0007859575582918522,
 ('Queued',
  'Accepted',
  'Accepted',
  'Queued',
  'Que