In [1]:
import pandas as pd
from collections import Counter

In [2]:
#!pip install pandas

In [3]:
# Load CSV file
#df = pd.read_csv('../event_log/wastewater/log_5000_all_noise_30.csv')
df = pd.read_csv('replicated_even_variant_log.csv')

In [4]:
CASE_ID_COL = 'case'
ACTIVITY_COL = 'activity'
TIMESTAMP_COL = 'timestamp'

In [5]:
# Total traces
n_traces = df[CASE_ID_COL].nunique()

In [6]:
# Total events
n_events = len(df)

In [7]:
# Activities
activities = df[ACTIVITY_COL].unique()
n_activities = len(activities)

In [8]:
# Trace variants
trace_variants = df.groupby(CASE_ID_COL)[ACTIVITY_COL].apply(tuple)
variant_counts = trace_variants.value_counts()
n_variants = len(variant_counts)

In [9]:
# Pareto variants (e.g., top 80% of traces)
cum_sum = variant_counts.cumsum() / variant_counts.sum()
n_pareto_variants = (cum_sum <= 0.8).sum()

In [10]:
# Length stats
trace_lengths = trace_variants.apply(len)
min_length = trace_lengths.min()
max_length = trace_lengths.max()
mean_length = trace_lengths.mean()

In [11]:
# Characterization (number of unique attributes)
trace_attributes = df.drop_duplicates(subset=[CASE_ID_COL])
event_attributes = df.drop_duplicates()

trace_character_count = len(trace_attributes.columns) - 1  # Exclude case_id
event_character_count = len(event_attributes.columns) - 2  # Exclude case_id, timestamp


In [12]:
# --- test ---
print(f'#Trace: {n_traces}')
print(f'#Events: {n_events}')
print(f'#Variants: {n_variants}')
print(f'#ParetoVariants: {n_pareto_variants}')
print(f'#Activities: {n_activities}')
print(f'Min Length: {min_length}')
print(f'Max Length: {max_length}')
print(f'Mean Length: {mean_length:.0f}')
print(f'#Trace Character.: {trace_character_count}')
print(f'#Event Character.: {event_character_count}')

#Trace: 5370
#Events: 35545
#Variants: 1019
#ParetoVariants: 804
#Activities: 1035
Min Length: 3
Max Length: 10
Mean Length: 7
#Trace Character.: 2
#Event Character.: 1
