In [1]:
import os

import ipywidgets as widgets
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import pm4py
import streamlit as st
from IPython.display import display
from scipy.stats import chi2_contingency



In [2]:
# Column names of the raw CSV that identify the case, the event time and the activity.
CASE_ID_COL = 'Case ID'
TIMESTAMP_COL = 'Complete Timestamp'
ACTIVITY_COL = 'Activity'

# Location of the event log CSV. Set the RTFM_LOG_PATH environment variable to
# point at your own copy; the fallback is the original machine-specific path,
# kept so existing setups continue to work unchanged.
path_to_log = os.environ.get(
    "RTFM_LOG_PATH",
    r"C:\Users\20232378\Documents\phd beginning\logview-1.0.1\notebooks\dataset\Road_Traffic_Fine_Management_Process.csv",
)

# Read the log with identifier-like columns kept as strings and the timestamp
# parsed, then order events chronologically within each case in one chain so
# re-running the cell always starts from the file contents.
df = (
    pd.read_csv(path_to_log, dtype={'Resource': str, 'matricola': str}, parse_dates=[TIMESTAMP_COL])
      .sort_values([CASE_ID_COL, TIMESTAMP_COL], ignore_index=True)
)

# Hand the frame to pm4py, telling it which columns hold case id, activity and timestamp.
log_rtfm = pm4py.format_dataframe(df, case_id=CASE_ID_COL, activity_key=ACTIVITY_COL, timestamp_key=TIMESTAMP_COL)

In [3]:
selected_activity = 'Insert Fine Notification'

# Ids of every case in which the selected activity occurs at least once.
matching_case_ids = log_rtfm.loc[log_rtfm[ACTIVITY_COL] == selected_activity, CASE_ID_COL].unique()

# Split the log case-wise: all events of matching cases form the result set,
# all events of the remaining cases form the complement. The membership mask
# is computed once and reused for both halves.
in_result_set = log_rtfm[CASE_ID_COL].isin(matching_case_ids)
result_set = log_rtfm[in_result_set]
complement = log_rtfm[~in_result_set]

print(f"Cases in result set: {result_set[CASE_ID_COL].nunique()}")
print(f"Cases in complement: {complement[CASE_ID_COL].nunique()}")


Cases in result set: 79860
Cases in complement: 70510


In [4]:
# Preview the first events of the result set (cases that contain the selected activity).
result_set.head(4)

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,amount,article,dismissal,expense,...,notificationType,paymentAmount,points,totalPaymentAmount,vehicleClass,case:concept:name,concept:name,time:timestamp,@@index,@@case_index
2,A100,Create Fine,561.0,2006-08-02 00:00:00+00:00,Variant 1,1,35.0,157.0,NIL,,...,,,0.0,0.0,A,A100,Create Fine,2006-08-02 00:00:00+00:00,2,1
3,A100,Send Fine,,2006-12-12 00:00:00+00:00,Variant 1,1,,,,11.0,...,,,,,,A100,Send Fine,2006-12-12 00:00:00+00:00,3,1
4,A100,Insert Fine Notification,,2007-01-15 00:00:00+00:00,Variant 1,1,,,,,...,P,,,,,A100,Insert Fine Notification,2007-01-15 00:00:00+00:00,4,1
5,A100,Add penalty,,2007-03-16 00:00:00+00:00,Variant 1,1,71.5,,,,...,,,,,,A100,Add penalty,2007-03-16 00:00:00+00:00,5,1


In [5]:
# Preview the first events of the complement (cases that never contain the selected activity).
complement.head(4)

Unnamed: 0,Case ID,Activity,Resource,Complete Timestamp,Variant,Variant index,amount,article,dismissal,expense,...,notificationType,paymentAmount,points,totalPaymentAmount,vehicleClass,case:concept:name,concept:name,time:timestamp,@@index,@@case_index
0,A1,Create Fine,561.0,2006-07-24 00:00:00+00:00,Variant 3,3,35.0,157.0,NIL,,...,,,0.0,0.0,A,A1,Create Fine,2006-07-24 00:00:00+00:00,0,0
1,A1,Send Fine,,2006-12-05 00:00:00+00:00,Variant 3,3,,,,11.0,...,,,,,,A1,Send Fine,2006-12-05 00:00:00+00:00,1,0
23,A10005,Create Fine,537.0,2007-03-20 00:00:00+00:00,Variant 2,2,36.0,157.0,NIL,,...,,,0.0,0.0,A,A10005,Create Fine,2007-03-20 00:00:00+00:00,23,5
24,A10005,Payment,,2007-03-21 00:00:00+00:00,Variant 2,2,,,,,...,,36.0,,36.0,,A10005,Payment,2007-03-21 00:00:00+00:00,24,5


In [21]:
def plot_activity_exclusivity(result_set, complement, activity_col='Activity', case_col='Case ID'):
    """Build a grouped bar chart of how many distinct cases contain each activity.

    For every activity, the number of unique ``case_col`` values is counted
    separately in ``result_set`` and in ``complement``. An activity that is
    absent from one of the two frames gets a count of 0 for that frame, so
    activities exclusive to one side show up as a single bar.

    Args:
        result_set: Event-level DataFrame of the selected cases.
        complement: Event-level DataFrame of the remaining cases.
        activity_col: Name of the column holding the activity label.
        case_col: Name of the column holding the case identifier.

    Returns:
        The Plotly Express bar figure; nothing is displayed by this function.
    """
    labelled_frames = {"Result Set": result_set, "Complement": complement}
    case_counts = pd.concat(
        [
            frame.groupby(activity_col)[case_col].nunique().rename(label)
            for label, frame in labelled_frames.items()
        ],
        axis=1,
    )

    df_plot = (
        case_counts
        .fillna(0)                  # activity missing on one side -> zero cases
        .astype("int64")            # the NaN fill upcasts to float; counts are integers
        .rename_axis(activity_col)  # guarantee the column name melt() looks up below
        .reset_index()
    )

    # Long format: one row per (activity, set) pair, as px.bar expects for colour grouping.
    df_melt = df_plot.melt(id_vars=activity_col, var_name="Set", value_name="Cases")

    fig = px.bar(df_melt, x=activity_col, y="Cases", color="Set", barmode='group',
                 title="Activity Exclusivity (Unique Cases per Activity)",
                 color_discrete_map={"Result Set": "blue", "Complement": "orange"},
                 width=550, height=400)
    fig.update_layout(
        xaxis_tickangle=-45,
        margin=dict(l=50, r=50, t=50, b=200),  # large bottom margin leaves room for the rotated tick labels
        xaxis=dict(
            tickfont=dict(size=9),
            title=dict(text=activity_col, standoff=25)  # space between axis title and ticks
        ),
        yaxis=dict(
            tickfont=dict(size=9),
            title=dict(text="Cases", standoff=10)
        )
    )

    return fig


In [24]:
def plot_numeric_distribution(result_set, complement, numeric_col, case_col='Case ID', timestamp_col='Complete Timestamp'):
    """Overlay histograms of a numeric attribute for the result set and the complement.

    Each case contributes one value: the last non-missing ``numeric_col``
    value of that case when its events are ordered by ``timestamp_col``.
    Cases that never carry a value for ``numeric_col`` are left out.

    Args:
        result_set: Event-level DataFrame of the selected cases.
        complement: Event-level DataFrame of the remaining cases.
        numeric_col: Name of the numeric column to plot.
        case_col: Name of the column holding the case identifier.
        timestamp_col: Name of the column used to order events within a case.

    Returns:
        A ``go.Figure`` with one semi-transparent histogram per set; nothing
        is displayed by this function.
    """
    def get_latest_values(df):
        # Drop rows without a value first so only populated rows are sorted,
        # and aggregate just the plotted column instead of every column.
        return (
            df.dropna(subset=[numeric_col])
              .sort_values([case_col, timestamp_col])
              .groupby(case_col)[numeric_col]
              .last()
        )

    fig = go.Figure()

    traces = (
        ('Result Set', 'blue', result_set),
        ('Complement', 'orange', complement),
    )
    for label, color, frame in traces:
        fig.add_trace(go.Histogram(
            x=get_latest_values(frame),
            opacity=0.6,
            name=label,
            marker_color=color,
            # Overlaid traces are binned independently unless they share a
            # bingroup; shared bins keep the two distributions comparable.
            bingroup='latest_values',
        ))

    fig.update_layout(
        barmode='overlay',
        title=f"Distribution of {numeric_col} (Latest per Case)",
        width=550,
        height=400,
        margin=dict(l=50, r=50, t=50, b=100),
        xaxis=dict(
            tickfont=dict(size=9),
            title=dict(text=numeric_col, standoff=25)
        ),
        yaxis=dict(
            tickfont=dict(size=9),
            title=dict(text="Number of Cases", standoff=10)
        )
    )

    return fig

In [32]:
def show_dashboard(result_set, complement, numeric_options):
    """Display a two-panel comparison dashboard for the result set and its complement.

    The left panel shows the activity-exclusivity bar chart. The right panel
    holds a dropdown of numeric attributes and, beneath it, the distribution
    plot of the currently selected attribute, which is redrawn whenever the
    selection changes.

    Args:
        result_set: Event-level DataFrame of the selected cases.
        complement: Event-level DataFrame of the remaining cases.
        numeric_options: Options for the dropdown (column names to choose from).
            If empty, the right panel shows only the empty dropdown.

    Returns:
        None. The widgets are displayed as a side effect.
    """
    activity_fig = plot_activity_exclusivity(result_set, complement)

    dropdown = widgets.Dropdown(
        options=numeric_options,
        description='Numeric Attribute:',
        # Let the label take the width it needs instead of being truncated.
        style={'description_width': 'initial'},
        layout=widgets.Layout(width='50%')
    )

    out_left = widgets.Output()
    out_right = widgets.Output()

    with out_left:
        display(activity_fig)

    def render_distribution(numeric_col):
        # Single rendering path for both the initial plot and later changes.
        fig = plot_numeric_distribution(result_set, complement, numeric_col=numeric_col)
        out_right.clear_output(wait=True)
        with out_right:
            display(fig)

    def on_change(change):
        render_distribution(change['new'])

    # Only react to changes of the selected value, not to every trait update.
    dropdown.observe(on_change, names='value')

    # The dropdown pre-selects its first option; render that one up front.
    # With no options the value is None and nothing is drawn.
    if dropdown.value is not None:
        render_distribution(dropdown.value)

    display(widgets.HBox([
        out_left,
        widgets.VBox([dropdown, out_right])
    ]))


In [33]:
# Offer every numeric column of the loaded event log as a dropdown choice,
# then render the comparison dashboard for the two case sets.
numeric_columns = list(df.select_dtypes(include='number').columns)
show_dashboard(result_set, complement, numeric_options=numeric_columns)

HBox(children=(Output(), VBox(children=(Dropdown(description='Numeric Attribute:', layout=Layout(width='50%'),…