In [5]:
# %% [markdown]
# # import dependencies

# %%
#log section
from pm4py.objects.log.importer.xes import importer as xes_importer
#export log to csv section
import pandas as pd
from pm4py.objects.conversion.log import converter as log_converter
import numpy as np
#create feature section

# %%
# Names of the event logs to process. Each name is used to build the path of
# the .xes input file and the names of the generated output files.
LIST_DATA_SETS = [
    'DomesticDeclarations',
    'InternationalDeclarations',
    'Road_Traffic_Fine_Management_Process',
    'PermitLog',
    'PrepaidTravelCost',
    'RequestForPayment',
]


for DATASET_NAME in LIST_DATA_SETS:
    # %% [markdown]
    # # import log

    # %%
    # Announce which dataset is being processed, then parse its XES file.
    print(DATASET_NAME)
    print('load xes...')
    xes_path = f'../../src/datasets/datasets for tagging/{DATASET_NAME}.xes'
    log = xes_importer.apply(xes_path)

    # %%
    #print(log[0])

    # %%
    #print(log[0][0])

    # %%


    # %% [markdown]
    # # export log to csv

    # %%
    # Convert the imported log to a DataFrame and order the rows by case id,
    # then by timestamp, so that the events of a case are contiguous.
    df = (
        log_converter
        .apply(log, variant=log_converter.Variants.TO_DATA_FRAME)
        .sort_values(by=['case:concept:name', 'time:timestamp'])
    )


    # %%
    df

    # %%
    # Time elapsed between each row and the row directly above it.
    df['time_delta'] = df['time:timestamp'] - df['time:timestamp'].shift()

    # %%
    print('calculate NaT...')
    df = df.reset_index(drop=True)

    # A row whose case id differs from the case id of the row above it is the
    # first event of its case; the delta computed for it spans two different
    # cases, so it is blanked out. The comparison against the shifted column
    # marks exactly the rows the former row-by-row iterrows() loop rewrote
    # (plus the very first row, whose delta is already NaT), but runs as a
    # single vectorised operation instead of one Python iteration per event.
    case_ids = df['case:concept:name']
    is_first_event_of_case = case_ids != case_ids.shift()
    df.loc[is_first_event_of_case, 'time_delta'] = pd.NaT

    # %%
    df

    # %%
    # Evaluated but not shown: a bare expression inside a for loop produces no
    # cell output.
    np.mean(df['time_delta'])

    # %%
    # Store the event table (including time_delta) as the per-dataset baseline.
    baseline_path = f'../../src/datasets/{DATASET_NAME}_baseline_dataset.pkl'
    df.to_pickle(baseline_path)

    # %%
    df.head()

    # %% [markdown]
    # # create features
    print('create features...')
    # %%
    # Read the baseline back from the file that was written above.
    df = pd.read_pickle(baseline_path)

    # %%
    df

    # %% [markdown]
    # **execution frequency**

    # %%
    # Total number of rows (events) in the table.
    number_of_all_activities = len(df['concept:name'])
    number_of_all_activities

    # %%
    # Occurrence count per activity name, computed once and reused below.
    series_activity_counts = df['concept:name'].value_counts()
    number_of_unique_activities = len(series_activity_counts.index)
    number_of_unique_activities

    # %%
    # Share of all events that belongs to each activity.
    # NOTE: the column key 'execution_frequncy' is misspelled; it is kept as is
    # because this column is written to the exported CSV under that name.
    df_execution_frequency = pd.DataFrame({
        'concept:name': series_activity_counts.index,
        'execution_frequncy': series_activity_counts.values / number_of_all_activities,
    })
    df_execution_frequency

    # %% [markdown]
    # **execution time**

    # %%
    # median
    # Both aggregates below are taken over the same per-activity grouping of
    # the time_delta column.
    time_delta_by_activity = df.groupby(['concept:name'])['time_delta']
    series_execution_time_median = time_delta_by_activity.median()
    df_execution_time_median = pd.DataFrame({
        'concept:name': series_execution_time_median.index,
        'execution_time_median': series_execution_time_median.values,
    })
    df_execution_time_median

    # %%
    #mean
    series_execution_time_mean = time_delta_by_activity.mean()
    df_execution_time_mean = pd.DataFrame({
        'concept:name': series_execution_time_mean.index,
        'execution_time_mean': series_execution_time_mean.values,
    })
    df_execution_time_mean

    # %%
    #std
    # Work on a copy so the time_delta column of df itself is not replaced;
    # each delta is reduced to its `.days` attribute before aggregating.
    df_copy = df.copy()
    df_copy['time_delta'] = df_copy['time_delta'].apply(lambda delta: delta.days)
    days_by_activity = df_copy.groupby(['concept:name'])['time_delta']

    series_execution_time_std = days_by_activity.std()
    df_execution_time_std = pd.DataFrame({
        'concept:name': series_execution_time_std.index,
        'execution_time_std_days': series_execution_time_std.values,
    })
    df_execution_time_std

    # %%
    # skew (computed on the same day-valued deltas as the std above)
    series_execution_time_skew = days_by_activity.skew()
    df_execution_time_skew = pd.DataFrame({
        'concept:name': series_execution_time_skew.index,
        'execution_time_skew': series_execution_time_skew.values,
    })
    df_execution_time_skew

    # %% [markdown]
    # **failure rate**

    # %%
    series_value_counts_activities = df['concept:name'].value_counts()
    series_value_counts_activities

    # %%
    # Number of distinct cases in which each activity occurs at least once,
    # ordered like series_value_counts_activities.
    # Counting case ids per activity compares whole activity names. The former
    # substring search over the concatenated activity names of a case also
    # matched an activity whose name is contained in another activity's name
    # or happens to straddle the boundary between two concatenated names, and
    # it rebuilt the concatenation for every activity.
    series_traces_per_activity = (
        df.groupby('concept:name')['case:concept:name']
        .nunique()
        .reindex(series_value_counts_activities.index)
    )
    list_failure_rate_absolute = series_traces_per_activity.tolist()

    # Executions of the activity divided by the number of cases containing it.
    list_failure_rate = series_value_counts_activities.values / series_traces_per_activity.values

    df_failure_rate = pd.DataFrame({
        'concept:name': series_value_counts_activities.index,
        'failure_rate': list_failure_rate,
    })
    df_failure_rate

    # %% [markdown]
    # **number of resources**

    # %%
    # Count the distinct org:resource values recorded for each activity.
    resources_by_activity = df.groupby(['concept:name'])['org:resource']
    series_number_of_resources_per_activity = resources_by_activity.nunique()
    df_number_of_resources = pd.DataFrame({
        'concept:name': series_number_of_resources_per_activity.index,
        'number_of_resources': series_number_of_resources_per_activity.values,
    })
    df_number_of_resources


    # %%


    # %% [markdown]
    # **task maturity**

    # %%
    list_activities = df['concept:name'].unique()

    # %%
    # Direct predecessor and successor of every event, looked up inside the
    # event's own case (rows are shifted per case id, in row order).
    # Pairing consecutive rows of the whole frame, as was done before, also
    # linked the last event of one case to the first event of the next case;
    # those pairs are not transitions of any trace and depended only on the
    # order in which the cases happen to be sorted.
    activity_by_case = df.groupby('case:concept:name')['concept:name']
    df_transitions = pd.DataFrame({
        'concept:name': df['concept:name'],
        'predecessor': activity_by_case.shift(1),
        'successor': activity_by_case.shift(-1),
    })

    # Number of distinct direct predecessors / successors per activity, in the
    # order of list_activities. Missing neighbours (first / last event of a
    # case) are not counted, so an activity that only ever starts a case has
    # 0 predecessors and one that only ever ends a case has 0 successors.
    series_number_of_predecessors = (
        df_transitions.groupby('concept:name')['predecessor']
        .nunique()
        .reindex(list_activities, fill_value=0)
    )
    series_number_of_successors = (
        df_transitions.groupby('concept:name')['successor']
        .nunique()
        .reindex(list_activities, fill_value=0)
    )

    # %%
    #deterministic preceding
    # True when the activity has exactly one distinct direct predecessor.
    list_has_same_preceding = [count == 1 for count in series_number_of_predecessors.tolist()]

    # %%
    df_deterministic_p = pd.DataFrame({'concept:name':list_activities,'deterministic_p': list_has_same_preceding})
    df_deterministic_p

    # %%
    #deterministic following
    # True when the activity has exactly one distinct direct successor.
    list_has_same_follower = [count == 1 for count in series_number_of_successors.tolist()]

    # %%
    df_deterministic_f = pd.DataFrame({'concept:name':list_activities,'deterministic_f': list_has_same_follower})
    df_deterministic_f

    # %% [markdown]
    # **standardization**

    # %%
    #standardization_p
    # Number of distinct direct predecessors of each activity.
    list_standardization_p = series_number_of_predecessors.tolist()

    df_standardization_p = pd.DataFrame({'concept:name':list_activities,'standardization_p': list_standardization_p})
    df_standardization_p

    # %%
    #standardization_f
    # Number of distinct direct successors of each activity.
    list_standardization_f = series_number_of_successors.tolist()

    df_standardization_f = pd.DataFrame({'concept:name':list_activities,'standardization_f': list_standardization_f})
    df_standardization_f

    # %%


    # %%


    # %% [markdown]
    # # merge dfs

    # %% [markdown]
    # df_standardization_p
    # df_standardization_f df_execution_time_std
    # df_execution_frequency df_execution_time_median df_execution_time_mean df_failure_rate df_number_of_resources df_deterministic_p df_deterministic_f 

    # %%
    print('merge dfs')
    # Inner-join every per-activity feature table onto the frequency table,
    # one after another, with the activity name as the join key. The order of
    # the list fixes the column order of the result.
    feature_frames = [
        df_execution_time_median,
        df_execution_time_mean,
        df_execution_time_std,
        df_execution_time_skew,
        df_failure_rate,
        df_number_of_resources,
        df_deterministic_p,
        df_deterministic_f,
        df_standardization_p,
        df_standardization_f,
    ]
    final_dataset = df_execution_frequency
    for feature_frame in feature_frames:
        final_dataset = final_dataset.merge(feature_frame, on='concept:name', how='inner')

    # Tag every row with the dataset it was derived from.
    final_dataset['dataset_name'] = DATASET_NAME
    final_dataset
    print('save csv...')
    csv_path = f'../../src/datasets/{DATASET_NAME}.csv'
    final_dataset.to_csv(csv_path)
    print('done...')
    print('--------------------------')
    # %%





DomesticDeclarations
load xes...


parsing log, completed traces :: 100%|██████████| 10500/10500 [00:02<00:00, 4017.84it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
InternationalDeclarations
load xes...


parsing log, completed traces :: 100%|██████████| 6449/6449 [00:03<00:00, 1726.77it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
Road_Traffic_Fine_Management_Process
load xes...


parsing log, completed traces :: 100%|██████████| 150370/150370 [00:28<00:00, 5255.03it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
PermitLog
load xes...


parsing log, completed traces :: 100%|██████████| 7065/7065 [00:03<00:00, 2110.03it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
PrepaidTravelCost
load xes...


parsing log, completed traces :: 100%|██████████| 2099/2099 [00:00<00:00, 2448.16it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
RequestForPayment
load xes...


parsing log, completed traces :: 100%|██████████| 6886/6886 [00:01<00:00, 3951.37it/s]


calculate NaT...
create features...
merge dfs
save csv...
done...
--------------------------
