In [32]:
import pm4py
import pandas as pd
from pm4py.objects.log.importer.xes import importer as xes_importer
from pm4py.objects.conversion.log import converter as xes_converter

In [2]:
log = xes_importer.apply("../mxml-to-xes/simulation_logs.xes")

xesDataframe = xes_converter.apply(log, variant=xes_converter.Variants.TO_DATA_FRAME)
xesDataframe



parsing log, completed traces ::   0%|          | 0/10000 [00:00<?, ?it/s]

Unnamed: 0,concept:name,lifecycle:transition,time:timestamp,elementId,processId,case:concept:name,resourceId
0,Start Loan Application Process,assign,2024-05-29 11:03:34.810000+00:00,StartEvent_1y45yut,Process_01r57cc,2182,
1,Start Loan Application Process,start,2024-05-29 11:03:34.810000+00:00,StartEvent_1y45yut,Process_01r57cc,2182,
2,Start Loan Application Process,complete,2024-05-29 11:03:34.810000+00:00,StartEvent_1y45yut,Process_01r57cc,2182,
3,A_Create Application (with employee),assign,2024-05-29 11:03:34.810000+00:00,Activity_0r5aevx,Process_01r57cc,2182,
4,A_Create Application (with employee),start,2024-05-29 11:03:34.810000+00:00,Activity_0r5aevx,Process_01r57cc,2182,Lane_0rqv7vq
...,...,...,...,...,...,...,...
650461,A_Pending,start,2024-03-04 14:05:08.031000+00:00,Activity_1smaynk,Process_01r57cc,686,
650462,A_Pending,complete,2024-03-04 14:05:08.031000+00:00,Activity_1smaynk,Process_01r57cc,686,
650463,EVENT 36 END,assign,2024-03-04 14:05:08.031000+00:00,Event_0tdhb1r,Process_01r57cc,686,
650464,EVENT 36 END,start,2024-03-04 14:05:08.031000+00:00,Event_0tdhb1r,Process_01r57cc,686,


In [11]:
# add NumberOfOffers column to the dataset based on "O_Created" Activity
def add_No_Of_Offers(log):
    tmp_df = log[log['concept:name'] == "Make Another Offer"] 
    tmp_df2 = pd.DataFrame(tmp_df.groupby(['case:concept:name'])['concept:name'].count()).reset_index()
    tmp_df2.columns = ['case:concept:name', 'NumberOfOffers']
    offersLog = pd.merge(tmp_df2, log, on='case:concept:name')
    return offersLog

def addTreatment(log):
    # case should be marked as treated if it receives more than one offer
    def check_NoOfOffers(gr):
        df = pd.DataFrame(gr)
        if  list(df['NumberOfOffers'])[0] <= 1:
            df['treatment'] = 0 #notTreated  T=0
        else:
            df['treatment'] = 1 # T=1
        return df
    # add new treatment for each case based on number of offers
    # cases with only one offer should be treated
    treatedLog = log.groupby('case:concept:name').apply(check_NoOfOffers)
    treatedLog = treatedLog.reset_index(drop=True)
    return treatedLog

def addSuccessColumns(log):
    # if case includes A_Pending than column successful = 1, else 0
    log['successful'] = log.groupby('case:concept:name')['concept:name'].transform(lambda x: 1 if 'A_Pending' in x.values else 0)
    # make column treatmentSuccess, here treated and if successfull Yes, else No
    log['treatmentSuccess'] = log.apply(lambda row: 1 if row['treatment'] == 'treated' and row['successful'] == 1 else 0 if row['treatment'] == 'treated' and row['successful'] == 0 else 2, axis=1)
    #log['treatmentSuccess'] = log.groupby('case:concept:name').apply(lambda group: "Yes" if ('treated' in group['treatment'].values) and ('successful' == 1) else "No" if ('treated' in group['treatment'].values) else '').reset_index(level=0, drop=True)
    log.loc[log['NumberOfOffers'] == 1, 'treatmentSuccess'] = 0
    return log

In [12]:
offersLog = add_No_Of_Offers(xesDataframe)
offersLog.head()

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,processId,resourceId
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,Lane_0rqv7vq


In [37]:
unique_values = offersLog['concept:name'].unique()
unique_values

array(['Start Loan Application Process',
       'A_Create Application (with employee)', 'A_Concept',
       'Call to customer', 'Application completed and assessed again',
       'A_Accepted', 'A_Complete', 'O_Create Offer', 'O_Created',
       'O_Sent (online only)', 'Call after offer',
       'EVENT 11 CATCH MESSAGE', 'O_Returned', 'A_Validating',
       'A_Incomplete', 'Call incomplete files', 'O_Cancelled',
       'EVENT 36 END', 'EVENT 12 THROW MESSAGE', 'A_Cancelled',
       'EVENT 28 CATCH MESSAGE', 'EVENT 27 THROW MESSAGE', 'O_Accepted',
       'A_Pending', 'A_Create Application', 'A_Submitted',
       'O_Sent (mail & online)', 'Make Another Offer', 'A_Denied',
       'O_Refused'], dtype=object)

In [13]:
treatedLog = addTreatment(offersLog)
treatedLog.head()

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,processId,resourceId,treatment
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,,1
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,Lane_0rqv7vq,1


In [14]:
sucessLog = addSuccessColumns(treatedLog)

In [15]:
sucessLog.head()

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,processId,resourceId,treatment,successful,treatmentSuccess
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1,1,2
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1,1,2
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,Process_01r57cc,,1,1,2
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,,1,1,2
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Process_01r57cc,Lane_0rqv7vq,1,1,2


In [16]:
log = sucessLog.drop('processId', axis=1)
log

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,,1,1,2
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Lane_0rqv7vq,1,1,2
...,...,...,...,...,...,...,...,...,...,...
650461,9999,3,A_Pending,complete,2025-08-27 15:20:45.133000+00:00,Activity_1smaynk,,1,1,2
650462,9999,3,EVENT 11 CATCH MESSAGE,start,2025-08-27 15:20:45.133000+00:00,Event_194v609,,1,1,2
650463,9999,3,EVENT 36 END,assign,2025-08-27 15:20:45.133000+00:00,Event_0tdhb1r,,1,1,2
650464,9999,3,EVENT 36 END,start,2025-08-27 15:20:45.133000+00:00,Event_0tdhb1r,,1,1,2


In [25]:
#A_Create Application
def get_weekday(timestamp):
    return timestamp.weekday()

# Apply the function to create the new column 'weekdayApplication'
log['weekdayApplication'] = log[log['concept:name'] == 'A_Create Application']['time:timestamp'].apply(get_weekday)
log['weekdayApplication'] = log[log['concept:name'] == 'A_Create Application (with employee)']['time:timestamp'].apply(get_weekday)

# Forward fill the 'weekdayApplication' column to fill the NaN values for other rows in the same case
#log['weekdayApplication'] = log.groupby('case:concept:name')['weekdayApplication'].transform(lambda x: x.ffill().bfill())
log['weekdayApplication'] = log.groupby('case:concept:name')['weekdayApplication'].ffill().bfill()
log.head()

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess,weekdayApplication
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,,1,1,2,2.0
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Lane_0rqv7vq,1,1,2,2.0


In [28]:
#trial code
log["time:timestamp"] = pd.to_datetime(log["time:timestamp"], format='ISO8601')
log['timeApplication'] = log.groupby('case:concept:name')['time:timestamp'].transform(lambda x: x - x.min())
log['timeApplication'] = log['timeApplication'].dt.total_seconds()
log

Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess,weekdayApplication,timeApplication
0,0,3,Start Loan Application Process,assign,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0,0.00
1,0,3,Start Loan Application Process,start,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0,0.00
2,0,3,Start Loan Application Process,complete,2024-01-24 09:00:00+00:00,StartEvent_1y45yut,,1,1,2,2.0,0.00
3,0,3,A_Create Application (with employee),assign,2024-01-24 09:00:00+00:00,Activity_0r5aevx,,1,1,2,2.0,0.00
4,0,3,A_Create Application (with employee),start,2024-01-24 09:00:00+00:00,Activity_0r5aevx,Lane_0rqv7vq,1,1,2,2.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...
650461,9999,3,A_Pending,complete,2025-08-27 15:20:45.133000+00:00,Activity_1smaynk,,1,1,2,2.0,1174.92
650462,9999,3,EVENT 11 CATCH MESSAGE,start,2025-08-27 15:20:45.133000+00:00,Event_194v609,,1,1,2,2.0,1174.92
650463,9999,3,EVENT 36 END,assign,2025-08-27 15:20:45.133000+00:00,Event_0tdhb1r,,1,1,2,2.0,1174.92
650464,9999,3,EVENT 36 END,start,2025-08-27 15:20:45.133000+00:00,Event_0tdhb1r,,1,1,2,2.0,1174.92


In [29]:
from sklearn.preprocessing import LabelEncoder
import numpy as np
final_log = log

In [30]:
non_numeric_columns = list(final_log.select_dtypes(exclude=[np.number]).columns)

for col in non_numeric_columns:
    final_log[col] = LabelEncoder().fit_transform(final_log[col])

final_log = final_log.astype('float32')
final_log

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Unnamed: 0,case:concept:name,NumberOfOffers,concept:name,lifecycle:transition,time:timestamp,elementId,resourceId,treatment,successful,treatmentSuccess,weekdayApplication,timeApplication
0,0.0,3.0,29.0,0.0,0.0,28.0,1.0,1.0,1.0,2.0,2.0,0.000000
1,0.0,3.0,29.0,2.0,0.0,28.0,1.0,1.0,1.0,2.0,2.0,0.000000
2,0.0,3.0,29.0,1.0,0.0,28.0,1.0,1.0,1.0,2.0,2.0,0.000000
3,0.0,3.0,5.0,0.0,0.0,7.0,1.0,1.0,1.0,2.0,2.0,0.000000
4,0.0,3.0,5.0,2.0,0.0,7.0,0.0,1.0,1.0,2.0,2.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...
650461,9999.0,3.0,8.0,1.0,52719.0,20.0,1.0,1.0,1.0,2.0,2.0,1174.920044
650462,9999.0,3.0,15.0,2.0,52719.0,27.0,1.0,1.0,1.0,2.0,2.0,1174.920044
650463,9999.0,3.0,19.0,0.0,52719.0,26.0,1.0,1.0,1.0,2.0,2.0,1174.920044
650464,9999.0,3.0,19.0,2.0,52719.0,26.0,1.0,1.0,1.0,2.0,2.0,1174.920044


In [34]:
unique_counts = final_log.nunique()
unique_counts

case:concept:name       10000
NumberOfOffers              1
concept:name               30
lifecycle:transition        3
time:timestamp          52720
elementId                  30
resourceId                  2
treatment                   1
successful                  2
treatmentSuccess            1
weekdayApplication          5
timeApplication         41805
dtype: int64

In [31]:
final_log.to_csv('synthetic_dataset.csv', index=False)