## <b>1. Import Packages</b>

In [1]:
import pandas as pd
import pm4py
from sklearn.linear_model import LinearRegression

from src import SRC_DIR

<br><br>
## <b>2. Data Preprocessing</b>

In [23]:
# Load the running-example event log (a semicolon-separated CSV) from the project's dataset folder
event_log_path = SRC_DIR / 'Datasets' / 'Example' / 'Running_Example' / 'running-example.csv'
event_log = pd.read_csv(event_log_path, sep=";")
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,register request,2010-12-30 14:32:00+01:00,50,Pete
1,3,examine casually,2010-12-30 15:06:00+01:00,400,Mike
2,3,check ticket,2010-12-30 16:34:00+01:00,100,Ellen
3,3,decide,2011-01-06 09:18:00+01:00,200,Sara
4,3,reinitiate request,2011-01-06 12:18:00+01:00,200,Sara


In [24]:
# Parse the "timestamp" column into pandas datetime values
event_log['timestamp'] = event_log['timestamp'].pipe(pd.to_datetime)
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,register request,2010-12-30 14:32:00+01:00,50,Pete
1,3,examine casually,2010-12-30 15:06:00+01:00,400,Mike
2,3,check ticket,2010-12-30 16:34:00+01:00,100,Ellen
3,3,decide,2011-01-06 09:18:00+01:00,200,Sara
4,3,reinitiate request,2011-01-06 12:18:00+01:00,200,Sara


In [25]:
# Encode every activity name as a string key ("0", "1", ...), numbered by first appearance in the log
unique_activities = event_log['activity'].unique()
activities_code = {name: str(code) for code, name in enumerate(unique_activities)}

event_log['activity'] = event_log['activity'].replace(activities_code)
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,0,2010-12-30 14:32:00+01:00,50,Pete
1,3,1,2010-12-30 15:06:00+01:00,400,Mike
2,3,2,2010-12-30 16:34:00+01:00,100,Ellen
3,3,3,2011-01-06 09:18:00+01:00,200,Sara
4,3,4,2011-01-06 12:18:00+01:00,200,Sara


<br><br>
## <b>3. Prediction with vanilla method</b>

### <b>3.1. Making dummy variables for Resources</b>

In [26]:
# One-hot encode the resource column into one Resource_<name> indicator column per resource.
# dtype=int pins the indicators to integer 0/1: the default indicator dtype depends on the
# pandas version (boolean in pandas >= 2.0), while the per-case resource counts built later
# in this notebook sum these columns as numbers.
event_log = pd.get_dummies(event_log, columns=['resource'], prefix="Resource", dtype=int)
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue
0,3,0,2010-12-30 14:32:00+01:00,50,0,0,1,0,0,0
1,3,1,2010-12-30 15:06:00+01:00,400,0,1,0,0,0,0
2,3,2,2010-12-30 16:34:00+01:00,100,1,0,0,0,0,0
3,3,3,2011-01-06 09:18:00+01:00,200,0,0,0,1,0,0
4,3,4,2011-01-06 12:18:00+01:00,200,0,0,0,1,0,0


<br><br>
### <b>3.2. Reshape Eventlog (Generate State Dataset)</b>

In [30]:
reshaped_event_log_lst = []

# Build one "state" table per case: row i describes the case after its first i events
# (row 0 = nothing executed yet), paired with the case's final totals as targets.
for group_name, group in event_log.groupby('case_id'):
    # Rebind to a sorted copy instead of mutating the groupby slice in place; mergesort is
    # stable, so events that share a timestamp keep their original log order.
    group = group.sort_values("timestamp", kind="mergesort").reset_index(drop=True)
    n_states = len(group) + 1

    # Input measures
    activities = group['activity'].values
    prefix = [tuple(activities[:i]) for i in range(n_states)]

    # Total Elapsed time: the events are sorted, so the latest timestamp among the first
    # i events is event i-1 and the case start is event 0. The empty prefix has elapsed 0.
    start_time = group['timestamp'].iloc[0]
    elapsed_time = [pd.Timedelta(0), *(group['timestamp'] - start_time)]

    # Total Paid Costs: running sum, 0 for the empty prefix.
    # skipna=False propagates a missing cost forward, as a plain sum would.
    paid_costs = [0, *group['costs'].cumsum(skipna=False)]

    # Total number of done activities
    number_of_done_activities = list(range(n_states))

    # Resource: number of already-executed events performed by each resource.
    # Cast to int first so boolean/uint8 indicator columns are counted as plain integers.
    resourses = {
        col: [0, *group[col].astype(int).cumsum()]
        for col in group.columns
        if "Resource_" in col
    }

    # Output measures: case totals, identical for every state of the case.
    # The last running value is the total over the whole case.
    total_time = [elapsed_time[-1]] * n_states
    total_cost = [paid_costs[-1]] * n_states

    # Create DataFrame
    reshaped_group = pd.DataFrame({'Prefix': prefix,
                                   'Elapsed_Time': elapsed_time,
                                   'Paid_Costs': paid_costs,
                                   '#Activities': number_of_done_activities,
                                   'Total_Time': total_time,
                                   'Total_Cost': total_cost,
                                   **resourses         # Merge Two Dictionaries
                                  })
    reshaped_group['Case_Name'] = group_name
    # Express both durations as float seconds
    reshaped_group['Elapsed_Time'] = reshaped_group['Elapsed_Time'].dt.total_seconds()
    reshaped_group['Total_Time'] = reshaped_group['Total_Time'].dt.total_seconds()

    # Append 'reshaped group' to 'reshaped eventlog'
    reshaped_event_log_lst.append(reshaped_group)

# Stack the per-case tables; each case keeps its own 0..n row index.
reshaped_event_log = pd.concat(reshaped_event_log_lst, axis=0)

In [31]:
# `reshaped_group` is the loop variable left over from the reshape loop in the previous cell,
# so this displays the state table of the last case that loop processed.
reshaped_group

Unnamed: 0,Prefix,Elapsed_Time,Paid_Costs,#Activities,Total_Time,Total_Cost,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue,Case_Name
0,(),0.0,0,0,852300.0,950,0,0,0,0,0,0,6
1,"(0,)",0.0,50,1,852300.0,950,0,1,0,0,0,0,6
2,"(0, 1)",3840.0,450,2,852300.0,950,1,1,0,0,0,0,6
3,"(0, 1, 2)",91200.0,550,3,852300.0,950,1,2,0,0,0,0,6
4,"(0, 1, 2, 3)",93000.0,750,4,852300.0,950,1,2,0,1,0,0,6
5,"(0, 1, 2, 3, 6)",852300.0,950,5,852300.0,950,1,3,0,1,0,0,6


In [32]:
# Preview the first rows of the combined state dataset (per-case tables concatenated).
reshaped_event_log.head()

Unnamed: 0,Prefix,Elapsed_Time,Paid_Costs,#Activities,Total_Time,Total_Cost,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue,Case_Name
0,(),0.0,0,0,703320.0,950,0,0,0,0,0,0,1
1,"(0,)",0.0,50,1,703320.0,950,0,0,1,0,0,0,1
2,"(0, 5)",83040.0,450,2,703320.0,950,0,0,1,0,0,1,1
3,"(0, 5, 2)",533400.0,550,3,703320.0,950,0,1,1,0,0,1,1
4,"(0, 5, 2, 3)",605760.0,750,4,703320.0,950,0,1,1,1,0,1,1


<br><br>
### <b>3.3. Making dummy variables for Prefix</b>

In [33]:
# One-hot encode the activity prefix of each state into one Prefix_<tuple> indicator column
# per distinct prefix. dtype=int pins the indicators to integer 0/1, because the default
# indicator dtype depends on the pandas version (boolean in pandas >= 2.0).
reshaped_event_log = pd.get_dummies(reshaped_event_log, columns=['Prefix'], prefix="Prefix", dtype=int)
reshaped_event_log.head()

Unnamed: 0,Elapsed_Time,Paid_Costs,#Activities,Total_Time,Total_Cost,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,...,"Prefix_('0', '2', '1')","Prefix_('0', '2', '1', '3')","Prefix_('0', '2', '1', '3', '6')","Prefix_('0', '2', '5')","Prefix_('0', '2', '5', '3')","Prefix_('0', '2', '5', '3', '7')","Prefix_('0', '5')","Prefix_('0', '5', '2')","Prefix_('0', '5', '2', '3')","Prefix_('0', '5', '2', '3', '7')"
0,0.0,0,0,703320.0,950,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0.0,50,1,703320.0,950,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,83040.0,450,2,703320.0,950,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
3,533400.0,550,3,703320.0,950,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
4,605760.0,750,4,703320.0,950,0,1,1,1,0,...,0,0,0,0,0,0,0,0,1,0


<br><br>
### <b>3.4. Train Models</b>

#### <b>3.4.1. LinearRegression</b>

In [35]:
# Linear regression model used to predict a case's total cost
model = LinearRegression()

# Features: the three running measures plus every one-hot column
# (names containing "Prefix_" or "Resource_"), kept in the frame's column order.
base_features = ["Elapsed_Time", "Paid_Costs", "#Activities"]
encoded_features = [
    name
    for name in reshaped_event_log.columns
    if any(tag in name for tag in ("Prefix_", "Resource_"))
]
x_columns_name = base_features + encoded_features

# Target: kept as a one-element list so the selected target stays two-dimensional
y_column_name = ["Total_Cost"]

In [37]:
# Select the feature matrix and the (single-column) target frame
x = reshaped_event_log.loc[:, x_columns_name]
y = reshaped_event_log.loc[:, y_column_name]

# Fit, then score on the same rows used for fitting, so this is an in-sample R^2
r_sq = model.fit(x, y).score(x, y)
print(f"Coefficient of determination: {r_sq}")

Coefficient of determination: 0.7320797939397508


In [38]:
# Report the fitted parameters: the intercept, then one coefficient per feature column
print(
    f"intercept: {model.intercept_}",
    f"slope: {model.coef_}",
    sep="\n",
)

intercept: [1633.50074041]
slope: [[ 7.03382794e-04  3.91500261e-01 -1.43558434e+02  8.58052741e+02
  -5.80805270e+02  1.30219739e+02  2.92206220e+01 -3.61817288e+02
  -2.18428979e+02 -2.33500740e+02 -1.24034222e+02  2.70127479e+02
   1.95807133e+02  5.34670170e+01  1.75451441e+02 -2.57333973e+02
   1.85635451e+02  5.08355988e+01  2.23028186e+01  1.76286913e+02
   1.44229543e+02  5.45868658e+01  6.35489135e+02  2.34568891e+01
  -1.20396142e+02 -1.40572735e+02 -1.30011088e+03 -8.19845255e+01
   3.23472789e+02  1.04835596e+03  7.27103396e+02 -2.49822510e+02
   2.23509548e+02  2.05569713e+02 -7.78910525e+02 -5.42758656e+02
  -1.74320453e+02 -1.89179472e+02 -3.22762854e+02]]
