## <b>1. Import Packages</b>

In [3]:
import pandas as pd
import pm4py
from sklearn.linear_model import LinearRegression
from src import DATA_DIR

<br><br>
## <b>2. Data Preprocessing</b>

In [71]:
# Load the running-example event log (semicolon-delimited CSV) from under DATA_DIR
event_log_path = DATA_DIR / 'Example/Running_Example/running-example.csv'
event_log = pd.read_csv(event_log_path, sep=";")
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,register request,2010-12-30 14:32:00+01:00,50,Pete
1,3,examine casually,2010-12-30 15:06:00+01:00,400,Mike
2,3,check ticket,2010-12-30 16:34:00+01:00,100,Ellen
3,3,decide,2011-01-06 09:18:00+01:00,200,Sara
4,3,reinitiate request,2011-01-06 12:18:00+01:00,200,Sara


In [72]:
# Change "timestamp" format: drop the trailing UTC offset and parse to naive datetimes.
# The offset is stripped with an end-anchored pattern so that negative offsets
# ("-05:00") are removed as well as positive ones ("+01:00"); the wall-clock part of
# each value is kept as written (no conversion between time zones is performed).
# Casting to str first keeps the cell re-runnable after the column has already been
# converted to datetime (the .str accessor is not available on datetime columns).
timestamp_text = event_log["timestamp"].astype(str)
timestamp_text = timestamp_text.str.replace(r"[+-]\d{2}:\d{2}$", "", regex=True)
event_log["timestamp"] = pd.to_datetime(timestamp_text)
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,register request,2010-12-30 14:32:00,50,Pete
1,3,examine casually,2010-12-30 15:06:00,400,Mike
2,3,check ticket,2010-12-30 16:34:00,100,Ellen
3,3,decide,2011-01-06 09:18:00,200,Sara
4,3,reinitiate request,2011-01-06 12:18:00,200,Sara


In [73]:
# Encode every activity label as a string key, numbered by order of first appearance
activity_labels = event_log["activity"].unique()
activities_code = {label: str(position) for position, label in enumerate(activity_labels)}
event_log["activity"] = event_log["activity"].replace(activities_code)
event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,resource
0,3,0,2010-12-30 14:32:00,50,Pete
1,3,1,2010-12-30 15:06:00,400,Mike
2,3,2,2010-12-30 16:34:00,100,Ellen
3,3,3,2011-01-06 09:18:00,200,Sara
4,3,4,2011-01-06 12:18:00,200,Sara


<br><br>
## <b>3. Prediction with vanilla method</b>

### <b>3.1. Making dummy variables for Resources</b>

In [74]:
# One-hot encode the resource of each event into Resource_<name> indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0 returns
# boolean columns, so the result would depend on the installed pandas version.
resource_dummies = pd.get_dummies(event_log["resource"], prefix="Resource", dtype=int)
event_log = pd.concat([event_log.drop(columns="resource"), resource_dummies], axis=1)

event_log.head()

Unnamed: 0,case_id,activity,timestamp,costs,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue
0,3,0,2010-12-30 14:32:00,50,0,0,1,0,0,0
1,3,1,2010-12-30 15:06:00,400,0,1,0,0,0,0
2,3,2,2010-12-30 16:34:00,100,1,0,0,0,0,0
3,3,3,2011-01-06 09:18:00,200,0,0,0,1,0,0
4,3,4,2011-01-06 12:18:00,200,0,0,0,1,0,0


<br><br>
### <b>3.2. Reshape Eventlog</b>

In [81]:
# Reshape the event log into one row per event of each case, where every row carries
# the measures accumulated up to and including that event (inputs) together with the
# case-level total cost (output).
case_frames = []

for _, case_events in event_log.groupby("case_id"):
    # sort_values / reset_index return new frames, so the columns added below are
    # written to an independent copy instead of a slice of event_log. The stable
    # sort keeps the original order of events that share a timestamp.
    case_events = (
        case_events
        .sort_values("timestamp", kind="stable")
        .reset_index(drop=True)
    )
    n_events = len(case_events)

    # Input measures
    activity_sequence = tuple(case_events["activity"])
    case_events["prefix"] = [activity_sequence[:i + 1] for i in range(n_events)]

    # Whole minutes since the first event of the case, stored as float.
    # (.astype('timedelta64[m]') is rejected by pandas >= 2.0, which only supports
    # the 's', 'ms', 'us' and 'ns' resolutions.)
    elapsed = case_events["timestamp"] - case_events["timestamp"].min()
    case_events["elapsed_time"] = elapsed.dt.total_seconds() // 60

    # skipna=False: a missing cost propagates as NaN, as it does with the built-in sum
    case_events["paid_cost"] = case_events["costs"].cumsum(skipna=False)
    case_events["#done_activities"] = list(range(1, n_events + 1))

    # Output measures
    case_events["total_cost"] = case_events["costs"].sum(skipna=False)

    # Turn each resource indicator into a running count of the events that resource
    # has handled so far in the case. The int64 cast makes the count independent of
    # the indicator dtype (bool, uint8 or int).
    for col in case_events.columns:
        if "Resource" in col:
            case_events[col] = case_events[col].astype("int64").cumsum()

    # Drop the final event of the case.
    # NOTE(review): presumably because nothing is left to predict once the case is
    # complete - confirm.
    case_frames.append(case_events.iloc[:-1, :])

# Concatenate once instead of growing the frame on every iteration. Each case keeps
# its own 0-based index, so index labels repeat across cases.
# pd.concat raises ValueError if event_log contained no cases at all.
new_event_log = pd.concat(case_frames, axis=0)

In [82]:
# Preview the reshaped log ordered by case and, within each case, by event time
new_event_log.sort_values(by=["case_id", "timestamp"]).head()

Unnamed: 0,case_id,activity,timestamp,costs,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue,prefix,elapsed_time,paid_cost,#done_activities,total_cost
0,1,0,2010-12-30 11:02:00,50,0,0,1,0,0,0,"(0,)",0.0,50,1,950
1,1,5,2010-12-31 10:06:00,400,0,0,1,0,0,1,"(0, 5)",1384.0,450,2,950
2,1,2,2011-01-05 15:12:00,100,0,1,1,0,0,1,"(0, 5, 2)",8890.0,550,3,950
3,1,3,2011-01-06 11:18:00,200,0,1,1,1,0,1,"(0, 5, 2, 3)",10096.0,750,4,950
0,2,0,2010-12-30 11:32:00,50,0,1,0,0,0,0,"(0,)",0.0,50,1,950


<br><br>
### <b>3.3. Making dummy variables for prefix</b>

In [83]:
# One-hot encode the activity prefix of each row into Prefix_<tuple> indicator columns.
# dtype=int pins the indicators to 0/1 integers; without it, pandas >= 2.0 returns
# boolean columns, so the result would depend on the installed pandas version.
prefix_dummies = pd.get_dummies(new_event_log["prefix"], prefix="Prefix", dtype=int)

# The index labels of new_event_log repeat across cases; both operands share that
# same index, so they are placed side by side with concat rather than joined on
# index labels.
new_event_log = pd.concat([new_event_log.drop(columns="prefix"), prefix_dummies], axis=1)

In [85]:
# Show the first five rows, now including the Prefix_ indicator columns
new_event_log.head(5)

Unnamed: 0,case_id,activity,timestamp,costs,Resource_Ellen,Resource_Mike,Resource_Pete,Resource_Sara,Resource_Sean,Resource_Sue,...,"Prefix_('0', '1', '2', '3', '4', '5', '2')","Prefix_('0', '1', '2', '3', '4', '5', '2', '3')","Prefix_('0', '2')","Prefix_('0', '2', '1')","Prefix_('0', '2', '1', '3')","Prefix_('0', '2', '5')","Prefix_('0', '2', '5', '3')","Prefix_('0', '5')","Prefix_('0', '5', '2')","Prefix_('0', '5', '2', '3')"
0,1,0,2010-12-30 11:02:00,50,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,5,2010-12-31 10:06:00,400,0,0,1,0,0,1,...,0,0,0,0,0,0,0,1,0,0
2,1,2,2011-01-05 15:12:00,100,0,1,1,0,0,1,...,0,0,0,0,0,0,0,0,1,0
3,1,3,2011-01-06 11:18:00,200,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,1
0,2,0,2010-12-30 11:32:00,50,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br><br>
### <b>3.4. Train Models</b>

#### <b>3.4.1. LinearRegression</b>

In [88]:
# Linear model over the per-event case measures plus every Prefix_/Resource_ column
model = LinearRegression()

base_features = ["elapsed_time", "paid_cost", "#done_activities"]
encoded_features = [
    column
    for column in new_event_log.columns
    if "Prefix_" in column or "Resource_" in column
]
x_columns_name = base_features + encoded_features

# Kept as a one-element list so the target is selected as a single-column DataFrame
y_column_name = ["total_cost"]

In [89]:
# Select the feature matrix and the target, then fit the model on all rows
x, y = new_event_log[x_columns_name], new_event_log[y_column_name]
model.fit(x, y)

# R^2 is computed on the same rows the model was fitted on (in-sample score)
r_sq = model.score(x, y)
print(f"Coefficient of determination: {r_sq}")

Coefficient of determination: 0.7778015862541308


In [90]:
# Report the fitted intercept and coefficients, one per line
print(f"intercept: {model.intercept_}", f"slope: {model.coef_}", sep="\n")

intercept: [1514.78510666]
slope: [[ 4.22029676e-02  5.85850258e-02 -2.35976861e+01  8.76143111e+02
  -5.62714900e+02  1.48310109e+02 -8.43700803e+01 -1.13457430e+02
  -2.87508495e+02 -1.26723945e+02  2.62552733e+02  8.34727924e+01
   1.34567715e+00  1.83543102e+02 -3.54001906e+02  8.40824942e+01
   9.49564243e+00  4.11758632e+01  2.77444820e+02  1.40627856e+02
   1.11198179e+02 -2.03605963e+02 -4.52218588e+02 -4.12182180e+02
   2.16023472e+02  7.05752130e+02  4.44712568e+02 -1.19094282e+02
  -7.68211161e+01 -4.63163516e+02 -1.99484908e+02 -1.54130926e+02]]
