# Predictive Process Mining with Vanilla Methods (2)

This notebook covers advanced methods for predictive process mining. We will preprocess data, perform feature engineering, and apply different machine learning models to predict outcomes based on event log data.

## 1. Import Packages

In [2]:
import pandas as pd
import sys
import hashlib
import numpy as np
from tqdm import tqdm
import hashlib

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import to_categorical

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from src import SRC_DIR

<br></br>
## 2. Data Preprocessing

### 2.1. Load and Preprocess Data

Import the event log and target page datasets.

In [65]:
# Import data and keep the target pages
event_log = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website Event Log.csv')
target_page = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Target_Pages.csv')
event_log = event_log.loc[event_log.Visited_Page.isin(target_page.Pages)].reset_index(drop=True)

# Display the first few rows of the event log
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,hv5xru,12/30/2021 23:58:00,12/30/2021 23:58:00,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,168
1,hv5xru,12/30/2021 23:58:00,12/31/2021 0:00:48,ChromeMobile,Android,Mobile,Iran,Visited:.../online-issuance-and-cancellation/,153
2,hv5xru,12/30/2021 23:58:00,12/31/2021 0:03:21,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,5
3,hv5xru,12/30/2021 23:58:00,12/31/2021 0:04:07,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,26
4,92c26h,12/30/2021 23:57:00,12/30/2021 23:57:00,Firefox,Windows,PC,Iran,Visited:learning.emofid.com,129


### 2.2. Create Unique User IDs

Combine 'User_ID' and 'Case_Start_Date' to create unique user identifiers.

In [66]:
def sha_hash(string):
    return hashlib.sha256(string.encode('utf-8')).hexdigest()[:15]

event_log.User_ID = (event_log['User_ID'] + event_log['Case_Start_Date']).apply(sha_hash)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,4b35f33828b3a50,12/30/2021 23:58:00,12/30/2021 23:58:00,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,168
1,4b35f33828b3a50,12/30/2021 23:58:00,12/31/2021 0:00:48,ChromeMobile,Android,Mobile,Iran,Visited:.../online-issuance-and-cancellation/,153
2,4b35f33828b3a50,12/30/2021 23:58:00,12/31/2021 0:03:21,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,5
3,4b35f33828b3a50,12/30/2021 23:58:00,12/31/2021 0:04:07,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,26
4,d8f70158d816f3f,12/30/2021 23:57:00,12/30/2021 23:57:00,Firefox,Windows,PC,Iran,Visited:learning.emofid.com,129


### 2.3. Sort Data

Sort the event log by 'User_ID' and 'Activity_Start_Date'.

In [67]:
event_log = event_log.sort_values(['User_ID', 'Activity_Start_Date'])
event_log.reset_index(drop=True, inplace=True)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,000a76889a8f30c,12/6/2021 19:20:00,12/6/2021 19:32:03,MiuiBrowser,Android,Mobile,Iran,Visited:.../investment-fund/,348
1,000a76889a8f30c,12/6/2021 19:20:00,12/6/2021 19:38:25,MiuiBrowser,Android,Mobile,Iran,Visited:.../atlas-etf/,698
2,000afe30924023c,12/1/2021 10:33:00,12/1/2021 10:33:00,Chrome,Windows,PC,Iran,Visited:learning.emofid.com,7
3,000afe30924023c,12/1/2021 10:33:00,12/1/2021 10:33:07,Chrome,Windows,PC,Iran,Visited:learning.emofid.com,7
4,000afe30924023c,12/1/2021 10:33:00,12/1/2021 10:33:14,Chrome,Windows,PC,Iran,Visited:learning.emofid.com,9


<br></br>
## 3. Feature Engineering

### 3.1. Merge Refreshed Pages

Combine events that represent refreshed pages.

#### 3.1.1. Method 1:

In [68]:
# class IncorrectMethodError(Exception):
#     def __init__(self, method):
#         super().__init__(f"Method should be 'even_with_odd' or 'odd_with_even', '{method}' is incorrect")


# def merge_refreshed_pages(event_log, method='even_with_odd') -> pd.DataFrame:
#     if method == 'even_with_odd':
#         chunk_1 = event_log[::2]   # 0   2   4   6   ...   40010
#         chunk_2 = event_log[1::2]  # 1   3   5   7   ...   40011
#     elif method == 'odd_with_even':
#         chunk_1 = event_log[1::2]  # 1   3   5   7   ...   40011
#         chunk_2 = event_log[2::2]  # 2   4   6   8   ...   40010
#     else:
#         raise IncorrectMethodError(method)

#     chunk_1 = chunk_1[['User_ID', 'Visited_Page']]
#     chunk_2 = chunk_2[['User_ID', 'Visited_Page']]
#     chunk_2.rename(columns={'User_ID': 'User_ID_chunk2',
#                             'Visited_Page': 'Visited_Page_chunk2'},
#                    inplace=True)
    

#     chunk_2.index = chunk_2.index - 1
#     merged_df = pd.concat([chunk_1, chunk_2], axis=1)

#     merged_df = merged_df.loc[(merged_df.User_ID == merged_df.User_ID_chunk2) & 
#                               (merged_df.Visited_Page == merged_df.Visited_Page_chunk2)]

#     index = merged_df.index
#     index_rm = merged_df.index + 1

#     event_log.loc[event_log.index.isin(index), 'Time_on_Page'] += list(event_log.loc[event_log.index.isin(index_rm)]['Time_on_Page'])
#     event_log = event_log.loc[~event_log.index.isin(index_rm)]
#     return event_log.reset_index(drop=True)

In [69]:
# while True:
#     event_log_len = len(event_log)
#     event_log = merge_refreshed_pages(event_log, method='even_with_odd')
#     event_log = merge_refreshed_pages(event_log, method='odd_with_even')

#     if event_log_len == len(event_log):
#         break

# event_log.head()

#### 3.1.2. Method 2:

In [70]:
event_log['index'] = (event_log['User_ID'].astype(str) + event_log['Visited_Page']).apply(hash)

event_log['difference'] = event_log['index'].diff().fillna(1)

event_log['difference'] = event_log['difference'].replace(0, np.nan)

event_log.loc[~event_log['difference'].isna(), 'difference'] += pd.to_datetime(event_log['Activity_Start_Date']).astype(int)

event_log['difference'] = event_log['difference'].ffill()
event_log = event_log.groupby(
    ['User_ID', 'Activity_Start_Date', 'Browser', 'Operating_System', 'Device',
     'Country', 'Visited_Page', 'difference']).agg(
    Time_on_Page = ('Time_on_Page', 'sum')).reset_index()

event_log = event_log.drop(['difference'], axis=1)

<br></br>
### 3.2. Encode Labels

Encode categorical labels into numerical values.

In [71]:
# Function to create code mappings
def create_code_mapping(column, prefix):
    unique_items = column.unique()
    code_map = {item: f"{prefix}_{idx}" for idx, item in enumerate(unique_items)}
    return code_map

# Creating code mappings for each column
user_code = create_code_mapping(event_log['User_ID'], 'U')
activities_code = create_code_mapping(event_log['Visited_Page'], 'Page')
device_code = create_code_mapping(event_log['Device'], 'D')
os_code = create_code_mapping(event_log['Operating_System'], 'OS')
browser_code = create_code_mapping(event_log['Browser'], 'B')

# Create a copy of the DataFrame for modification
mapped_event_log = event_log.copy()

# Applying the code mappings using .loc
mapped_event_log.replace(
    {'User_ID': user_code,
     'Visited_Page': activities_code,
     'Device': device_code,
     'Operating_System': os_code,
     'Browser': browser_code,
    },
    inplace=True)

# Displaying the updated DataFrame
mapped_event_log.head()

Unnamed: 0,User_ID,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_0,12/6/2021 19:32:03,B_0,OS_0,D_0,Iran,Page_0,348
1,U_0,12/6/2021 19:38:25,B_0,OS_0,D_0,Iran,Page_1,698
2,U_1,12/1/2021 10:33:00,B_1,OS_1,D_1,Iran,Page_2,7
3,U_1,12/1/2021 10:33:07,B_1,OS_1,D_1,Iran,Page_2,7
4,U_1,12/1/2021 10:33:14,B_1,OS_1,D_1,Iran,Page_2,9


<br></br>
### 3.3. Save dataframe

In [72]:
mapped_event_log.to_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website_EventLog_Preprossed_With_Python.csv', index=False)

<br></br>
## 3. Model Training and Evaluation

In [73]:
# Import Eventlog
event_log = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website_EventLog_Preprossed_With_Python.csv')

<br></br>
### 3.1. Filter data

In [74]:
# Filter Activities based on duration
event_log = event_log.loc[event_log["Time_on_Page"] > 10]
event_log = event_log.loc[event_log["Time_on_Page"] <= 600]
event_log.reset_index(drop=True, inplace=True)

In [75]:
# Filter cases with fewer than 3 page views.
user_id_visited_pages_count = event_log.groupby('User_ID').agg(
    Count_Page=('User_ID', 'count'))

filtered_users = user_id_visited_pages_count.loc[user_id_visited_pages_count['Count_Page'] >= 3].index

event_log = event_log[event_log['User_ID'].isin(filtered_users)].reset_index(drop=True)
event_log.head()

Unnamed: 0,User_ID,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_1,12/1/2021 10:33:23,B_1,OS_1,D_1,Iran,Page_2,15
1,U_1,12/1/2021 10:33:39,B_1,OS_1,D_1,Iran,Page_4,14
2,U_1,12/1/2021 10:33:53,B_1,OS_1,D_1,Iran,Page_2,11
3,U_1,12/1/2021 10:34:04,B_1,OS_1,D_1,Iran,Page_4,46
4,U_1,12/1/2021 10:34:59,B_1,OS_1,D_1,Iran,Page_2,97


<br></br>
### 3.2. Making dummy variables for Device and OS

In [76]:
event_log = pd.get_dummies(
    event_log,
    columns=['Device', 'Operating_System'],
    prefix=['Device', 'Operating_System'])

event_log.head()

Unnamed: 0,User_ID,Activity_Start_Date,Browser,Country,Visited_Page,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,Operating_System_OS_0,Operating_System_OS_1,Operating_System_OS_2,Operating_System_OS_3,Operating_System_OS_4,Operating_System_OS_5,Operating_System_OS_6
0,U_1,12/1/2021 10:33:23,B_1,Iran,Page_2,15,0,1,0,0,0,1,0,0,0,0,0
1,U_1,12/1/2021 10:33:39,B_1,Iran,Page_4,14,0,1,0,0,0,1,0,0,0,0,0
2,U_1,12/1/2021 10:33:53,B_1,Iran,Page_2,11,0,1,0,0,0,1,0,0,0,0,0
3,U_1,12/1/2021 10:34:04,B_1,Iran,Page_4,46,0,1,0,0,0,1,0,0,0,0,0
4,U_1,12/1/2021 10:34:59,B_1,Iran,Page_2,97,0,1,0,0,0,1,0,0,0,0,0


<br></br>
### 3.3. Reshape Eventlog

In [77]:
reshaped_event_log_lst = []

for group_name, group in tqdm(event_log.groupby('User_ID')):
    group.sort_values("Activity_Start_Date", inplace=True)
    group.reset_index(drop=True, inplace=True)

    # Input Variables
    prefix = [tuple(group["Visited_Page"].values[:i+1]) for i in range(len(group))]
    elapsed_time = [sum(group["Time_on_Page"].values[:i+1]) for i in range(len(group))]
    number_of_visited_page = [len(group["Visited_Page"].values[:i+1]) for i in range(len(group))]

    operating_systems = {}
    devices = {}
    for col in group.columns:
        if 'OS_' in col:
            operating_systems[col] = [max(group[col]) for i in range(len(group))]
        if 'Device_' in col:
            devices[col] = [max(group[col]) for i in range(len(group))]

    # Output Variable
    next_page = [group["Visited_Page"].values[i+1] for i in range(len(group) - 1)] + ["End"]

    reshaped_group = pd.DataFrame({
        'Visited_Page': prefix,
        'Elapsed_Time': elapsed_time,
        '#Visited_Page': number_of_visited_page,
        'Next_Page': next_page,
        **operating_systems,
        **devices,
    })

    reshaped_event_log_lst.append(reshaped_group)

reshaped_event_log = pd.concat(reshaped_event_log_lst, axis=0)

100%|███████████████████████████████████████████████████████████████████████████████████| 3972/3972 [00:04<00:00, 799.65it/s]


In [88]:
reshaped_event_log.reset_index(drop=True, inplace=True)
reshaped_event_log.shape

(20599, 15)

<br></br>
### 3.4. Making dummy variables for Visited Pages

In [89]:
reshaped_event_log = pd.get_dummies(reshaped_event_log, columns=['Visited_Page'], prefix="Visited")

reshaped_event_log.head()

Unnamed: 0,Elapsed_Time,#Visited_Page,Next_Page,Operating_System_OS_0,Operating_System_OS_1,Operating_System_OS_2,Operating_System_OS_3,Operating_System_OS_4,Operating_System_OS_5,Operating_System_OS_6,...,"Visited_('Page_99', 'Page_67', 'Page_100')","Visited_('Page_99', 'Page_67', 'Page_100', 'Page_115')","Visited_('Page_99', 'Page_67', 'Page_100', 'Page_115', 'Page_100')","Visited_('Page_99', 'Page_67', 'Page_100', 'Page_115', 'Page_100', 'Page_67')","Visited_('Page_99', 'Page_67', 'Page_100', 'Page_115', 'Page_100', 'Page_67', 'Page_42')","Visited_('Page_99', 'Page_83')","Visited_('Page_99', 'Page_83', 'Page_99')","Visited_('Page_99', 'Page_83', 'Page_99', 'Page_100')","Visited_('Page_99', 'Page_83', 'Page_99', 'Page_100', 'Page_115')","Visited_('Page_99', 'Page_83', 'Page_99', 'Page_100', 'Page_115', 'Page_105')"
0,15,1,Page_4,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,29,2,Page_2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,40,3,Page_4,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,86,4,Page_2,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,183,5,End,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br></br>
### 3.5. Train Models

In [91]:
x_columns_name = ["Elapsed_Time", "#Visited_Page"] + [item for item in reshaped_event_log.columns if "Visited_" in item or
                                                                                                     "Device_" in item or 
                                                                                                     "OS_" in item]
y_column_name = ["Next_Page"]

['Next_Page']

In [94]:
x = reshaped_event_log[x_columns_name]
y = reshaped_event_log[y_column_name]

y = np.array(y).flatten()

label_encoder = LabelEncoder()

# Fit and transform the y array to integer labels
y = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

<br></br>
#### <b>3.5.1. Logistic Regression</b>

In [19]:
# Fit Mode
model = LogisticRegression(random_state=0)

# model = LogisticRegression(solver='liblinear', C=0.05, multi_class='ovr', random_state=0)
model.fit(x_train, y_train)


KeyboardInterrupt



In [None]:
y_pred = model.predict(x_test)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

<br></br>
#### <b>3.5.2. KNN</b>

In [95]:
knn_model = KNeighborsClassifier(n_neighbors=12)
knn_model.fit(x_train, y_train)

In [96]:
pred = knn_model.predict(x_test)
accuracy_score(pred, y_test)

0.14660194174757282

<br></br>
#### <b>3.5.3. Neural Network</b>

In [97]:
# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

y_encoded = to_categorical([*y_train, *y_test])

y_train_encoded = y_encoded[:y_train.shape[0]]
y_test_encoded = y_encoded[y_train.shape[0]:]

In [98]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Creating the neural network model
model = keras.Sequential([
    layers.Dense(32, activation='relu'),
    Dropout(0.5),
    layers.Dense(32, activation='relu'),
    Dropout(0.5),
    layers.Dense(y_encoded.shape[1], activation='softmax')
])

# Compiling the model
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'])

# # Early stopping callback
# early_stopping = EarlyStopping(patience=20, restore_best_weights=True)

# Training the model
epochs = 60
batch_size = 32
model.fit(x_train, y_train_encoded, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# # Evaluating the model on the test set
# test_loss, test_accuracy = model.evaluate(x_test, y_test_encoded)
# print(f"Test Accuracy: {test_accuracy}")

Epoch 1/60


2024-06-28 15:41:38.171456: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 643383132 exceeds 10% of free system memory.


[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.1114 - loss: 5.3376 - val_accuracy: 0.2382 - val_loss: 4.7256
Epoch 2/60
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.2080 - loss: 4.6415 - val_accuracy: 0.2339 - val_loss: 4.6965
Epoch 3/60
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.2139 - loss: 4.4996 - val_accuracy: 0.2342 - val_loss: 4.4764
Epoch 4/60
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.2202 - loss: 4.4117 - val_accuracy: 0.2345 - val_loss: 4.3645
Epoch 5/60
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.2247 - loss: 4.3116 - val_accuracy: 0.2354 - val_loss: 4.4170
Epoch 6/60
[1m412/412[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 6ms/step - accuracy: 0.2335 - loss: 4.1957 - val_accuracy: 0.2342 - val_loss: 4.5856
Epoch 7/60
[1m412/412[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7f713cf036d0>

In [99]:
model.summary()

<br></br>
#### <b>3.5.4. Calculate recommendation accuracy</b>

In [102]:
def get_top_n_classes(probs, n):
    top_n_indices = np.argsort(probs)[-n:]
    return top_n_indices


# Make predictions on the test set
predictions = model.predict(x_test)

# Initialize variables for accuracy calculation
num_correct = 0
total_samples = len(x_test)

# Evaluate accuracy based on recommending the top five classes
for i in range(total_samples):
    true_class = y_test[i]
    predicted_probs = predictions[i]
    top_classes = get_top_n_classes(predicted_probs, n=5)
    if true_class in top_classes:
        num_correct += 1

# Calculate and print the accuracy
accuracy = num_correct / total_samples
print(f"Accuracy based on recommending top five classes: {accuracy}")

[1m120/129[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 1ms/step

2024-06-28 15:49:04.256113: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 201072480 exceeds 10% of free system memory.


[1m129/129[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step
Accuracy based on recommending top five classes: 0.4303398058252427


In [27]:
true_class

0

In [28]:
top_classes

array([102,   2,  15, 168,   1])