## <b>1. Import Packages</b>

In [2]:
import pandas as pd
import sys
import hashlib
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from src import DATA_DIR

<br></br>
## <b>2. Data Preprocessing</b>

In [3]:
# Import data and keep the target pages
event_log = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Website Event Log.csv')
target_page = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Target_Pages.csv')
event_log = event_log.loc[event_log.Visited_Page.isin(target_page.Pages)].reset_index(drop=True)

In [49]:
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,hv5xru,12/30/2021 23:58:00,12/30/2021 23:58:00,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,168
1,hv5xru,12/30/2021 23:58:00,12/31/2021 0:00:48,ChromeMobile,Android,Mobile,Iran,Visited:.../online-issuance-and-cancellation/,153
2,hv5xru,12/30/2021 23:58:00,12/31/2021 0:03:21,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,5
3,hv5xru,12/30/2021 23:58:00,12/31/2021 0:04:07,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,26
4,92c26h,12/30/2021 23:57:00,12/30/2021 23:57:00,Firefox,Windows,PC,Iran,Visited:learning.emofid.com,129


In [50]:
# Create unique user id
event_log.User_ID = (event_log.User_ID + '_' + event_log.Case_Start_Date).apply(hash)

In [51]:
# Even     Odd
# 0        1
# 2        3
# 4        5
# ...

# 0
# 1 ×
# 2 -> 1
# 3 -> 2
# 4 -> 3
# 5 -> 4

# 0
# 1
# 2
# 3
# 4

# Even     Odd
# 0        1
# 2        3
# 4        5
# ...

In [52]:
# %%timeit
# event_log['Time_on_Page'] + 1

In [53]:
# %%timeit
# for time in event_log.Time_on_Page:
#     time = time + 1

In [54]:
# sort rows
event_log = event_log.sort_values(['User_ID', 'Activity_Start_Date'])
event_log.reset_index(drop=True, inplace=True)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 14:57:00,Chrome,Windows,PC,Iran,Visited:.../what-is-the-bourse-view/,48
1,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 15:00:10,Chrome,Windows,PC,Iran,Visited:.../bourseview-fundamental-graph-analy...,445
2,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 15:07:35,Chrome,Windows,PC,Iran,Visited:.../introduction-portfolio-section-in-...,907
3,-9220913366454553897,12/21/2021 22:39:00,12/21/2021 22:39:00,Firefox,Windows,PC,Iran,Visited:.../how-much-do-you-know-about-financi...,512
4,-9220913366454553897,12/21/2021 22:39:00,12/21/2021 22:47:32,Firefox,Windows,PC,Iran,Visited:.../how-much-do-you-know-about-financi...,440


In [55]:
# Merging refreshed pages

class IncorrectMethodError(Exception):
    def __init__(self, method):
        super().__init__(f"Method should be 'event_with_odd' or 'odd_with_even', {method} is incorrect")


def merge_refreshed_pages(event_log, method='even_with_odd') -> pd.DataFrame:
    if method == 'even_with_odd':
        chunk_1 = event_log[::2]   # 0   2   4   6   ...   40010
        chunk_2 = event_log[1::2]  # 1   3   5   7   ...   40011
    elif method == 'odd_with_even':
        chunk_1 = event_log[1::2]  # 1   3   5   7   ...   40011
        chunk_2 = event_log[2::2]  # 2   4   6   8   ...   40010
    else:
        raise IncorrectMethodError(method)

    chunk_2.index = chunk_2.index - 1
    merged_df = chunk_1.merge(chunk_2[['User_ID', 'Visited_Page']], left_index=True, right_index=True, suffixes=('', '_Chunck2'), how='left')
    merged_df = merged_df.loc[(merged_df.User_ID == merged_df.User_ID_Chunck2) & (merged_df.Visited_Page == merged_df.Visited_Page_Chunck2)]
    # merged_df : 0   2   4    12    24    40    ...

    index = merged_df.index
    index_rm = merged_df.index + 1

    event_log.loc[event_log.index.isin(index), 'Time_on_Page'] += list(event_log.loc[event_log.index.isin(index_rm)]['Time_on_Page'])
    event_log = event_log.loc[~event_log.index.isin(index_rm)]
    return event_log.reset_index(drop=True)

In [56]:
while True:
    event_log_len = len(event_log)
    event_log = merge_refreshed_pages(event_log, method='even_with_odd')
    event_log = merge_refreshed_pages(event_log, method='odd_with_even')

    if event_log_len == len(event_log):
        break

In [58]:
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 14:57:00,Chrome,Windows,PC,Iran,Visited:.../what-is-the-bourse-view/,48
1,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 15:00:10,Chrome,Windows,PC,Iran,Visited:.../bourseview-fundamental-graph-analy...,445
2,-9222220339580515528,12/2/2021 14:57:00,12/2/2021 15:07:35,Chrome,Windows,PC,Iran,Visited:.../introduction-portfolio-section-in-...,907
3,-9220913366454553897,12/21/2021 22:39:00,12/21/2021 22:39:00,Firefox,Windows,PC,Iran,Visited:.../how-much-do-you-know-about-financi...,952
4,-9220913366454553897,12/21/2021 22:39:00,12/21/2021 22:54:52,Firefox,Windows,PC,Iran,Visited:.../nasdaq-second-market-stock-in-the-...,58


In [59]:
# Function to create code mappings
def create_code_mapping(column, prefix):
    unique_items = column.unique()
    code_map = {item: f"{prefix}_{idx}" for idx, item in enumerate(unique_items)}
    return code_map

# Creating code mappings for each column
user_code = create_code_mapping(event_log['User_ID'], 'U')
activities_code = create_code_mapping(event_log['Visited_Page'], 'Page')
device_code = create_code_mapping(event_log['Device'], 'D')
os_code = create_code_mapping(event_log['Operating_System'], 'OS')
browser_code = create_code_mapping(event_log['Browser'], 'B')

# Create a copy of the DataFrame for modification
mapped_event_log = event_log.copy()

# Applying the code mappings using .loc
mapped_event_log['User_ID'].replace(user_code, inplace=True)
mapped_event_log['Visited_Page'].replace(activities_code, inplace=True)
mapped_event_log['Device'].replace(device_code, inplace=True)
mapped_event_log['Operating_System'].replace(os_code, inplace=True)
mapped_event_log['Browser'].replace(browser_code, inplace=True)

# Displaying the updated DataFrame
mapped_event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_0,12/2/2021 14:57:00,12/2/2021 14:57:00,B_0,OS_0,D_0,Iran,Page_0,48
1,U_0,12/2/2021 14:57:00,12/2/2021 15:00:10,B_0,OS_0,D_0,Iran,Page_1,445
2,U_0,12/2/2021 14:57:00,12/2/2021 15:07:35,B_0,OS_0,D_0,Iran,Page_2,907
3,U_1,12/21/2021 22:39:00,12/21/2021 22:39:00,B_1,OS_0,D_0,Iran,Page_3,952
4,U_1,12/21/2021 22:39:00,12/21/2021 22:54:52,B_1,OS_0,D_0,Iran,Page_4,58


In [60]:
mapped_event_log.to_csv(DATA_DIR / 'Real/Customer_Journey/Website_EventLog_Preprossed_With_Python.csv', index=False)

<br></br>
## <b>3. Prediction with vanilla method</b>

In [17]:
# Import Eventlog
event_log = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Website_EventLog_Preprossed_With_Python.csv')

<br></br>
### <b>3.1. Filter Visited Pages</b>

In [18]:
# Filter Activities based on duration
event_log = event_log.loc[event_log["Time_on_Page"] > 10]
event_log = event_log.loc[event_log["Time_on_Page"] <= 600]
event_log.reset_index(drop=True, inplace=True)

In [19]:
# Filter Cases

# Method 1
# 1. Group by based on User_ID and Calculate Visited Page Count (index: User_ID ,  Value: Page Count)
# 2. Join Table step 1 with event_log based on User_ID

# event_log = event_log[event_log.groupby('User_ID')['Visited_Page'].transform('count') >= 3].reset_index(drop=True)
# event_log


# Method 2
user_id_visited_pages_count = event_log.groupby('User_ID').agg(
                                        Count_Page=('User_ID', 'count')
                              )

filtered_users = user_id_visited_pages_count.loc[user_id_visited_pages_count.Count_Page >= 3].index

event_log = event_log[event_log.User_ID.isin(filtered_users)].reset_index(drop=True)
event_log

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_1,12/21/2021 22:39:00,12/21/2021 22:54:52,B_1,OS_0,D_0,Iran,Page_4,58
1,U_1,12/21/2021 22:39:00,12/21/2021 22:55:50,B_1,OS_0,D_0,Iran,Page_5,300
2,U_1,12/21/2021 22:39:00,12/21/2021 23:33:04,B_1,OS_0,D_0,Iran,Page_6,18
3,U_5,12/6/2021 7:45:00,12/6/2021 7:45:00,B_0,OS_0,D_0,Iran,Page_14,35
4,U_5,12/6/2021 7:45:00,12/6/2021 8:33:09,B_0,OS_0,D_0,Iran,Page_14,19
...,...,...,...,...,...,...,...,...,...
13882,U_7893,12/17/2021 20:48:00,12/17/2021 20:51:56,B_2,OS_1,D_1,Iran,Page_9,302
13883,U_7893,12/17/2021 20:48:00,12/17/2021 21:11:09,B_2,OS_1,D_1,Iran,Page_6,15
13884,U_7897,12/15/2021 13:39:00,12/15/2021 13:39:00,B_2,OS_1,D_1,Iran,Page_7,29
13885,U_7897,12/15/2021 13:39:00,12/15/2021 13:39:45,B_2,OS_1,D_1,Iran,Page_21,22


<br></br>
### <b>3.2. Making dummy variables for Device and OS</b>

In [20]:
event_log = pd.concat([
        event_log.drop("Device", axis=1),
        pd.get_dummies(event_log.Device, prefix="Device")
    ], axis=1)

event_log = pd.concat([
        event_log.drop("Operating_System", axis=1),
        pd.get_dummies(event_log["Operating_System"])
    ], axis=1)

event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Country,Visited_Page,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,OS_0,OS_1,OS_2,OS_3,OS_4,OS_5,OS_6
0,U_1,12/21/2021 22:39:00,12/21/2021 22:54:52,B_1,Iran,Page_4,58,1,0,0,0,1,0,0,0,0,0,0
1,U_1,12/21/2021 22:39:00,12/21/2021 22:55:50,B_1,Iran,Page_5,300,1,0,0,0,1,0,0,0,0,0,0
2,U_1,12/21/2021 22:39:00,12/21/2021 23:33:04,B_1,Iran,Page_6,18,1,0,0,0,1,0,0,0,0,0,0
3,U_5,12/6/2021 7:45:00,12/6/2021 7:45:00,B_0,Iran,Page_14,35,1,0,0,0,1,0,0,0,0,0,0
4,U_5,12/6/2021 7:45:00,12/6/2021 8:33:09,B_0,Iran,Page_14,19,1,0,0,0,1,0,0,0,0,0,0


<br></br>
### <b>3.3. Reshape Eventlog</b>

In [21]:
from tqdm import tqdm

In [22]:
reshaped_event_log = pd.DataFrame()

for group_name, group in tqdm(event_log.groupby('User_ID')):
    group.sort_values("Activity_Start_Date", inplace=True)
    group.reset_index(drop=True, inplace=True)

    # Input Variables
    activities = [tuple(group["Visited_Page"].values[:i+1]) for i in range(len(group))]
    elapsed_time = [sum(group["Time_on_Page"].values[:i+1]) for i in range(len(group))]
    number_of_visited_page = [len(group["Visited_Page"].values[:i+1]) for i in range(len(group))]

    operating_systems = {}
    devices = {}
    for col in group.columns:
        if 'OS_' in col:
            operating_systems[col] = [max(group[col]) for i in range(len(group))]
        if 'Device_' in col:
            devices[col] = [max(group[col]) for i in range(len(group))]

    # Output Variable
    next_page = [group["Visited_Page"].values[i+1] for i in range(len(group) - 1)] + ["End"]

    reshaped_group = pd.DataFrame({
        'Visited_Page': activities,
        'Elapsed_Time': elapsed_time,
        '#Visited_Page': number_of_visited_page,
        'Next_Page': next_page,
        **operating_systems,
        **devices,
    })
    reshaped_event_log = pd.concat([reshaped_event_log, reshaped_group], axis=0)

100%|██████████████████████████████████████████████████████████████████████████████| 2804/2804 [00:05<00:00, 475.19it/s]


In [23]:
reshaped_event_log.reset_index(drop=True, inplace=True)
reshaped_event_log.head()

Unnamed: 0,Visited_Page,Elapsed_Time,#Visited_Page,Next_Page,OS_0,OS_1,OS_2,OS_3,OS_4,OS_5,OS_6,Device_D_0,Device_D_1,Device_D_2,Device_D_3
0,"(Page_4,)",58,1,Page_5,1,0,0,0,0,0,0,1,0,0,0
1,"(Page_4, Page_5)",358,2,Page_6,1,0,0,0,0,0,0,1,0,0,0
2,"(Page_4, Page_5, Page_6)",376,3,End,1,0,0,0,0,0,0,1,0,0,0
3,"(Page_21,)",31,1,Page_39,0,1,0,0,0,0,0,0,0,1,0
4,"(Page_21, Page_39)",72,2,Page_21,0,1,0,0,0,0,0,0,0,1,0


<br></br>
### <b>3.4. Making dummy variables for Visited Pages</b>

In [24]:
reshaped_event_log = pd.concat([
        reshaped_event_log.drop("Visited_Page", axis=1),
        pd.get_dummies(reshaped_event_log.Visited_Page, prefix="Visited")
    ], axis=1)

reshaped_event_log.head()

Unnamed: 0,Elapsed_Time,#Visited_Page,Next_Page,OS_0,OS_1,OS_2,OS_3,OS_4,OS_5,OS_6,...,"Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227')","Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227', 'Page_369')","Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227', 'Page_369', 'Page_293')","Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227', 'Page_369', 'Page_293', 'Page_227')","Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227', 'Page_369', 'Page_293', 'Page_227', 'Page_369')","Visited_('Page_99', 'Page_41', 'Page_360', 'Page_63', 'Page_227', 'Page_369', 'Page_293', 'Page_227', 'Page_369', 'Page_360')","Visited_('Page_99', 'Page_47')","Visited_('Page_99', 'Page_47', 'Page_49')","Visited_('Page_99', 'Page_47', 'Page_49', 'Page_50')","Visited_('Page_99', 'Page_47', 'Page_49', 'Page_50', 'Page_160')"
0,58,1,Page_5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,358,2,Page_6,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,376,3,End,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,31,1,Page_39,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,72,2,Page_21,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br></br>
### <b>3.5. Train Models</b>

In [25]:
x_columns_name = ["Elapsed_Time", "#Visited_Page"] + [item for item in reshaped_event_log.columns if "Visited_" in item
                                                                                                  or "Device_" in item
                                                                                                  or "OS_" in item]
y_column_name = ["Next_Page"]

In [60]:
# Train Test Split
x = reshaped_event_log[x_columns_name]
y = reshaped_event_log[y_column_name]
y = np.ravel(y)

label_encoder = LabelEncoder()

# Fit and transform the y array to integer labels
y = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

<br></br>
#### <b>3.5.1. Logistic Regression</b>

In [None]:
# Fit Mode
model = LogisticRegression(solver='liblinear', random_state=0)

# model = LogisticRegression(solver='liblinear', C=0.05, multi_class='ovr', random_state=0)
model.fit(x_train, y_train)

In [254]:
y_pred = model.predict(x_test)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

0.4496578690127077
0.189453125


<br></br>
#### <b>3.5.2. KNN</b>

In [31]:
knn_model = KNeighborsClassifier(n_neighbors=15)
knn_model.fit(x_train, y_train)

In [32]:
pred = knn_model.predict(x_test)
accuracy_score(pred, y_test)

0.09287257019438445

<br></br>
#### <b>3.5.3. Neural Network</b>

In [79]:
# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

y_encoded = to_categorical([*y_train, *y_test])

y_train_encoded = y_encoded[:y_train.shape[0]]
y_test_encoded = y_encoded[y_train.shape[0]:]

In [93]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import to_categorical


# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

# Convert y to one-hot encoded format
y_encoded = to_categorical([*y_train, *y_test])

y_train_encoded = y_encoded[:y_train.shape[0]]
y_test_encoded = y_encoded[y_train.shape[0]:]


scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Creating the neural network model
model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(x_train.shape[1],)),
    Dropout(0.3),
    layers.Dense(32, activation='relu'),
    Dropout(0.2),
    layers.Dense(y_train_encoded.shape[1], activation='softmax')
])

# Compiling the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# # Early stopping callback
# early_stopping = EarlyStopping(patience=20, restore_best_weights=True)

# Training the model
epochs = 10
batch_size = 32
model.fit(x_train, y_train_encoded, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# Evaluating the model on the test set
test_loss, test_accuracy = model.evaluate(x_test, y_test_encoded)
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.1000719964504242


<br></br>
#### <b>3.5.4. Calculate recommendation accuracy</b>

In [100]:
def get_top_n_classes(probs, n):
    top_n_indices = np.argsort(probs)[-n:]
    return top_n_indices


# Make predictions on the test set
predictions = model.predict(x_test)

# Initialize variables for accuracy calculation
num_correct = 0
total_samples = len(x_test)

# Evaluate accuracy based on recommending the top five classes
for i in range(total_samples):
    true_class = y_test[i]
    predicted_probs = predictions[i]
    top_classes = get_top_n_classes(predicted_probs, n=5)

    if true_class in top_classes:
        num_correct += 1


# Calculate and print the accuracy
accuracy = num_correct / total_samples
print(f"Accuracy based on recommending top five classes: {accuracy}")

Accuracy based on recommending top five classes: 0.2580993520518359
