## <b>1. Import Packages</b>

In [1]:
import pandas as pd
import sys
import hashlib
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from src import DATA_DIR

2023-08-04 15:04:37.487716: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-04 15:04:37.536392: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-08-04 15:04:37.537183: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


<br></br>
## <b>2. Data Preprocessing</b>

In [2]:
# Import data and keep the target pages
event_log = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Website Event Log.csv')
target_page = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Target_Pages.csv')
event_log = event_log.loc[event_log.Visited_Page.isin(target_page.Pages)].reset_index(drop=True)

In [3]:
# Create unique user id
event_log.User_ID = event_log.User_ID + '_' + event_log.Case_Start_Date

In [4]:
# sort rows
event_log = event_log.sort_values(['User_ID', 'Activity_Start_Date'])
event_log.reset_index(drop=True, inplace=True)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:47:11,Firefox,Windows,PC,Iran,Visited:.../nominal-value-and-market-price/,14
1,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:47:25,Firefox,Windows,PC,Iran,Visited:.../base-volume/,28
2,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:47:53,Firefox,Windows,PC,Iran,Visited:.../base-volume/,16
3,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:48:09,Firefox,Windows,PC,Iran,Visited:.../what-is-the-concept-of-return-on-s...,9
4,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:48:18,Firefox,Windows,PC,Iran,Visited:.../base-volume/,3


In [5]:
# Merging refreshed pages

class IncorrectMethodError(Exception):
    def __init__(self, method):
        super().__init__(f"Method should be 'event_with_odd' or 'odd_with_even', {method} is incorrect")


def merge_refreshed_pages(event_log, method='even_with_odd') -> pd.DataFrame:
    if method == 'even_with_odd':
        chunk_1 = event_log[::2]
        chunk_2 = event_log[1::2]
    elif method == 'odd_with_even':
        chunk_1 = event_log[1::2]
        chunk_2 = event_log[::2]
    else:
        raise IncorrectMethodError(method)

    chunk_2.index = chunk_2.index - 1
    merged_df = chunk_1.merge(chunk_2[['User_ID', 'Visited_Page']], left_index=True, right_index=True, suffixes=('', '_Chunck2'))
    merged_df = merged_df.loc[(merged_df.User_ID == merged_df.User_ID_Chunck2) & (merged_df.Visited_Page == merged_df.Visited_Page_Chunck2)]

    index = merged_df.index
    index_rm = merged_df.index + 1

    event_log.loc[event_log.index.isin(index), 'Time_on_Page'] += list(event_log.loc[event_log.index.isin(index_rm)]['Time_on_Page'])
    event_log = event_log.loc[~event_log.index.isin(index_rm)]
    return event_log.reset_index(drop=True)

In [6]:
while True:
    event_log_len = len(event_log)
    event_log = merge_refreshed_pages(event_log, method='even_with_odd')
    event_log = merge_refreshed_pages(event_log, method='odd_with_even')

    if event_log_len == len(event_log):
        break

In [7]:
event_log

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:47:11,Firefox,Windows,PC,Iran,Visited:.../nominal-value-and-market-price/,14
1,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:47:25,Firefox,Windows,PC,Iran,Visited:.../base-volume/,44
2,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:48:09,Firefox,Windows,PC,Iran,Visited:.../what-is-the-concept-of-return-on-s...,9
3,100od9e_12/21/2021 12:44:00,12/21/2021 12:44:00,12/21/2021 12:48:18,Firefox,Windows,PC,Iran,Visited:.../base-volume/,3
4,1018dg1_12/8/2021 9:18:00,12/8/2021 9:18:00,12/8/2021 9:22:06,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,12
...,...,...,...,...,...,...,...,...,...
28437,zzmtg1_12/3/2021 3:43:00,12/3/2021 3:43:00,12/3/2021 3:43:00,ChromeMobile,Android,Mobile,Iran,Visited:.../technical-analysis/,7
28438,zzmtg1_12/3/2021 3:43:00,12/3/2021 3:43:00,12/3/2021 3:43:07,ChromeMobile,Android,Mobile,Iran,Visited:.../trendline-in-technical-analysis/,494
28439,zzmtg1_12/3/2021 3:43:00,12/3/2021 3:43:00,12/3/2021 3:53:03,ChromeMobile,Android,Mobile,Iran,Visited:.../technical-analysis/,3
28440,zzmtg1_12/3/2021 3:43:00,12/3/2021 3:43:00,12/3/2021 3:53:06,ChromeMobile,Android,Mobile,Iran,Visited:.../what-you-need-to-know-about-fibona...,321


In [8]:
# Function to create code mappings
def create_code_mapping(column, prefix):
    unique_items = column.unique()
    code_map = {item: f"{prefix}_{i}" for i, item in enumerate(unique_items)}
    return code_map

# Creating code mappings for each column
user_code = create_code_mapping(event_log['User_ID'], 'U')
activities_code = create_code_mapping(event_log['Visited_Page'], 'Page')
device_code = create_code_mapping(event_log['Device'], 'D')
os_code = create_code_mapping(event_log['Operating_System'], 'OS')
browser_code = create_code_mapping(event_log['Browser'], 'B')

# Create a copy of the DataFrame for modification
event_log = event_log.copy()

# Applying the code mappings using .loc
event_log['User_ID'].replace(user_code, inplace=True)
event_log['Visited_Page'].replace(activities_code, inplace=True)
event_log['Device'].replace(device_code, inplace=True)
event_log['Operating_System'].replace(os_code, inplace=True)
event_log['Browser'].replace(browser_code, inplace=True)

# Reordering the columns
event_log = event_log[['User_ID', 'Case_Start_Date', 'Activity_Start_Date', 'Browser', 'Operating_System', 'Device', 'Visited_Page', 'Time_on_Page']]

# Displaying the updated DataFrame
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Visited_Page,Time_on_Page
0,U_0,12/21/2021 12:44:00,12/21/2021 12:47:11,B_0,OS_0,D_0,Page_0,14
1,U_0,12/21/2021 12:44:00,12/21/2021 12:47:25,B_0,OS_0,D_0,Page_1,44
2,U_0,12/21/2021 12:44:00,12/21/2021 12:48:09,B_0,OS_0,D_0,Page_2,9
3,U_0,12/21/2021 12:44:00,12/21/2021 12:48:18,B_0,OS_0,D_0,Page_1,3
4,U_1,12/8/2021 9:18:00,12/8/2021 9:22:06,B_1,OS_1,D_1,Page_3,12


In [9]:
event_log.to_csv(DATA_DIR / 'Real/Customer_Journey/Website_EventLog_Preprossed_With_Python.csv', index=False)

<br></br>
## <b>3. Prediction with vanilla method</b>

In [10]:
# Import Eventlog
event_log = pd.read_csv(DATA_DIR / 'Real/Customer_Journey/Website_EventLog_Preprossed_With_Python.csv')

<br></br>
### <b>3.1. Filter Visited Pages</b>

In [11]:
# Filter Activities based on duration
event_log = event_log.loc[event_log["Time_on_Page"] > 10]
event_log = event_log.loc[event_log["Time_on_Page"] <= 600]
event_log.reset_index(drop=True, inplace=True)

In [12]:
# Filter Cases
event_log = event_log[event_log.groupby('User_ID')['Visited_Page'].transform('count') >= 3].reset_index(drop=True)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Visited_Page,Time_on_Page
0,U_4,12/11/2021 20:55:00,12/11/2021 20:55:00,B_1,OS_1,D_1,Page_6,24
1,U_4,12/11/2021 20:55:00,12/11/2021 20:56:00,B_1,OS_1,D_1,Page_7,169
2,U_4,12/11/2021 20:55:00,12/11/2021 20:58:49,B_1,OS_1,D_1,Page_6,253
3,U_9,12/28/2021 11:56:00,12/28/2021 11:56:35,B_1,OS_1,D_1,Page_3,11
4,U_9,12/28/2021 11:56:00,12/28/2021 11:57:45,B_1,OS_1,D_1,Page_13,13


<br></br>
### <b>3.2. Making dummy variables for Device and OS</b>

In [13]:
event_log = pd.concat([
        event_log.drop("Device", axis=1),
        pd.get_dummies(event_log.Device, prefix="Device")
    ], axis=1)

event_log = pd.concat([
        event_log.drop("Operating_System", axis=1),
        pd.get_dummies(event_log["Operating_System"])
    ], axis=1)

event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Visited_Page,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,OS_0,OS_1,OS_2,OS_3,OS_4,OS_5,OS_6
0,U_4,12/11/2021 20:55:00,12/11/2021 20:55:00,B_1,Page_6,24,0,1,0,0,0,1,0,0,0,0,0
1,U_4,12/11/2021 20:55:00,12/11/2021 20:56:00,B_1,Page_7,169,0,1,0,0,0,1,0,0,0,0,0
2,U_4,12/11/2021 20:55:00,12/11/2021 20:58:49,B_1,Page_6,253,0,1,0,0,0,1,0,0,0,0,0
3,U_9,12/28/2021 11:56:00,12/28/2021 11:56:35,B_1,Page_3,11,0,1,0,0,0,1,0,0,0,0,0
4,U_9,12/28/2021 11:56:00,12/28/2021 11:57:45,B_1,Page_13,13,0,1,0,0,0,1,0,0,0,0,0


<br></br>
### <b>3.3. Reshape Eventlog</b>

In [14]:
new_event_log = []

for ind, group_info in enumerate(event_log.groupby('User_ID')):
    group_name, group = group_info
    group.sort_values("Activity_Start_Date", inplace=True)
    group.reset_index(drop=True, inplace=True)

    # Input Variables
    group["Activity"] = [tuple(group["Visited_Page"].values[:i+1]) for i in range(len(group))]
    group["Elapsed_Time"] = [sum(group["Time_on_Page"].values[:i+1]) for i in range(len(group))]
    group["N. Done Activities"] = [len(group["Visited_Page"].values[:i+1]) for i in range(len(group))]

    # Output Variable
    group["NextPage"] = [group["Visited_Page"].values[i+1] for i in range(len(group) - 1)] + ["End"]

    for col in group.columns:
        if "Device_" in col or "OS_" in col:
            group[col] = [sum(group[col].values[:i+1]) for i in range(len(group))]

    group = group.iloc[:-1, :]

    if len(new_event_log):
        new_event_log = pd.concat([new_event_log, group], axis=0)
    else:
        new_event_log = group

    sys.stdout.write('\r')
    sys.stdout.write("Case: " + str(ind+1) + " From " + str(len(event_log.User_ID.unique())))
    sys.stdout.flush()

Case: 2804 From 2804

In [15]:
new_event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Visited_Page,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,...,OS_1,OS_2,OS_3,OS_4,OS_5,OS_6,Activity,Elapsed_Time,N. Done Activities,NextPage
0,U_10,12/3/2021 18:27:00,12/3/2021 18:27:00,B_1,Page_8,306,0,1,0,0,...,1,0,0,0,0,0,"(Page_8,)",306,1,Page_9
1,U_10,12/3/2021 18:27:00,12/3/2021 18:32:06,B_1,Page_9,255,0,2,0,0,...,2,0,0,0,0,0,"(Page_8, Page_9)",561,2,Page_10
2,U_10,12/3/2021 18:27:00,12/3/2021 18:36:21,B_1,Page_10,44,0,3,0,0,...,3,0,0,0,0,0,"(Page_8, Page_9, Page_10)",605,3,Page_11
3,U_10,12/3/2021 18:27:00,12/3/2021 18:38:08,B_1,Page_11,67,0,4,0,0,...,4,0,0,0,0,0,"(Page_8, Page_9, Page_10, Page_11)",672,4,Page_12
4,U_10,12/3/2021 18:27:00,12/3/2021 18:39:15,B_1,Page_12,73,0,5,0,0,...,5,0,0,0,0,0,"(Page_8, Page_9, Page_10, Page_11, Page_12)",745,5,Page_16


<br></br>
### <b>3.4. Making dummy variables for Visited Pages</b>

In [16]:
new_event_log = pd.concat([
        new_event_log.drop("Visited_Page", axis=1),
        pd.get_dummies(new_event_log.Visited_Page, prefix="Visited_Page")
    ], axis=1)

new_event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,OS_0,...,Visited_Page_Page_90,Visited_Page_Page_91,Visited_Page_Page_92,Visited_Page_Page_93,Visited_Page_Page_94,Visited_Page_Page_95,Visited_Page_Page_96,Visited_Page_Page_97,Visited_Page_Page_98,Visited_Page_Page_99
0,U_10,12/3/2021 18:27:00,12/3/2021 18:27:00,B_1,306,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,U_10,12/3/2021 18:27:00,12/3/2021 18:32:06,B_1,255,0,2,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,U_10,12/3/2021 18:27:00,12/3/2021 18:36:21,B_1,44,0,3,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,U_10,12/3/2021 18:27:00,12/3/2021 18:38:08,B_1,67,0,4,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,U_10,12/3/2021 18:27:00,12/3/2021 18:39:15,B_1,73,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br></br>
### <b>3.5. Train Models</b>

In [17]:
x_columns_name = ["Elapsed_Time", "N. Done Activities"] + [item for item in new_event_log.columns if "Visited_" in item or "Device_" in item or "OS_" in item]
y_column_name = ["NextPage"]

In [18]:
# Train Test Split
x = new_event_log[x_columns_name]
y = new_event_log[y_column_name]
y = np.ravel(y)

label_encoder = LabelEncoder()

# Fit and transform the y array to integer labels
y = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

<br></br>
#### <b>3.5.1. Logistic Regression</b>

In [None]:
# Fit Mode
model = LogisticRegression(solver='liblinear', random_state=0)

# model = LogisticRegression(solver='liblinear', C=0.05, multi_class='ovr', random_state=0)
model.fit(x_train, y_train)

In [254]:
y_pred = model.predict(x_test)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

0.4496578690127077
0.189453125


<br></br>
#### <b>3.5.2. KNN</b>

In [19]:
knn_model = KNeighborsClassifier(n_neighbors=20)
knn_model.fit(x_train, y_train)

In [20]:
pred = knn_model.predict(x_test)
accuracy_score(pred,y_test)

0.2318448353631033

<br></br>
#### <b>3.5.3. Neural Network</b>

In [45]:
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import to_categorical


# Convert y to one-hot encoded format
y_train_encoded = to_categorical(y_train)
y_test_encoded = to_categorical(y_test)


# Creating the neural network model
model = keras.Sequential([
    layers.Dense(64, activation='relu', input_shape=(x_train.shape[1],)),
    Dropout(0.1),
    layers.Dense(32, activation='relu'),
    Dropout(0.1),
    layers.Dense(y_train_encoded.shape[1], activation='softmax')
])

# Compiling the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# # Early stopping callback
# early_stopping = EarlyStopping(patience=20, restore_best_weights=True)

# Training the model
epochs = 100
batch_size = 32
model.fit(x_train, y_train_encoded, epochs=epochs, batch_size=batch_size, validation_split=0.1)

# Evaluating the model on the test set
test_loss, test_accuracy = model.evaluate(x_test, y_test_encoded)
print(f"Test Accuracy: {test_accuracy}")

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
250/250 [==============================] - 0s 1ms/step - loss: 2.5261 - accuracy: 0.3359 - val_loss: 3.9773 - val_accuracy: 0.2244
250/250 [==============================] - 0s 1ms/step - loss: 2.6248 - accuracy: 0.2841 - val_loss: 5.3057 - val_accuracy: 0.2379

<br></br>
#### <b>3.5.4. Calculate recommendation accuracy</b>

In [41]:
# Make predictions on the test set
predictions = model.predict(x_test)

# Function to get the top n classes based on their probabilities
def get_top_n_classes(probs, n):
    top_n_indices = np.argsort(probs)[-n:]
    return top_n_indices

# Initialize variables for accuracy calculation
num_correct = 0
total_samples = len(y_test)

# Evaluate accuracy based on recommending the top five classes
for i in range(total_samples):
    true_class = y_test[i]
    predicted_probs = predictions[i]
    top_classes = get_top_n_classes(predicted_probs, n=5)

    if true_class in top_classes:
        num_correct += 1


# Calculate and print the accuracy
accuracy = num_correct / total_samples
print(f"Accuracy based on recommending top five classes: {accuracy}")

Accuracy based on recommending top five classes: 0.501127649977447


In [28]:
predictions

array([[2.6336619e-07, 1.8900846e-05, 1.5887901e-03, ..., 1.2131714e-04,
        6.2909341e-05, 1.4917110e-06],
       [4.0138049e-05, 6.0398493e-06, 1.7327825e-06, ..., 7.9755746e-06,
        2.1090776e-05, 2.8857077e-05],
       [6.8340955e-07, 1.4817973e-05, 3.5111944e-03, ..., 2.4915293e-05,
        2.3856033e-03, 3.9337629e-06],
       ...,
       [3.7109612e-03, 4.2041784e-04, 3.1084455e-05, ..., 1.8249104e-04,
        1.9759244e-04, 4.8936605e-03],
       [3.9181000e-06, 1.6780710e-05, 1.1217508e-04, ..., 2.9388891e-04,
        3.6239009e-03, 7.2615316e-05],
       [3.7790153e-06, 1.0430654e-05, 5.8233309e-01, ..., 8.2113465e-06,
        1.2688900e-03, 1.7484141e-05]], dtype=float32)

In [31]:
true_class

155

In [32]:
predicted_probs

array([2.63366189e-07, 1.89008460e-05, 1.58879010e-03, 7.26186329e-07,
       9.85950578e-07, 5.88296825e-05, 2.78115767e-05, 6.14868623e-05,
       1.18760487e-04, 3.29480531e-06, 1.43632917e-06, 3.21687321e-06,
       8.72198052e-06, 1.28823391e-03, 5.66705069e-07, 5.66347671e-06,
       1.41529948e-04, 1.61719327e-05, 3.52380027e-08, 1.85423505e-06,
       7.80694123e-08, 3.41585178e-07, 6.00086150e-06, 1.33535477e-05,
       1.08151790e-03, 2.30558817e-05, 6.19562063e-03, 9.62797344e-08,
       9.35221669e-06, 7.10487802e-06, 3.87443805e-07, 1.02710092e-05,
       1.12671796e-06, 4.50305924e-05, 5.72460692e-07, 2.02328642e-03,
       7.27367365e-07, 2.43276554e-05, 5.29161980e-07, 2.62874755e-06,
       5.55745326e-04, 2.38704956e-07, 3.93172877e-06, 1.88071481e-05,
       1.83143657e-05, 3.98916425e-04, 7.64129709e-07, 6.56962584e-07,
       1.44411968e-06, 7.73282773e-06, 5.79641957e-04, 5.22511471e-07,
       3.98747125e-06, 4.07143352e-06, 6.58164718e-05, 3.90119349e-05,
      

In [40]:
np.argsort(predicted_probs)[-5:]

array([169, 364, 144, 353, 155])

In [44]:
num_correct / total_samples

0.501127649977447

In [43]:
total_samples

2217