## <b>1. Import Packages</b>

In [1]:
import pandas as pd
import sys
import hashlib
import numpy as np
from tqdm import tqdm

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from tensorflow.keras.layers import Dropout
from tensorflow.keras.callbacks import EarlyStopping
from keras.utils import to_categorical

import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

from src import SRC_DIR

2024-06-09 11:23:48.419443: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-09 11:23:48.420180: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-09 11:23:48.476955: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-06-09 11:23:48.612652: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
  machar = _get_machar(dtype)


<br></br>
## <b>2. Data Preprocessing</b>

In [2]:
# Import data and keep the target pages
event_log = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website Event Log.csv')
target_page = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Target_Pages.csv')
event_log = event_log.loc[event_log.Visited_Page.isin(target_page.Pages)].reset_index(drop=True)

In [3]:
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,hv5xru,12/30/2021 23:58:00,12/30/2021 23:58:00,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,168
1,hv5xru,12/30/2021 23:58:00,12/31/2021 0:00:48,ChromeMobile,Android,Mobile,Iran,Visited:.../online-issuance-and-cancellation/,153
2,hv5xru,12/30/2021 23:58:00,12/31/2021 0:03:21,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,5
3,hv5xru,12/30/2021 23:58:00,12/31/2021 0:04:07,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,26
4,92c26h,12/30/2021 23:57:00,12/30/2021 23:57:00,Firefox,Windows,PC,Iran,Visited:learning.emofid.com,129


In [153]:
# Create unique user id
event_log.User_ID = (event_log['User_ID'] + event_log['Case_Start_Date']).apply(hash)

In [154]:
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,5390517100263705501,12/30/2021 23:58:00,12/30/2021 23:58:00,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,168
1,5390517100263705501,12/30/2021 23:58:00,12/31/2021 0:00:48,ChromeMobile,Android,Mobile,Iran,Visited:.../online-issuance-and-cancellation/,153
2,5390517100263705501,12/30/2021 23:58:00,12/31/2021 0:03:21,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,5
3,5390517100263705501,12/30/2021 23:58:00,12/31/2021 0:04:07,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,26
4,5165595762199768401,12/30/2021 23:57:00,12/30/2021 23:57:00,Firefox,Windows,PC,Iran,Visited:learning.emofid.com,129


In [4]:
# sort rows
event_log = event_log.sort_values(['User_ID', 'Activity_Start_Date'])
event_log.reset_index(drop=True, inplace=True)
event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,100od9e,12/21/2021 12:44:00,12/21/2021 12:47:11,Firefox,Windows,PC,Iran,Visited:.../nominal-value-and-market-price/,14
1,100od9e,12/21/2021 12:44:00,12/21/2021 12:47:25,Firefox,Windows,PC,Iran,Visited:.../base-volume/,28
2,100od9e,12/21/2021 12:44:00,12/21/2021 12:47:53,Firefox,Windows,PC,Iran,Visited:.../base-volume/,16
3,100od9e,12/21/2021 12:44:00,12/21/2021 12:48:09,Firefox,Windows,PC,Iran,Visited:.../what-is-the-concept-of-return-on-s...,9
4,100od9e,12/21/2021 12:44:00,12/21/2021 12:48:18,Firefox,Windows,PC,Iran,Visited:.../base-volume/,3


In [156]:
# Even     Odd
# 0        1
# 2        3
# 4        5
# ...

# 1 -> 0 then 1 ×
# 5 -> 4 then 5 ×


# 0
# 1
# 2
# 3
# 4

# Even     Odd
# 0        1
# 2        3
# 4        5
# ...

### Method 1:

In [10]:
# Merging refreshed pages

class IncorrectMethodError(Exception):
    def __init__(self, method):
        super().__init__(f"Method should be 'even_with_odd' or 'odd_with_even', {method} is incorrect")


def merge_refreshed_pages(event_log, method='even_with_odd') -> pd.DataFrame:
    if method == 'even_with_odd':
        chunk_1 = event_log[::2]   # 0   2   4   6   ...   40010
        chunk_2 = event_log[1::2]  # 1   3   5   7   ...   40011
    elif method == 'odd_with_even':
        chunk_1 = event_log[1::2]  # 1   3   5   7   ...   40011
        chunk_2 = event_log[2::2]  # 2   4   6   8   ...   40010
    else:
        raise IncorrectMethodError(method)

    chunk_1 = chunk_1[['User_ID', 'Visited_Page']]
    chunk_2 = chunk_2[['User_ID', 'Visited_Page']]
    chunk_2.rename(columns={'User_ID': 'User_ID_chunk2',
                            'Visited_Page': 'Visited_Page_chunk2'},
                   inplace=True)
    

    chunk_2.index = chunk_2.index - 1
    merged_df = pd.concat([chunk_1, chunk_2], axis=1)

    merged_df = merged_df.loc[(merged_df.User_ID == merged_df.User_ID_chunk2) & 
                              (merged_df.Visited_Page == merged_df.Visited_Page_chunk2)]

    index = merged_df.index
    index_rm = merged_df.index + 1

    event_log.loc[event_log.index.isin(index), 'Time_on_Page'] += list(event_log.loc[event_log.index.isin(index_rm)]['Time_on_Page'])
    event_log = event_log.loc[~event_log.index.isin(index_rm)]
    return event_log.reset_index(drop=True)

In [11]:
while True:
    event_log_len = len(event_log)
    event_log = merge_refreshed_pages(event_log, method='even_with_odd')
    event_log = merge_refreshed_pages(event_log, method='odd_with_even')

    if event_log_len == len(event_log):
        break

In [12]:
event_log

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,100od9e,12/21/2021 12:44:00,12/21/2021 12:47:11,Firefox,Windows,PC,Iran,Visited:.../nominal-value-and-market-price/,14
1,100od9e,12/21/2021 12:44:00,12/21/2021 12:47:25,Firefox,Windows,PC,Iran,Visited:.../base-volume/,44
2,100od9e,12/21/2021 12:44:00,12/21/2021 12:48:09,Firefox,Windows,PC,Iran,Visited:.../what-is-the-concept-of-return-on-s...,9
3,100od9e,12/21/2021 12:44:00,12/21/2021 12:48:18,Firefox,Windows,PC,Iran,Visited:.../base-volume/,3
4,1018dg1,12/8/2021 9:18:00,12/8/2021 9:22:06,ChromeMobile,Android,Mobile,Iran,Visited:learning.emofid.com,12
...,...,...,...,...,...,...,...,...,...
27744,zzmtg1,12/3/2021 3:43:00,12/3/2021 3:43:00,ChromeMobile,Android,Mobile,Iran,Visited:.../technical-analysis/,7
27745,zzmtg1,12/3/2021 3:43:00,12/3/2021 3:43:07,ChromeMobile,Android,Mobile,Iran,Visited:.../trendline-in-technical-analysis/,494
27746,zzmtg1,12/3/2021 3:43:00,12/3/2021 3:53:03,ChromeMobile,Android,Mobile,Iran,Visited:.../technical-analysis/,3
27747,zzmtg1,12/3/2021 3:43:00,12/3/2021 3:53:06,ChromeMobile,Android,Mobile,Iran,Visited:.../what-you-need-to-know-about-fibona...,321


### Method 2:

In [162]:
# event_log['index'] = (event_log['User_ID'].astype(str) + event_log['Visited_Page']).apply(hash)

# event_log['difference'] = event_log['index'].diff().fillna(1)
# event_log['difference'] = event_log['difference'].replace(0, np.nan)
# event_log.loc[~event_log['difference'].isna(), 'difference'] += pd.to_datetime(event_log['Activity_Start_Date']).astype(int)

# event_log['difference'].ffill()

# event_log.groupby(['User_ID', 'Case_Start_Date', 'Browser', 'Operating_System', 'Device',
#                    'Country', 'Visited_Page', 'difference']).agg(
#     Time_on_Page = ('Time_on_Page', 'sum')
#                    ).reset_index()

In [13]:
# Function to create code mappings
def create_code_mapping(column, prefix):
    unique_items = column.unique()
    code_map = {item: f"{prefix}_{idx}" for idx, item in enumerate(unique_items)}
    return code_map

# Creating code mappings for each column
user_code = create_code_mapping(event_log['User_ID'], 'U')
activities_code = create_code_mapping(event_log['Visited_Page'], 'Page')
device_code = create_code_mapping(event_log['Device'], 'D')
os_code = create_code_mapping(event_log['Operating_System'], 'OS')
browser_code = create_code_mapping(event_log['Browser'], 'B')

# Create a copy of the DataFrame for modification
mapped_event_log = event_log.copy()

# Applying the code mappings using .loc
mapped_event_log.replace(
    {'User_ID': user_code,
     'Visited_Page': activities_code,
     'Device': device_code,
     'Operating_System': os_code,
     'Browser': browser_code,
    },
    inplace=True)

# Displaying the updated DataFrame
mapped_event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_0,12/21/2021 12:44:00,12/21/2021 12:47:11,B_0,OS_0,D_0,Iran,Page_0,14
1,U_0,12/21/2021 12:44:00,12/21/2021 12:47:25,B_0,OS_0,D_0,Iran,Page_1,44
2,U_0,12/21/2021 12:44:00,12/21/2021 12:48:09,B_0,OS_0,D_0,Iran,Page_2,9
3,U_0,12/21/2021 12:44:00,12/21/2021 12:48:18,B_0,OS_0,D_0,Iran,Page_1,3
4,U_1,12/8/2021 9:18:00,12/8/2021 9:22:06,B_1,OS_1,D_1,Iran,Page_3,12


In [14]:
mapped_event_log.to_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website_EventLog_Preprossed_With_Python.csv', index=False)

<br></br>
## <b>3. Prediction with vanilla method</b>

In [15]:
# Import Eventlog
event_log = pd.read_csv(SRC_DIR / 'Datasets' / 'Real' / 'Customer_Journey' / 'Website_EventLog_Preprossed_With_Python.csv')

<br></br>
### <b>3.1. Filter Visited Pages</b>

In [17]:
# Filter Activities based on duration
event_log = event_log.loc[event_log["Time_on_Page"] > 10]
event_log = event_log.loc[event_log["Time_on_Page"] <= 600]
event_log.reset_index(drop=True, inplace=True)

In [18]:
user_id_visited_pages_count = event_log.groupby('User_ID').agg(
                                        Count_Page=('User_ID', 'count')
                              )

filtered_users = user_id_visited_pages_count.loc[user_id_visited_pages_count['Count_Page'] >= 3].index

event_log = event_log[event_log['User_ID'].isin(filtered_users)].reset_index(drop=True)
event_log

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Operating_System,Device,Country,Visited_Page,Time_on_Page
0,U_4,12/11/2021 20:55:00,12/11/2021 20:55:00,B_1,OS_1,D_1,Iran,Page_6,24
1,U_4,12/11/2021 20:55:00,12/11/2021 20:56:00,B_1,OS_1,D_1,Iran,Page_7,169
2,U_4,12/11/2021 20:55:00,12/11/2021 20:58:49,B_1,OS_1,D_1,Iran,Page_6,253
3,U_8,12/28/2021 11:56:00,12/28/2021 11:56:35,B_1,OS_1,D_1,Iran,Page_3,11
4,U_8,12/28/2021 11:56:00,12/28/2021 11:57:45,B_1,OS_1,D_1,Iran,Page_13,13
...,...,...,...,...,...,...,...,...,...
14754,U_6244,12/10/2021 20:56:00,12/10/2021 20:56:44,B_1,OS_1,D_1,Iran,Page_61,118
14755,U_6244,12/10/2021 20:56:00,12/10/2021 20:59:30,B_1,OS_1,D_1,Iran,Page_38,341
14756,U_6247,12/3/2021 3:43:00,12/3/2021 3:43:07,B_1,OS_1,D_1,Iran,Page_8,494
14757,U_6247,12/3/2021 3:43:00,12/3/2021 3:53:06,B_1,OS_1,D_1,Iran,Page_121,321


<br></br>
### <b>3.2. Making dummy variables for Device and OS</b>

In [19]:
event_log = pd.get_dummies(
    event_log,
    columns=['Device', 'Operating_System'],
    prefix=["Device", 'Operating_System'])

event_log.head()

Unnamed: 0,User_ID,Case_Start_Date,Activity_Start_Date,Browser,Country,Visited_Page,Time_on_Page,Device_D_0,Device_D_1,Device_D_2,Device_D_3,Operating_System_OS_0,Operating_System_OS_1,Operating_System_OS_2,Operating_System_OS_3,Operating_System_OS_4,Operating_System_OS_5,Operating_System_OS_6
0,U_4,12/11/2021 20:55:00,12/11/2021 20:55:00,B_1,Iran,Page_6,24,0,1,0,0,0,1,0,0,0,0,0
1,U_4,12/11/2021 20:55:00,12/11/2021 20:56:00,B_1,Iran,Page_7,169,0,1,0,0,0,1,0,0,0,0,0
2,U_4,12/11/2021 20:55:00,12/11/2021 20:58:49,B_1,Iran,Page_6,253,0,1,0,0,0,1,0,0,0,0,0
3,U_8,12/28/2021 11:56:00,12/28/2021 11:56:35,B_1,Iran,Page_3,11,0,1,0,0,0,1,0,0,0,0,0
4,U_8,12/28/2021 11:56:00,12/28/2021 11:57:45,B_1,Iran,Page_13,13,0,1,0,0,0,1,0,0,0,0,0


<br></br>
### <b>3.3. Reshape Eventlog</b>

In [20]:
from tqdm import tqdm

In [22]:
reshaped_event_log_lst = []

for group_name, group in tqdm(event_log.groupby('User_ID')):
    group.sort_values("Activity_Start_Date", inplace=True)
    group.reset_index(drop=True, inplace=True)

    # Input Variables
    prefix = [tuple(group["Visited_Page"].values[:i+1]) for i in range(len(group))]
    elapsed_time = [sum(group["Time_on_Page"].values[:i+1]) for i in range(len(group))]
    number_of_visited_page = [len(group["Visited_Page"].values[:i+1]) for i in range(len(group))]

    operating_systems = {}
    devices = {}
    for col in group.columns:
        if 'OS_' in col:
            operating_systems[col] = [max(group[col]) for i in range(len(group))]
        if 'Device_' in col:
            devices[col] = [max(group[col]) for i in range(len(group))]

    # Output Variable
    next_page = [group["Visited_Page"].values[i+1] for i in range(len(group) - 1)] + ["End"]

    reshaped_group = pd.DataFrame({
        'Visited_Page': prefix,
        'Elapsed_Time': elapsed_time,
        '#Visited_Page': number_of_visited_page,
        'Next_Page': next_page,
        **operating_systems,
        **devices,
    })

    reshaped_event_log_lst.append(reshaped_group)

reshaped_event_log = pd.concat(reshaped_event_log_lst, axis=0)

100%|██████████████████████████████████████████████████████████████████████████████| 2483/2483 [00:02<00:00, 833.12it/s]


In [23]:
reshaped_event_log.reset_index(drop=True, inplace=True)
reshaped_event_log.shape

(14759, 15)

<br></br>
### <b>3.4. Making dummy variables for Visited Pages</b>

In [24]:
reshaped_event_log = pd.get_dummies(reshaped_event_log, columns=['Visited_Page'], prefix="Visited")

reshaped_event_log.head()

Unnamed: 0,Elapsed_Time,#Visited_Page,Next_Page,Operating_System_OS_0,Operating_System_OS_1,Operating_System_OS_2,Operating_System_OS_3,Operating_System_OS_4,Operating_System_OS_5,Operating_System_OS_6,...,"Visited_('Page_98', 'Page_98', 'Page_91', 'Page_98')","Visited_('Page_98', 'Page_98', 'Page_91', 'Page_98', 'Page_91')","Visited_('Page_98', 'Page_98', 'Page_91', 'Page_98', 'Page_91', 'Page_98')","Visited_('Page_99',)","Visited_('Page_99', 'Page_273')","Visited_('Page_99', 'Page_273', 'Page_330')","Visited_('Page_99', 'Page_6')","Visited_('Page_99', 'Page_6', 'Page_3')","Visited_('Page_99', 'Page_6', 'Page_3', 'Page_13')","Visited_('Page_99', 'Page_6', 'Page_3', 'Page_13', 'Page_102')"
0,161,1,Page_27,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,394,2,Page_3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,419,3,Page_3,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,435,4,Page_31,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,813,5,End,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


<br></br>
### <b>3.5. Train Models</b>

In [25]:
x_columns_name = ["Elapsed_Time", "#Visited_Page"] + [item for item in reshaped_event_log.columns if "Visited_" in item or
                                                                                                     "Device_" in item or 
                                                                                                     "OS_" in item]
y_column_name = ["Next_Page"]

In [34]:
# Train Test Split
x = reshaped_event_log[x_columns_name]
y = reshaped_event_log[y_column_name]

y = np.array(y).flatten()

label_encoder = LabelEncoder()

# Fit and transform the y array to integer labels
y = label_encoder.fit_transform(y)
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)

<br></br>
#### <b>3.5.1. Logistic Regression</b>

In [None]:
# Fit Mode
model = LogisticRegression(solver='liblinear', random_state=0)

# model = LogisticRegression(solver='liblinear', C=0.05, multi_class='ovr', random_state=0)
model.fit(x_train, y_train)

In [None]:
y_pred = model.predict(x_test)

print(model.score(x_train, y_train))
print(model.score(x_test, y_test))

<br></br>
#### <b>3.5.2. KNN</b>

In [36]:
knn_model = KNeighborsClassifier(n_neighbors=12)
knn_model.fit(x_train, y_train)

In [37]:
pred = knn_model.predict(x_test)
accuracy_score(pred, y_test)

0.07621951219512195

<br></br>
#### <b>3.5.3. Neural Network</b>

In [38]:
# Train Test split
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

y_encoded = to_categorical([*y_train, *y_test])

y_train_encoded = y_encoded[:y_train.shape[0]]
y_test_encoded = y_encoded[y_train.shape[0]:]

In [None]:
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)


# Creating the neural network model
model = keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(x_train.shape[1],)),
    Dropout(0.3),
    layers.Dense(32, activation='relu'),
    Dropout(0.2),
    layers.Dense(y_encoded.shape[1], activation='softmax')
])

# Compiling the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# # Early stopping callback
# early_stopping = EarlyStopping(patience=20, restore_best_weights=True)

# Training the model
epochs = 100
batch_size = 32
model.fit(x_train, y_train_encoded, epochs=epochs, batch_size=batch_size, validation_split=0.2)

# # Evaluating the model on the test set
# test_loss, test_accuracy = model.evaluate(x_test, y_test_encoded)
# print(f"Test Accuracy: {test_accuracy}")

In [216]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 32)                284576    
                                                                 
 dropout (Dropout)           (None, 32)                0         
                                                                 
 dense_1 (Dense)             (None, 32)                1056      
                                                                 
 dropout_1 (Dropout)         (None, 32)                0         
                                                                 
 dense_2 (Dense)             (None, 376)               12408     
                                                                 
Total params: 298040 (1.14 MB)
Trainable params: 298040 (1.14 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


<br></br>
#### <b>3.5.4. Calculate recommendation accuracy</b>

In [224]:
def get_top_n_classes(probs, n):
    top_n_indices = np.argsort(probs)[-n:]
    return top_n_indices


# Make predictions on the test set
predictions = model.predict(x_test)

# predictions

# Initialize variables for accuracy calculation
num_correct = 0
total_samples = len(x_test)

# Evaluate accuracy based on recommending the top five classes
for i in range(total_samples):
    true_class = y_test[i]
    predicted_probs = predictions[i]
    top_classes = get_top_n_classes(predicted_probs, n=5)
    if true_class in top_classes:
        num_correct += 1

# Calculate and print the accuracy
accuracy = num_correct / total_samples
print(f"Accuracy based on recommending top five classes: {accuracy}")

Accuracy based on recommending top five classes: 0.2609791216702664


In [222]:
true_class

45

In [223]:
top_classes

array([316,  58, 323, 213, 312, 191, 169,   2, 114,   1])