# Objective:
    1. Show how the dataset is structured
    2. Explore a bit about customer behaviour based on the data and do basic customer segmentation
    3. Recommendations for future analysis

This dataset was taken from the Retail Rocket Recommender System dataset: https://www.kaggle.com/retailrocket/ecommerce-dataset/home

And data was between June 2, 2015 and August 1, 2015

In [209]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm.auto import tqdm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder , MinMaxScaler , StandardScaler , RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

Let us load the Retail Rocket CSV files into DataFrames

In [210]:
dtypes_events = {
    "timestamp": "int64",
    "visitorid": "int64",
    "event": "category",
    "itemid": "int64",
    "transactionid": "float64",  # nullable; will cast to Int64 later if needed
}

events = pd.read_csv('data/events.csv' ,  dtype=dtypes_events)
category_tree_df = pd.read_csv('data/category_tree.csv')
item_properties_1_df = pd.read_csv('data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv('data/item_properties_part2.csv')

In [211]:
if "transactionid" in events:
    events["transactionid"] = events["transactionid"].astype("Int64")

events = events.dropna(subset=["visitorid", "itemid", "timestamp"])

In [212]:
events = events.drop_duplicates(
    subset=["timestamp", "visitorid", "event", "itemid", "transactionid"]
)

In [213]:
user_counts = events.groupby("visitorid")["itemid"].nunique()
user_counts

visitorid
0          3
1          1
2          4
3          1
4          1
          ..
1407575    1
1407576    1
1407577    1
1407578    1
1407579    1
Name: itemid, Length: 1407580, dtype: int64

In [214]:
good_users = user_counts[user_counts >= 3].index
events = events[events["visitorid"].isin(good_users)]

In [215]:
item_user_counts = events.groupby("itemid")["visitorid"].nunique()
good_items = item_user_counts[item_user_counts >= 50].index
events = events[events["itemid"].isin(good_items)]

In [216]:
t = events["timestamp"].values
t_train_end = np.quantile(t, 0.9)

In [217]:
events.drop(["transactionid"] , inplace=True , axis=1)

In [218]:
# events = events.sort_values(["visitorid", "timestamp"])
# events["next_event"] = events.groupby("visitorid")["event"].shift(-1)
# events["prev_event"] = events.groupby("visitorid")["event"].shift(1)
# events["prev_event"] = events["prev_event"].cat.add_categories(["START"])
# events["prev_event"] = events["prev_event"].fillna("START")
# events["next_event"] = events["next_event"].cat.add_categories(["END"])
# events["next_event"] = events["next_event"].fillna("END")


In [219]:
events = events.sort_values(["visitorid", "timestamp"])
events["prev_event"] = events.groupby("visitorid")["event"].shift(1)
events["prev_event"] = events["prev_event"].cat.add_categories(["START"])
events["prev_event"] = events["prev_event"].fillna("START")
events["user_event_count"] = events.groupby("visitorid").cumcount()
events["item_freq"] = events.groupby("itemid").cumcount()


In [220]:
events

Unnamed: 0,timestamp,visitorid,event,itemid,prev_event,user_event_count,item_freq
1367342,1442004917175,0,view,67045,START,0,0
737711,1438970212664,2,view,259884,START,0,0
726292,1438970468920,2,view,216305,view,1,0
737615,1438970905669,2,view,342816,view,2,0
735202,1438971444375,2,view,342816,view,3,1
...,...,...,...,...,...,...,...
339600,1434585451071,1407527,view,85307,view,4,79
2621445,1437942621780,1407530,view,20981,START,0,134
2622577,1437964225112,1407530,view,126396,view,1,100
656195,1438648367715,1407573,view,82278,START,0,134


In [221]:
train = events[events["timestamp"] <= t_train_end].copy()
test = events[events["timestamp"] > t_train_end].copy()

In [222]:
item_props = pd.concat([item_properties_1_df, item_properties_2_df], ignore_index=True)

In [223]:
item_props["timestamp"] = item_props["timestamp"].astype("int64")
item_props["itemid"] = item_props["itemid"].astype("int64")
item_props["property"] = item_props["property"].astype("string")
item_props["value"] = item_props["value"].astype("string")

In [224]:
ip_keep = item_props[item_props["property"].isin(["available"])].copy()

ip_keep = ip_keep.sort_values(["itemid", "property", "timestamp"])
ip_last = ip_keep.groupby(["itemid", "property"], as_index=False).tail(1)

ip_wide = ip_last.pivot(index="itemid", columns="property", values="value").reset_index()
ip_wide.sample(5)


property,itemid,available
299836,335529,0
144118,161211,0
224180,250908,0
271677,303997,0
343911,384906,0


In [225]:
if "available" in ip_wide:
    ip_wide["available"] = (
        ip_wide["available"].astype(str).str.lower().map({"1": 1, "0": 0})
    )

def join_item_features(df, feat):
    return df.merge(feat, on="itemid", how="left")


train = join_item_features(train, ip_wide)
test = join_item_features(test, ip_wide)


pos_events = {"transaction", "addtocart"}
for split in [train, test]:
    split["y"] = split["event"].isin(pos_events).astype("int8")

train.to_parquet("rr_train.parquet", index=False)
test.to_parquet("rr_test.parquet", index=False)

In [226]:
for df in [train, test]:
    if "available" in df:
        df["available"] = df["available"].fillna(0)


In [227]:
train.drop(["timestamp" , "event"] , inplace=True , axis=1)
test.drop(["timestamp" , "event"] , inplace=True , axis=1)

In [228]:
test

Unnamed: 0,visitorid,itemid,prev_event,user_event_count,item_freq,available,y
0,0,67045,START,0,0,1.0,0
1,6,344723,view,5,3,1.0,0
2,54,283115,START,0,0,1.0,0
3,54,38965,view,1,0,1.0,0
4,54,319680,view,2,0,1.0,0
...,...,...,...,...,...,...,...
21500,1406536,95977,view,2,132,1.0,0
21501,1406536,354153,view,3,149,1.0,0
21502,1406646,188395,START,0,85,1.0,0
21503,1406893,206817,START,0,70,1.0,0


In [229]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193542 entries, 0 to 193541
Data columns (total 7 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   visitorid         193542 non-null  int64   
 1   itemid            193542 non-null  int64   
 2   prev_event        193542 non-null  category
 3   user_event_count  193542 non-null  int64   
 4   item_freq         193542 non-null  int64   
 5   available         193542 non-null  float64 
 6   y                 193542 non-null  int8    
dtypes: category(1), float64(1), int64(4), int8(1)
memory usage: 7.8 MB


In [230]:
col = "y"

last_col_train = train[col]
last_col_test = test[col]

train = train.drop(columns=[col])
test = test.drop(columns=[col])

train[col] = last_col_train
test[col] = last_col_test

In [231]:
x_train = train.iloc[: , :-1]
x_test = test.iloc[: , :-1]
y_train = train.iloc[: , -1]
y_test = test.iloc[: , -1]

In [232]:
x_train

Unnamed: 0,visitorid,itemid,prev_event,user_event_count,item_freq,available
0,2,259884,START,0,0,1.0
1,2,216305,view,1,0,1.0
2,2,342816,view,2,0,1.0
3,2,342816,view,3,1,1.0
4,2,216305,view,4,1,1.0
...,...,...,...,...,...,...
193537,1407527,85307,view,4,79,1.0
193538,1407530,20981,START,0,134,1.0
193539,1407530,126396,view,1,100,0.0
193540,1407573,82278,START,0,134,0.0


In [233]:
cat_col = ["prev_event"]

ohe = OneHotEncoder(handle_unknown="ignore" ,  sparse_output=False)

ohe.fit(x_train[cat_col])

train_ohe = pd.DataFrame(
    ohe.transform(x_train[cat_col]),
    columns=ohe.get_feature_names_out(cat_col),
    index=x_train.index,
)

x_train = pd.concat([x_train.drop(columns=cat_col), train_ohe], axis=1)

test_ohe = pd.DataFrame(
    ohe.transform(x_test[cat_col]),
    columns=ohe.get_feature_names_out(cat_col),
    index=x_test.index,
)

x_test = pd.concat([x_test.drop(columns=cat_col), test_ohe], axis=1)


In [234]:
x_train

Unnamed: 0,visitorid,itemid,user_event_count,item_freq,available,prev_event_START,prev_event_addtocart,prev_event_transaction,prev_event_view
0,2,259884,0,0,1.0,1.0,0.0,0.0,0.0
1,2,216305,1,0,1.0,0.0,0.0,0.0,1.0
2,2,342816,2,0,1.0,0.0,0.0,0.0,1.0
3,2,342816,3,1,1.0,0.0,0.0,0.0,1.0
4,2,216305,4,1,1.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...
193537,1407527,85307,4,79,1.0,0.0,0.0,0.0,1.0
193538,1407530,20981,0,134,1.0,1.0,0.0,0.0,0.0
193539,1407530,126396,1,100,0.0,0.0,0.0,0.0,1.0
193540,1407573,82278,0,134,0.0,1.0,0.0,0.0,0.0


In [235]:
y_train

0         0
1         0
2         0
3         0
4         0
         ..
193537    0
193538    0
193539    0
193540    0
193541    0
Name: y, Length: 193542, dtype: int8

In [236]:
# mm = MinMaxScaler()

# x_train.iloc[: , :2] = mm.fit_transform(x_train.iloc[: , :2])
# x_test.iloc[: , :2] = mm.fit_transform(x_test.iloc[: , :2])

ss = StandardScaler()

x_train = ss.fit_transform(x_train)
x_test = ss.fit_transform(x_test)

# rs = RobustScaler()

# x_train = rs.fit_transform(x_train)
# x_test = rs.fit_transform(x_test)

In [237]:
# from imblearn.over_sampling import SMOTE

# sm = SMOTE()
# x_train, y_train = sm.fit_resample(x_train, y_train)

In [238]:
# from imblearn.under_sampling import RandomUnderSampler

# rus = RandomUnderSampler()
# x_train, y_train = rus.fit_resample(x_train, y_train)

In [239]:
y_train.value_counts()

y
0    181215
1     12327
Name: count, dtype: int64

In [240]:
pos = (y_train == 1).sum()
neg = (y_train == 0).sum()
scale_pos_weight = neg / pos

print("scale_pos_weight =", scale_pos_weight)


scale_pos_weight = 14.7006570941835


In [241]:

xgbModel = XGBClassifier(
    n_estimators=700,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mlogloss",
    use_label_encoder=False,
    scale_pos_weight=scale_pos_weight 
)


rf = RandomForestClassifier(
    n_estimators=700,
    max_depth=5,
    random_state=42,
    class_weight="balanced"
)

logreg = LogisticRegression(max_iter=1000, random_state=42 , class_weight="balanced")
knn = KNeighborsClassifier(n_neighbors=7)

estimators = [("xgb", xgbModel), ("rf", rf), ("logreg", logreg), ("knn", knn)]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=XGBClassifier(
        n_estimators=700,
        max_depth=5,
        random_state=42,
        learning_rate=0.05,
        scale_pos_weight=scale_pos_weight 
    ),
    passthrough=True,
    n_jobs=-1,
)

In [242]:
stack_model.fit(x_train, y_train)

0,1,2
,estimators,"[('xgb', ...), ('rf', ...), ...]"
,final_estimator,"XGBClassifier...ree=None, ...)"
,cv,
,stack_method,'auto'
,n_jobs,-1
,passthrough,True
,verbose,0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,700
,criterion,'gini'
,max_depth,5
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [243]:
# y_train_stack_model = stack_model.predict(x_train)
# y_test_stack_model = stack_model.predict(x_test)

In [244]:
best_threshold = 0.58

probs_train = stack_model.predict_proba(x_train)[:,1]
probs_test = stack_model.predict_proba(x_test)[:,1]
preds_train = (probs_train > best_threshold).astype(int)
preds_test = (probs_test > best_threshold).astype(int)


In [245]:
print("train data : ")
print("Accuracy:", metrics.accuracy_score(y_train, preds_train))
print("\nClassification Report:\n", metrics.classification_report(y_train, preds_train))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_train, preds_train))
print("\ntest data : ")
print("Accuracy:", metrics.accuracy_score(y_test, preds_test))
print("\nClassification Report:\n", metrics.classification_report(y_test, preds_test))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_test, preds_test)) 

train data : 
Accuracy: 0.888334315032396

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94    181215
           1       0.28      0.48      0.35     12327

    accuracy                           0.89    193542
   macro avg       0.62      0.70      0.65    193542
weighted avg       0.92      0.89      0.90    193542


Confusion Matrix:
 [[166018  15197]
 [  6415   5912]]

test data : 
Accuracy: 0.8877935363868867

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.92      0.94     20154
           1       0.26      0.43      0.32      1351

    accuracy                           0.89     21505
   macro avg       0.61      0.67      0.63     21505
weighted avg       0.92      0.89      0.90     21505


Confusion Matrix:
 [[18516  1638]
 [  775   576]]
