# Objective:
    1. Show how the dataset is structured
    2. Explore a bit about customer behaviour based on the data and do basic customer segmentation
    3. Recommendations for future analysis

This dataset was taken from the Retail Rocket Recommender System dataset: https://www.kaggle.com/retailrocket/ecommerce-dataset/home

And data was between June 2, 2015 and August 1, 2015

In [782]:
import pandas as pd
import numpy as np


import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn import metrics
from tqdm.auto import tqdm
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import OneHotEncoder , MinMaxScaler , StandardScaler , RobustScaler
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier

Let us load the Retail Rocket CSV files into DataFrames

In [783]:
dtypes_events = {
    "timestamp": "int64",
    "visitorid": "int64",
    "event": "category",
    "itemid": "int64",
    "transactionid": "float64",  # nullable; will cast to Int64 later if needed
}

events = pd.read_csv('data/events.csv' ,  dtype=dtypes_events)
category_tree_df = pd.read_csv('data/category_tree.csv')
item_properties_1_df = pd.read_csv('data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv('data/item_properties_part2.csv')

In [784]:
if "transactionid" in events:
    events["transactionid"] = events["transactionid"].astype("Int64")

events = events.dropna(subset=["visitorid", "itemid", "timestamp"])

In [785]:
events = events.drop_duplicates(
    subset=["timestamp", "visitorid", "event", "itemid", "transactionid"]
)

In [786]:
user_counts = events.groupby("visitorid")["itemid"].nunique()
user_counts

visitorid
0          3
1          1
2          4
3          1
4          1
          ..
1407575    1
1407576    1
1407577    1
1407578    1
1407579    1
Name: itemid, Length: 1407580, dtype: int64

In [787]:
good_users = user_counts[user_counts >= 3].index
events = events[events["visitorid"].isin(good_users)]

In [788]:
item_user_counts = events.groupby("itemid")["visitorid"].nunique()
good_items = item_user_counts[item_user_counts >= 50].index
events = events[events["itemid"].isin(good_items)]

In [789]:
t = events["timestamp"].values
t_train_end = np.quantile(t, 0.9)

In [790]:
events.drop(["transactionid"] , inplace=True , axis=1)

In [791]:

events['prev_event'] = events.groupby('visitorid')['event'].shift(1)
events['prev_event'] = events['prev_event'].cat.add_categories(["start"])
events['prev_event'] = events['prev_event'].fillna("start")

events.drop(["event"] , inplace=True , axis=1)

In [792]:
events

Unnamed: 0,timestamp,visitorid,itemid,prev_event
46,1433223785793,392042,280375,start
50,1433223790254,392042,16813,view
104,1433222930873,428642,167337,start
125,1433222860128,428642,190172,view
140,1433222793547,761633,432152,start
...,...,...,...,...
2756020,1438397370584,890976,219933,view
2756028,1438398728888,1176313,303715,view
2756078,1438399807937,804736,447661,view
2756087,1438397455397,611270,432925,view


In [793]:
train = events[events["timestamp"] <= t_train_end].copy()
test = events[events["timestamp"] > t_train_end].copy()

In [794]:
item_props = pd.concat([item_properties_1_df, item_properties_2_df], ignore_index=True)

In [795]:
item_props["timestamp"] = item_props["timestamp"].astype("int64")
item_props["itemid"] = item_props["itemid"].astype("int64")
item_props["property"] = item_props["property"].astype("string")
item_props["value"] = item_props["value"].astype("string")

In [796]:
ip_keep = item_props[item_props["property"].isin(["available"])].copy()

ip_keep = ip_keep.sort_values(["itemid", "property", "timestamp"])
ip_last = ip_keep.groupby(["itemid", "property"], as_index=False).tail(1)

ip_wide = ip_last.pivot(index="itemid", columns="property", values="value").reset_index()
ip_wide.sample(5)


property,itemid,available
282926,316610,0
61273,68553,0
302679,338725,0
312105,349322,0
295251,330396,0


In [797]:
if "available" in ip_wide:
    ip_wide["available"] = (
        ip_wide["available"].astype(str).str.lower().map({"1": 1, "0": 0})
    )

def join_item_features(df, feat):
    return df.merge(feat, on="itemid", how="left")


train = join_item_features(train, ip_wide)
test = join_item_features(test, ip_wide)


pos_events = {"transaction", "addtocart"}
for split in [train, test]:
    split["y"] = split["prev_event"].isin(pos_events).astype("int8")

train.to_parquet("rr_train.parquet", index=False)
test.to_parquet("rr_test.parquet", index=False)

In [798]:
for df in [train, test]:
    if "available" in df:
        df["available"] = df["available"].fillna(0)


In [799]:
train.drop(["timestamp"] , inplace=True , axis=1)
test.drop(["timestamp"] , inplace=True , axis=1)

In [800]:
test

Unnamed: 0,visitorid,itemid,prev_event,available,y
0,975283,203002,start,1.0,0
1,530559,16813,view,1.0,0
2,1397781,455223,start,1.0,0
3,530559,441852,transaction,0.0,1
4,1210965,174815,start,0.0,0
...,...,...,...,...,...
21500,518438,46443,view,1.0,0
21501,994820,150882,view,1.0,0
21502,152963,334401,view,1.0,0
21503,133942,51969,start,1.0,0


In [801]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 193542 entries, 0 to 193541
Data columns (total 5 columns):
 #   Column      Non-Null Count   Dtype   
---  ------      --------------   -----   
 0   visitorid   193542 non-null  int64   
 1   itemid      193542 non-null  int64   
 2   prev_event  193542 non-null  category
 3   available   193542 non-null  float64 
 4   y           193542 non-null  int8    
dtypes: category(1), float64(1), int64(2), int8(1)
memory usage: 4.8 MB


In [802]:
col = "y"

last_col_train = train[col]
last_col_test = test[col]

train = train.drop(columns=[col])
test = test.drop(columns=[col])

train[col] = last_col_train
test[col] = last_col_test

In [803]:
x_train = train.iloc[: , :-1]
x_test = test.iloc[: , :-1]
y_train = train.iloc[: , -1]
y_test = test.iloc[: , -1]

In [804]:
x_train

Unnamed: 0,visitorid,itemid,prev_event,available
0,392042,280375,start,1.0
1,392042,16813,view,1.0
2,428642,167337,start,0.0
3,428642,190172,view,0.0
4,761633,432152,start,1.0
...,...,...,...,...
193537,890976,219933,view,1.0
193538,1176313,303715,view,0.0
193539,804736,447661,view,1.0
193540,611270,432925,view,0.0


In [805]:
cat_col = ["prev_event"]

ohe = OneHotEncoder(handle_unknown="ignore" ,  sparse_output=False)

ohe.fit(x_train[cat_col])

train_ohe = pd.DataFrame(
    ohe.transform(x_train[cat_col]),
    columns=ohe.get_feature_names_out(cat_col),
    index=x_train.index,
)

x_train = pd.concat([x_train.drop(columns=cat_col), train_ohe], axis=1)

test_ohe = pd.DataFrame(
    ohe.transform(x_test[cat_col]),
    columns=ohe.get_feature_names_out(cat_col),
    index=x_test.index,
)

x_test = pd.concat([x_test.drop(columns=cat_col), test_ohe], axis=1)


In [806]:
x_train

Unnamed: 0,visitorid,itemid,available,prev_event_addtocart,prev_event_start,prev_event_transaction,prev_event_view
0,392042,280375,1.0,0.0,1.0,0.0,0.0
1,392042,16813,1.0,0.0,0.0,0.0,1.0
2,428642,167337,0.0,0.0,1.0,0.0,0.0
3,428642,190172,0.0,0.0,0.0,0.0,1.0
4,761633,432152,1.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...
193537,890976,219933,1.0,0.0,0.0,0.0,1.0
193538,1176313,303715,0.0,0.0,0.0,0.0,1.0
193539,804736,447661,1.0,0.0,0.0,0.0,1.0
193540,611270,432925,0.0,0.0,0.0,0.0,1.0


In [807]:
y_train

0         0
1         0
2         0
3         0
4         0
         ..
193537    0
193538    0
193539    0
193540    0
193541    1
Name: y, Length: 193542, dtype: int8

In [808]:
mm = MinMaxScaler()

x_train.iloc[: , :2] = mm.fit_transform(x_train.iloc[: , :2])
x_test.iloc[: , :2] = mm.fit_transform(x_test.iloc[: , :2])

# ss = StandardScaler()

# x_train = ss.fit_transform(x_train)
# x_test = ss.fit_transform(x_test)

# rs = RobustScaler()

# x_train = rs.fit_transform(x_train)
# x_test = rs.fit_transform(x_test)

  x_train.iloc[: , :2] = mm.fit_transform(x_train.iloc[: , :2])
  x_train.iloc[: , :2] = mm.fit_transform(x_train.iloc[: , :2])
  x_test.iloc[: , :2] = mm.fit_transform(x_test.iloc[: , :2])
  x_test.iloc[: , :2] = mm.fit_transform(x_test.iloc[: , :2])


In [809]:
from imblearn.over_sampling import SMOTE

sm = SMOTE()
x_train, y_train = sm.fit_resample(x_train, y_train)

In [810]:
# from imblearn.under_sampling import RandomUnderSampler

# rus = RandomUnderSampler()
# x_train, y_train = rus.fit_resample(x_train, y_train)

In [811]:
y_train.value_counts()

y
0    182667
1    182667
Name: count, dtype: int64

In [812]:

xgbModel = XGBClassifier(
    n_estimators=600,
    max_depth=4,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric="mlogloss",
    use_label_encoder=False,
)


rf = RandomForestClassifier(
    n_estimators=600,
    max_depth=4,
    random_state=42,
    class_weight="balanced"
)

logreg = LogisticRegression(max_iter=1000, random_state=42 , class_weight="balanced")
knn = KNeighborsClassifier(n_neighbors=7)

estimators = [("xgb", xgbModel), ("rf", rf), ("logreg", logreg), ("knn", knn)]

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=XGBClassifier(
        n_estimators=600,
        max_depth=4,
        random_state=42,
        learning_rate=0.05,
    ),
    passthrough=True,
    n_jobs=-1,
)

In [813]:
stack_model.fit(x_train, y_train)

0,1,2
,estimators,"[('xgb', ...), ('rf', ...), ...]"
,final_estimator,"XGBClassifier...ree=None, ...)"
,cv,
,stack_method,'auto'
,n_jobs,-1
,passthrough,True
,verbose,0

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,0.8
,device,
,early_stopping_rounds,
,enable_categorical,False

0,1,2
,n_estimators,600
,criterion,'gini'
,max_depth,4
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,'balanced'
,random_state,42
,solver,'lbfgs'
,max_iter,1000

0,1,2
,n_neighbors,7
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,

0,1,2
,objective,'binary:logistic'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [814]:
y_train_stack_model = stack_model.predict(x_train)
y_test_stack_model = stack_model.predict(x_test)

In [815]:
print("train data : ")
print("Accuracy:", metrics.accuracy_score(y_train, y_train_stack_model))
print("\nClassification Report:\n", metrics.classification_report(y_train, y_train_stack_model))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_train, y_train_stack_model))
print("\ntest data : ")
print("Accuracy:", metrics.accuracy_score(y_test, y_test_stack_model))
print("\nClassification Report:\n", metrics.classification_report(y_test, y_test_stack_model))
print("\nConfusion Matrix:\n", metrics.confusion_matrix(y_test, y_test_stack_model))

train data : 
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00    182667
           1       1.00      1.00      1.00    182667

    accuracy                           1.00    365334
   macro avg       1.00      1.00      1.00    365334
weighted avg       1.00      1.00      1.00    365334


Confusion Matrix:
 [[182667      0]
 [     0 182667]]

test data : 
Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00     20316
           1       1.00      1.00      1.00      1189

    accuracy                           1.00     21505
   macro avg       1.00      1.00      1.00     21505
weighted avg       1.00      1.00      1.00     21505


Confusion Matrix:
 [[20316     0]
 [    0  1189]]
