# Objective:
    1. Show how the dataset is structured
    2. Explore a bit about customer behaviour based on the data and do basic customer segmentation
    3. Recommendations for future analysis

This dataset was taken from the Retail Rocket Recommender System dataset: https://www.kaggle.com/retailrocket/ecommerce-dataset/home

And data was between June 2, 2015 and August 1, 2015

In [1]:
import pandas as pd
import numpy as np

import datetime 
import time

%matplotlib inline
import matplotlib.pyplot as plt 
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


Let us load the Retail Rocket CSV files into DataFrames

In [2]:
events_df = pd.read_csv('data/events.csv')
category_tree_df = pd.read_csv('data/category_tree.csv')
item_properties_1_df = pd.read_csv('data/item_properties_part1.csv')
item_properties_2_df = pd.read_csv('data/item_properties_part2.csv')
items_df = pd.concat([item_properties_1_df , item_properties_2_df])

# Let's take a peek at the Events dataframe

In [3]:
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid
0,1433221332117,257597,view,355908,
1,1433224214164,992329,view,248676,
2,1433221999827,111016,view,318965,
3,1433221955914,483717,view,253185,
4,1433221337106,951259,view,367447,


The timestamp portion is in Unix Epoch format e.g. 1433221332117 will be converted to Tuesday, 2 June 2015 5:02:12.117 AM GMT

Visitor Id is the unique user currently browsing the website

Event is what the user is currently doing in that current timestamp

Transaction ID will only have value if the user made a purchase as shown below

In [4]:
events_df = events_df.drop_duplicates()
events_df['ts'] = pd.to_datetime(events_df['timestamp'], unit='ms') 
events_df = events_df.sort_values('ts')

items_df = items_df.drop_duplicates()
items_df['ts'] = pd.to_datetime(items_df['timestamp'], unit='ms') 
items_df = items_df.sort_values('ts')


In [5]:
events_df = events_df.dropna(subset=['itemid', 'timestamp'])
events_df = events_df[events_df['event'].isin(['view', 'addtocart', 'transaction'])]


In [6]:
weight_map = {'view':1, 'addtocart':3, 'transaction':6}
events_df['weight'] = events_df['event'].map(weight_map).fillna(0)


In [7]:
events_df.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,ts,weight
1462974,1430622004384,693516,addtocart,297662,,2015-05-03 03:00:04.384,3
1464806,1430622011289,829044,view,60987,,2015-05-03 03:00:11.289,1
1463000,1430622013048,652699,view,252860,,2015-05-03 03:00:13.048,1
1465287,1430622024154,1125936,view,33661,,2015-05-03 03:00:24.154,1
1462955,1430622026228,693516,view,297662,,2015-05-03 03:00:26.228,1


In [8]:
items_df.head()

Unnamed: 0,timestamp,itemid,property,value,ts
5903679,1431226800000,317951,790,n32880.000,2015-05-10 03:00:00
5668945,1431226800000,422842,480,1133979,2015-05-10 03:00:00
314220,1431226800000,310185,776,103591,2015-05-10 03:00:00
4170323,1431226800000,110973,112,679677,2015-05-10 03:00:00
4170324,1431226800000,179597,available,0,2015-05-10 03:00:00


In [9]:
events_df = events_df[:500000]
items_df = items_df[:500000]

In [10]:
events_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 1462974 to 1956044
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype         
---  ------         --------------   -----         
 0   timestamp      500000 non-null  int64         
 1   visitorid      500000 non-null  int64         
 2   event          500000 non-null  object        
 3   itemid         500000 non-null  int64         
 4   transactionid  3879 non-null    float64       
 5   ts             500000 non-null  datetime64[ns]
 6   weight         500000 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(4), object(1)
memory usage: 30.5+ MB


In [11]:
items_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 500000 entries, 5903679 to 8366927
Data columns (total 5 columns):
 #   Column     Non-Null Count   Dtype         
---  ------     --------------   -----         
 0   timestamp  500000 non-null  int64         
 1   itemid     500000 non-null  int64         
 2   property   500000 non-null  object        
 3   value      500000 non-null  object        
 4   ts         500000 non-null  datetime64[ns]
dtypes: datetime64[ns](1), int64(2), object(2)
memory usage: 22.9+ MB


In [12]:
items_df['timestamp'] = pd.to_datetime(items_df['timestamp'] , unit='ms')
events_df['timestamp'] = pd.to_datetime(events_df['timestamp'] , unit='ms')

In [13]:
events_sorted = events_df.sort_values('timestamp')
item_props_sorted = items_df.sort_values('ts')

ipivot = item_props_sorted.pivot_table(index=['itemid','ts'], columns='property', values='value', aggfunc='last').reset_index()

events_sorted = events_sorted.sort_values(['itemid', 'timestamp']).reset_index(drop=True)
ipivot_sorted = ipivot.sort_values(['itemid', 'ts']).reset_index(drop=True)

last_snapshot = ipivot_sorted.sort_values('ts').groupby('itemid').last().reset_index()

merged = events_sorted.merge(last_snapshot, on='itemid', how='left')

In [14]:
merged.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,ts_x,weight,ts_y,10,100,...,966,970,976,977,980,984,989,999,available,categoryid
0,2015-05-06 20:33:13.122,330981,view,6,,2015-05-06 20:33:13.122,1,NaT,,,...,,,,,,,,,,
1,2015-05-06 20:35:35.962,330981,view,6,,2015-05-06 20:35:35.962,1,NaT,,,...,,,,,,,,,,
2,2015-05-20 20:39:23.871,275013,view,6,,2015-05-20 20:39:23.871,1,NaT,,,...,,,,,,,,,,
3,2015-05-26 00:04:29.485,316532,view,6,,2015-05-26 00:04:29.485,1,NaT,,,...,,,,,,,,,,
4,2015-05-13 22:53:14.505,1205411,view,9,,2015-05-13 22:53:14.505,1,NaT,,,...,,,,,,,,,,


In [15]:
ui = merged.groupby(['visitorid', 'itemid'])['weight'].sum().reset_index()

now = merged['timestamp'].max()
merged['age_days'] = (now - merged['timestamp']).dt.days
lambda_ = 0.01
merged['decayed_w'] = merged['weight'] * np.exp(-lambda_*merged['age_days'])

ui_decayed = merged.groupby(['visitorid','itemid'])['decayed_w'].sum().reset_index().rename(columns={'decayed_w':'weight'})


In [16]:
ui_decayed.head()

Unnamed: 0,visitorid,itemid,weight
0,7,139394,0.88692
1,7,164941,0.88692
2,7,226353,0.895834
3,12,70225,0.794534
4,16,382645,0.869358


In [17]:
idx = merged.groupby('visitorid')['timestamp'].idxmax()
test = merged.loc[idx]
train = merged.drop(idx)


In [18]:
test.head()

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,ts_x,weight,ts_y,10,100,...,976,977,980,984,989,999,available,categoryid,age_days,decayed_w
237516,2015-05-16 04:20:39.214,7,view,226353,,2015-05-16 04:20:39.214,1,2015-05-10 03:00:00,,,...,,,,,,,0.0,642.0,11,0.895834
72429,2015-05-03 15:50:12.744,12,view,70225,,2015-05-03 15:50:12.744,1,2015-05-10 03:00:00,,,...,,,,,,,0.0,,23,0.794534
412177,2015-05-12 13:17:47.164,16,view,382645,,2015-05-12 13:17:47.164,1,NaT,,,...,,,,,,,,,14,0.869358
87282,2015-05-23 01:57:17.390,19,view,84663,,2015-05-23 01:57:17.390,1,NaT,,,...,,,,,,,,,4,0.960789
481736,2015-05-22 18:23:43.676,24,view,449369,,2015-05-22 18:23:43.676,1,2015-05-10 03:00:00,,,...,,,,,,,,,4,0.960789


In [19]:
# وزن eventها
w = {'view': 1, 'addtocart': 3, 'transaction': 5}
train['weight'] = train['event'].map(w)
test['weight'] = test['event'].map(w)

# mapping user/item → index
user_ids = train['visitorid'].unique()
item_ids = train['itemid'].unique()  # فقط آیتم‌های train

user_map = {u:i for i,u in enumerate(user_ids)}
item_map = {i:j for j,i in enumerate(item_ids)}

train['user_idx'] = train['visitorid'].map(user_map).astype(int)
train['item_idx'] = train['itemid'].map(item_map).astype(int)

test['user_idx'] = test['visitorid'].map(user_map)
test['item_idx'] = test['itemid'].map(item_map)

# حذف کاربران یا آیتم‌های unseen در test
test = test[test['user_idx'].notna() & test['item_idx'].notna()].copy()
test['user_idx'] = test['user_idx'].astype(int)
test['item_idx'] = test['item_idx'].astype(int)

# sparse matrix
rows = train['user_idx']
cols = train['item_idx']
vals = train['weight']

from scipy.sparse import csr_matrix
R_train = csr_matrix((vals, (rows, cols)), shape=(len(user_ids), len(item_ids)))


In [20]:
from scipy.sparse import csr_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics.pairwise import cosine_similarity

n_factors = 50
svd = TruncatedSVD(n_components=n_factors, random_state=42)
svd.fit(R_train)

# user و item embeddings
user_emb = svd.transform(R_train)       
item_emb = svd.components_.T          


In [21]:
pred_matrix = np.dot(user_emb, item_emb.T) 


In [22]:
def recommend_top_k(user_idx, k=10):
    scores = pred_matrix[user_idx]
    top_items_idx = np.argsort(scores)[::-1][:k]  # رتبه‌بندی نزولی
    top_items = [item_ids[i] for i in top_items_idx]
    return top_items

# مثال برای کاربر اول در train
print("Top-10 recommended items:", recommend_top_k(0))


Top-10 recommended items: [np.int64(318333), np.int64(14758), np.int64(250962), np.int64(418408), np.int64(259962), np.int64(215522), np.int64(117677), np.int64(126214), np.int64(397692), np.int64(271930)]


In [23]:
def hit_rate_at_k(k=10):
    hits = 0
    total = 0
    for _, row in test.iterrows():
        user_idx = int(row['user_idx'])
        true_item = int(row['item_idx'])
        rec_items_idx = np.argsort(pred_matrix[user_idx])[::-1][:k]
        if true_item in rec_items_idx:
            hits += 1
        total += 1
    return hits / total


print("Hit Rate @ 10:", hit_rate_at_k(10))


Hit Rate @ 10: 0.03755782809836864


In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics import ndcg_score, precision_score, recall_score

def evaluate_recommender_sklearn(pred_matrix, test, k=10):
    """
    محاسبه معیارهای recommendation با توابع sklearn
    pred_matrix: np.array(user_count, item_count) - نمرات پیش‌بینی
    test: dataframe با ستون user_idx و item_idx
    """
    n_users, n_items = pred_matrix.shape
    
    # ساخت ground truth matrix
    y_true = np.zeros((n_users, n_items))
    for _, row in test.iterrows():
        u = int(row['user_idx'])
        i = int(row['item_idx'])
        y_true[u, i] = 1

    # رتبه بندی top-k
    y_score = pred_matrix.copy()
    
    # HitRate@k
    hits = 0
    for u in range(n_users):
        top_k_idx = np.argsort(y_score[u])[::-1][:k]
        if np.any(y_true[u, top_k_idx] > 0):
            hits += 1
    hr = hits / n_users
    
    # NDCG@k
    ndcg = ndcg_score(y_true, y_score, k=k)
    
    # Precision@k & Recall@k
    precision_list = []
    recall_list = []
    for u in range(n_users):
        top_k_idx = np.argsort(y_score[u])[::-1][:k]
        pred_bin = np.zeros(n_items)
        pred_bin[top_k_idx] = 1
        precision_list.append(precision_score(y_true[u], pred_bin))
        recall_list.append(recall_score(y_true[u], pred_bin))
    precision = np.mean(precision_list)
    recall = np.mean(recall_list)
    
    # MRR@k
    mrr_list = []
    for u in range(n_users):
        top_k_idx = np.argsort(y_score[u])[::-1][:k]
        ranks = np.where(y_true[u, top_k_idx] > 0)[0]
        if len(ranks) > 0:
            mrr_list.append(1 / (ranks[0] + 1))
        else:
            mrr_list.append(0)
    mrr = np.mean(mrr_list)
    
    # Coverage
    recommended_items = set()
    for u in range(n_users):
        top_k_idx = np.argsort(y_score[u])[::-1][:k]
        recommended_items.update(top_k_idx)
    coverage = len(recommended_items) / n_items
    
    print(f"HitRate@{k}: {hr:.4f}")
    print(f"MRR@{k}: {mrr:.4f}")
    print(f"Precision@{k}: {precision:.4f}")
    print(f"Recall@{k}: {recall:.4f}")
    print(f"NDCG@{k}: {ndcg:.4f}")
    print(f"Coverage: {coverage:.4f}")
    
    return {'HR': hr, 'MRR': mrr, 'Precision': precision, 'Recall': recall,
            'NDCG': ndcg, 'Coverage': coverage}

# مثال استفاده:
metrics = evaluate_recommender_sklearn(pred_matrix, test, k=10)


MemoryError: Unable to allocate 30.8 GiB for an array with shape (73357, 56407) and data type float64

In [None]:
data = merged.copy()


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb

# -------------------------
# 1. آماده‌سازی داده‌ها
# -------------------------
data = merged.copy()

# وزن eventها برای نمونه‌ها
w = {'view': 1, 'addtocart': 3, 'transaction': 5}
data['weight'] = data['event'].map(w)

# مرتب‌سازی بر اساس زمان
data = data.sort_values('timestamp')

# تقسیم train/test بر اساس زمان
train_frac = 0.8
split_idx = int(len(data)*train_frac)
train = data.iloc[:split_idx].copy()
test = data.iloc[split_idx:].copy()

# -------------------------
# 2. ساخت ویژگی‌ها پیشرفته
# -------------------------
# ویژگی‌های کاربر
user_stats = train.groupby('visitorid')['event'].value_counts().unstack(fill_value=0)
user_stats['user_total'] = user_stats.sum(axis=1)
user_stats['user_view_ratio'] = user_stats['view'] / (user_stats['user_total']+1e-8)
user_stats['user_addtocart_ratio'] = user_stats['addtocart'] / (user_stats['user_total']+1e-8)
user_stats['user_transaction_ratio'] = user_stats['transaction'] / (user_stats['user_total']+1e-8)

# ویژگی‌های آیتم
item_stats = train.groupby('itemid')['event'].value_counts().unstack(fill_value=0)
item_stats['item_total'] = item_stats.sum(axis=1)
item_stats['item_view_ratio'] = item_stats['view'] / (item_stats['item_total']+1e-8)
item_stats['item_addtocart_ratio'] = item_stats['addtocart'] / (item_stats['item_total']+1e-8)
item_stats['item_transaction_ratio'] = item_stats['transaction'] / (item_stats['item_total']+1e-8)

def make_features(df):
    df = df.copy()
    df['user_views'] = df['visitorid'].map(user_stats.get('view', pd.Series(0)))
    df['user_addtocart'] = df['visitorid'].map(user_stats.get('addtocart', pd.Series(0)))
    df['user_transaction'] = df['visitorid'].map(user_stats.get('transaction', pd.Series(0)))
    df['user_total'] = df['visitorid'].map(user_stats.get('user_total', pd.Series(0)))
    df['user_view_ratio'] = df['visitorid'].map(user_stats.get('user_view_ratio', pd.Series(0)))
    df['user_addtocart_ratio'] = df['visitorid'].map(user_stats.get('user_addtocart_ratio', pd.Series(0)))
    df['user_transaction_ratio'] = df['visitorid'].map(user_stats.get('user_transaction_ratio', pd.Series(0)))

    df['item_views'] = df['itemid'].map(item_stats.get('view', pd.Series(0)))
    df['item_addtocart'] = df['itemid'].map(item_stats.get('addtocart', pd.Series(0)))
    df['item_transaction'] = df['itemid'].map(item_stats.get('transaction', pd.Series(0)))
    df['item_total'] = df['itemid'].map(item_stats.get('item_total', pd.Series(0)))
    df['item_view_ratio'] = df['itemid'].map(item_stats.get('item_view_ratio', pd.Series(0)))
    df['item_addtocart_ratio'] = df['itemid'].map(item_stats.get('item_addtocart_ratio', pd.Series(0)))
    df['item_transaction_ratio'] = df['itemid'].map(item_stats.get('item_transaction_ratio', pd.Series(0)))
    
    # تعاملات کاربر با آیتم
    df['user_item_interactions'] = df.groupby(['visitorid','itemid']).cumcount()
    
    # target باینری
    df['target'] = (df['event']=='transaction').astype(int)
    
    return df

train_feat = make_features(train)
test_feat = make_features(test)

# -------------------------
# 3. Negative sampling واقعی
# -------------------------
# نمونه‌های منفی: تمام non-transaction ها
positive_samples = train_feat[train_feat['target']==1].copy()
negative_samples = train_feat[train_feat['target']==0].copy()
neg_sample_size = len(positive_samples)*3
negative_samples = negative_samples.sample(n=neg_sample_size, random_state=42)
train_final = pd.concat([positive_samples, negative_samples]).sample(frac=1, random_state=42)

features = ['user_views','user_addtocart','user_transaction','user_total',
            'user_view_ratio','user_addtocart_ratio','user_transaction_ratio',
            'item_views','item_addtocart','item_transaction','item_total',
            'item_view_ratio','item_addtocart_ratio','item_transaction_ratio',
            'user_item_interactions']

X_train = train_final[features]
y_train = train_final['target']
X_test = test_feat[features]
y_test = test_feat['target']

# -------------------------
# 4. آموزش XGBoost با weight
# -------------------------
sample_weight = train_final['weight']  # وزن eventها برای تقویت transaction
model = xgb.XGBClassifier(
    n_estimators=300,
    max_depth=6,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss'
)
model.fit(X_train, y_train, sample_weight=sample_weight)

# -------------------------
# 5. پیش‌بینی احتمال خرید
# -------------------------
test_feat['score'] = model.predict_proba(X_test)[:,1]

# -------------------------
# 6. Top-k prediction بدون هشدار pandas
# -------------------------
top_k = 10
topk_pred = test_feat.groupby('visitorid', group_keys=False).apply(lambda x: x.nlargest(top_k, 'score'))

# -------------------------
# 7. محاسبه معیارهای کامل
# -------------------------
def evaluate_topk_full(topk_pred, test_df, k=10):
    hits, precisions, recalls, ndcgs, rr_list = [], [], [], [], []
    recommended_items_set = set()
    
    user_true = test_df.groupby('visitorid')['itemid'].apply(list).to_dict()
    
    for user, group in topk_pred.groupby('visitorid'):
        topk_items = group['itemid'].tolist()
        true_items = user_true.get(user, [])
        recommended_items_set.update(topk_items)
        
        hits.append(int(len(set(topk_items) & set(true_items))>0))
        precisions.append(len(set(topk_items) & set(true_items))/k)
        recalls.append(len(set(topk_items) & set(true_items))/len(true_items) if len(true_items)>0 else 0)
        
        dcg = sum([1/np.log2(i+2) for i,item in enumerate(topk_items) if item in true_items])
        idcg = sum([1/np.log2(i+2) for i in range(min(len(true_items), k))])
        ndcgs.append(dcg/idcg if idcg>0 else 0)
        
        rr = 0
        for i, item in enumerate(topk_items):
            if item in true_items:
                rr = 1/(i+1)
                break
        rr_list.append(rr)
    
    coverage = len(recommended_items_set)/test_df['itemid'].nunique()
    
    metrics = {
        'HitRate@{}'.format(k): np.mean(hits),
        'Precision@{}'.format(k): np.mean(precisions),
        'Recall@{}'.format(k): np.mean(recalls),
        'NDCG@{}'.format(k): np.mean(ndcgs),
        'MRR@{}'.format(k): np.mean(rr_list),
        'Coverage': coverage
    }
    return metrics

metrics_full = evaluate_topk_full(topk_pred, test_feat, k=10)
print("Improved Full evaluation on test data (XGBoost):")
print(metrics_full)


Improved Full evaluation on test data (XGBoost):
{'HitRate@10': np.float64(1.0), 'Precision@10': np.float64(0.12974939369442198), 'Recall@10': np.float64(0.9396460747788239), 'NDCG@10': np.float64(1.0), 'MRR@10': np.float64(1.0), 'Coverage': 0.9489993544222078}


  topk_pred = test_feat.groupby('visitorid', group_keys=False).apply(lambda x: x.nlargest(top_k, 'score'))
