In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.algorithms import TARSItemKNN, ItemKNN
from recpack.scenarios import Timed
from recpack.matrix import InteractionMatrix

# import utils file from previous lecture
import sys
sys.path.append('../lecture4')
from utils import DATA_PATH
from evaluation import apk

# This file builds on the code in https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03a_Basic_Model_Local_Validation.ipynb

In [2]:
sample = False

if sample:
    transactions = pd.read_parquet(f'{DATA_PATH}/transactions_train_sample_0.05.parquet')
    customers = pd.read_parquet(f'{DATA_PATH}/customers_sample_0.05.parquet')
    articles = pd.read_parquet(f'{DATA_PATH}/articles_train_sample_0.05.parquet')
else:
    transactions = pd.read_parquet(f'{DATA_PATH}/transactions_train.parquet')
    customers = pd.read_parquet(f'{DATA_PATH}/customers.parquet')
    articles = pd.read_parquet(f'{DATA_PATH}/articles.parquet')

In [3]:
test_week = transactions.week.max()
transactions = transactions[transactions.week != transactions.week.max()]
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [4]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 8.06 s, sys: 148 ms, total: 8.21 s
Wall time: 8.25 s


In [5]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 244 ms, sys: 24.5 ms, total: 268 ms
Wall time: 268 ms


In [6]:
candidates_last_purchase = transactions.copy()

In [7]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 9.06 s, sys: 192 ms, total: 9.25 s
Wall time: 9.25 s


### Bestsellers candidates

In [8]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [9]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [10]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [11]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [12]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [13]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [14]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)
candidates_bestsellers_test_week.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,bestseller_rank,price
0,2020-07-08,857913002275398,1,104,909370001,1,0.032947
1,2020-07-08,857913002275398,1,104,865799006,2,0.03334
2,2020-07-08,857913002275398,1,104,918522001,3,0.041416
3,2020-07-08,857913002275398,1,104,924243001,4,0.041549
4,2020-07-08,857913002275398,1,104,448509014,5,0.041604


In [15]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)
candidates_bestsellers.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,272412481300040,1,95,806388001,0.013301
1,2020-07-15,272412481300040,1,95,730683021,0.025643
2,2020-07-15,272412481300040,1,95,610776002,0.008303
3,2020-07-15,272412481300040,1,95,805308002,0.013609
4,2020-07-15,272412481300040,1,95,866383006,0.024971


### K most similar items

In [16]:
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='week')
proc.add_filter(MinUsersPerItem(10, item_ix='article_id', user_ix='customer_id'))
proc.add_filter(MinItemsPerUser(10, item_ix='article_id', user_ix='customer_id'))

transaction_matrix = proc.process(transactions)

  0%|          | 0/1264753 [00:00<?, ?it/s]

  0%|          | 0/1264753 [00:00<?, ?it/s]

In [17]:
knn = TARSItemKNN(K=580, fit_decay=0.1, predict_decay=1/3, similarity='cosine')
# knn = ItemKNN(K=90,normalize_X=False,normalize_sim=True,pop_discount=None,similarity='cosine') 
knn.fit(transaction_matrix)

2022-11-27 12:24:52,750 - base - recpack - INFO - Fitting TARSItemKNN complete - Took 9.96s


In [18]:
prediction_matrix = knn.predict(transaction_matrix)

In [19]:
def not_bought_by_user(interaction_matrix: csr_matrix) -> csr_matrix:
    """given a csr_matrix, set all values > 0 to 0, and all values == 0 to 1
    """
    return csr_matrix(np.ones(interaction_matrix.shape) - (interaction_matrix >= 1), dtype=int)

In [20]:
# only keep predictions for items that the user has not bought yet
not_bought_matrix = not_bought_by_user(transaction_matrix.values)

In [21]:
similar_unbought_items_matrix = prediction_matrix.multiply(not_bought_matrix)

In [22]:
def top_n_idx_sparse(matrix: csr_matrix, n: int):
    """Return index of top n values in each row of a sparse matrix.
    source: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
    """
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        top_n_idx.append(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]].tolist())
    
    # Get the values corresponding to the indices
    top_n_values = []
    for row_idx, col_idxs in enumerate(top_n_idx):
        top_n_values.append(matrix[row_idx, col_idxs].toarray().tolist()[0])
        assert(len(top_n_values[row_idx]) == len(top_n_idx[row_idx]))
    return top_n_idx, top_n_values

def get_top_k_similar_articles_per_user(prediction_matrix: csr_matrix, interaction_matrix: InteractionMatrix, k: int) -> pd.DataFrame:
    """given a prediction matrix and a transaction matrix, return a dataframe with the top k similar articles per user
    """
    # use interaction_matrix._df to map back to original customer and article ids
    uid_cid_map = interaction_matrix._df[["uid", "customer_id"]].drop_duplicates().set_index("uid").to_dict()["customer_id"]
    iid_aid_map = interaction_matrix._df[["iid", "article_id"]].drop_duplicates().set_index("iid").to_dict()["article_id"]

    # get column indices of top k articles per user
    top_k_idx, top_k_values = top_n_idx_sparse(prediction_matrix, k)

    similar_customers = []
    similar_articles = []
    similarity_scores = []

    for i, row in enumerate(top_k_idx):
        user_predictions = [iid_aid_map[iid] for iid in row]
        similar_customers.extend([uid_cid_map[i]] * len(user_predictions))
        similar_articles.extend(user_predictions)
        similarity_scores.extend(top_k_values[i])

        
    assert len(similar_customers) == len(similar_articles) == len(similarity_scores), "lengths of lists should be equal"
    return pd.DataFrame({"customer_id": similar_customers, "article_id": similar_articles, "similarity": similarity_scores})

In [23]:
similar_items = get_top_k_similar_articles_per_user(prediction_matrix, transaction_matrix, k=12)

In [24]:
test_set_transactions.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-08,857913002275398,1,104
1,2020-07-08,1658289241058394,1,104
2,2020-07-08,3828854365940846,1,104
3,2020-07-08,4195624216542755,1,104
4,2020-07-08,4233235614030232,2,104


In [25]:
candidates_similar_items = pd.merge(
    similar_items,
    test_set_transactions,
    on='customer_id',
    how='left'
)

candidates_similar_items.drop(columns='similarity', inplace=True)
candidates_similar_items.head()

Unnamed: 0,customer_id,article_id,t_dat,sales_channel_id,week
0,857913002275398,599580038,2020-07-08,1,104
1,857913002275398,730863043,2020-07-08,1,104
2,857913002275398,636207014,2020-07-08,1,104
3,857913002275398,720504010,2020-07-08,1,104
4,857913002275398,865595002,2020-07-08,1,104


In [26]:
# get the last price of all article_id's
last_price = transactions \
    .groupby(['article_id', 'week']) \
    .price \
    .last() \
    .reset_index() \
    .groupby('article_id') \
    .price \
    .last() \
    .reset_index()
    
last_price.head()

Unnamed: 0,article_id,price
0,108775015,0.002068
1,108775044,0.008458
2,110065001,0.006763
3,110065002,0.005186
4,110065011,0.011847


In [27]:
candidates_similar_items = pd.merge(
    candidates_similar_items,
    last_price,
    on='article_id',
    how='left'
)

candidates_similar_items.head()

Unnamed: 0,customer_id,article_id,t_dat,sales_channel_id,week,price
0,857913002275398,599580038,2020-07-08,1,104,0.016932
1,857913002275398,730863043,2020-07-08,1,104,0.049932
2,857913002275398,636207014,2020-07-08,1,104,0.033881
3,857913002275398,720504010,2020-07-08,1,104,0.033881
4,857913002275398,865595002,2020-07-08,1,104,0.015356


### Time-weighted popularity candidates

In [28]:
popularity = transactions\
    .groupby(['article_id', 'week']).size().reset_index(name='weekly_purchase_count')\

popularity.head()

Unnamed: 0,article_id,week,weekly_purchase_count
0,108775015,95,2
1,108775015,96,1
2,108775044,94,17
3,108775044,95,3
4,108775044,96,8


In [29]:
weekly_popularity = []
def func(row):
    weeks_before = popularity[(row.article_id == popularity.article_id) & (row.week > popularity.week)]
    # get last row of weeks_before
    previous_week_popularity = 0
    if weeks_before.shape[0] > 0:
        previous_week_popularity = weekly_popularity[-1]
    return previous_week_popularity / 2.0 + float(row.weekly_purchase_count)

In [30]:
# iterate over all rows
for i, row in popularity.iterrows():
    weekly_popularity.append(func(row))
popularity['weekly_popularity'] = weekly_popularity

In [31]:
popular_articles_per_week = popularity.sort_values(['week', 'weekly_popularity'], ascending=False)\
    .groupby('week').head(20).reset_index(drop=True)

In [32]:
popular_articles_previous_week = pd.merge(popular_articles_per_week, mean_price, on=['week', 'article_id']).reset_index(drop=True)
# make a new column to rank the weekly_popularity
popular_articles_previous_week['last_week_popularity_rank'] = popular_articles_previous_week.groupby('week')['weekly_popularity'].rank(ascending=False).astype(np.int32)
popular_articles_previous_week.week += 1

In [33]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [34]:
candidates_most_popular = pd.merge(
    unique_transactions,
    popular_articles_previous_week,
    on='week',
)

In [35]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [36]:
candidates_most_popular_test_week = pd.merge(
    test_set_transactions,
    popular_articles_previous_week,
    on='week'
)
candidates_most_popular_test_week.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,weekly_purchase_count,weekly_popularity,price,last_week_popularity_rank
0,2020-07-08,857913002275398,1,104,909370001,1283,1307.0,0.032947,1
1,2020-07-08,857913002275398,1,104,751471001,607,1295.238281,0.033268,2
2,2020-07-08,857913002275398,1,104,918292001,546,1104.414062,0.041424,3
3,2020-07-08,857913002275398,1,104,915526001,443,1090.375,0.033136,4
4,2020-07-08,857913002275398,1,104,706016001,453,1083.681641,0.033259,5


In [37]:
candidates_most_popular = pd.concat([candidates_most_popular, candidates_most_popular_test_week])
candidates_most_popular.drop(columns=['weekly_purchase_count', 'weekly_popularity', 'last_week_popularity_rank'], inplace=True)
candidates_most_popular.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,272412481300040,1,95,806388001,0.013301
1,2020-07-15,272412481300040,1,95,730683021,0.025643
2,2020-07-15,272412481300040,1,95,610776002,0.008303
3,2020-07-15,272412481300040,1,95,805308002,0.013609
4,2020-07-15,272412481300040,1,95,866383006,0.024971


# Combining transactions and candidates / negative examples

In [38]:
transactions['purchased'] = 1

In [39]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers, candidates_most_popular])
data.purchased.fillna(0, inplace=True)

data.purchased.mean()

0.06744294605535182

In [40]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

### Add bestseller information

In [41]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [42]:
data = data[data.week != data.week.min()]

### Add item similarity information

In [43]:
data = pd.merge(
    data, 
    similar_items[['customer_id', 'article_id', 'similarity']], 
    on=['customer_id', 'article_id'], 
    how='left'
)

### Add item popularity information

In [44]:
data = pd.merge(
    data,
    popular_articles_previous_week[['week', 'article_id', 'weekly_purchase_count', 'weekly_popularity', 'last_week_popularity_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [45]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

data['weekly_purchase_count'].fillna(0, inplace=True)
data['weekly_popularity'].fillna(0, inplace=True)
data['similarity'].fillna(data['purchased'], inplace=True)
data['bestseller_rank'].fillna(data.bestseller_rank.max() + 1, inplace=True)
data['last_week_popularity_rank'].fillna(data.last_week_popularity_rank.max() + 1, inplace=True)
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased,bestseller_rank,similarity,weekly_purchase_count,...,section_name,garment_group_no,garment_group_name,detail_desc,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0,12.0,1.0,350.0,...,30,1002,2,309,1,1,0,1,48,333369
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0,13.0,1.0,0.0,...,1,1005,0,14172,1,1,0,1,48,333369
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0,13.0,1.0,0.0,...,1,1009,5,388,1,1,0,1,48,333369
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0,13.0,1.0,0.0,...,0,1025,16,4602,1,1,0,1,48,333369
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0,13.0,1.0,0.0,...,1,1012,18,26119,1,1,0,1,48,333369


### preprocessing

In [46]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
scaler = StandardScaler()
columns_to_scale = ['product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'last_week_popularity_rank', 'similarity']
data[columns_to_scale] = scaler.fit_transform(data[columns_to_scale])

In [47]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [48]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [49]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [50]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'last_week_popularity_rank', 'similarity']

In [51]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 331 ms, sys: 1.35 s, total: 1.68 s
Wall time: 2.46 s


In [52]:
train_y.head()

0    1.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: purchased, dtype: float64

# Model training

In [53]:
from lightgbm.sklearn import LGBMRanker

In [54]:
ranker = LGBMRanker(
    objective="lambdarank",
    num_leaves=200,
    metric="ndcg",
    boosting_type="dart",
    n_estimators=100,
    importance_type='gain',
    verbose=10,
)

In [55]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.861320
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.135998
[LightGBM] [Debug] init for col-wise cost 0.282695 seconds, init for row-wise cost 0.369977 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1384
[LightGBM] [Info] Number of data points in the train set: 18511355, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 25
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 20
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 22
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 21
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 19
[LightGBM] [Debug] Trained a tree with leave

In [56]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

similarity 0.9958198118462323
last_week_popularity_rank 0.0012963570821060701
article_id 0.0006485888334290512
postal_code 0.00041835886509345956
department_no 0.000319005900066112
age 0.0002890126308965154
product_type_no 0.0002865680608572314
colour_group_code 0.00020152152871903025
section_no 0.00014035838326053942
graphical_appearance_no 0.0001373528446523842
garment_group_no 0.0001321620491119065
perceived_colour_master_id 0.00011260027395852463
perceived_colour_value_id 8.671319529346187e-05
index_code 4.8447382664786874e-05
index_group_no 2.257590173382204e-05
Active 1.455980387376809e-05
FN 1.436058388091965e-05
fashion_news_frequency 8.329390776861297e-06
club_member_status 3.3154433933105026e-06


# Calculate predictions

In [57]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 1e+03 ns, sys: 0 ns, total: 1e+03 ns
Wall time: 2.86 µs


# Evaluate results

In [58]:
if sample:
    val_week_purchases_by_cust = pd.read_pickle(f'{DATA_PATH}/val_week_purchases_by_cust_sample_0.05.pkl')
else:
    val_week_purchases_by_cust = pd.read_pickle(f'{DATA_PATH}/val_week_purchases_by_cust.pkl')

In [59]:
"""
[transactions, candidates_last_purchase, candidates_similar_unbought_items]: 0.022571337170999257
[transactions, candidates_last_purchase, candidates_similar_unbought_items]: 0.022603068026213192 (preprocessing)
[transactions, candidates_last_purchase, candidates_similar_unbought_items]: 0.022460058520452002 (preprocessing, 'weekly_purchase_count', 'weekly_popularity', no 'bestseller_rank')
[transactions, candidates_last_purchase, candidates_similar_unbought_items, candidates_most_popular]: 0.010322612374701558
no
"""
apks = []

for c_id, gt in val_week_purchases_by_cust.items():
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    apks.append(apk(gt, pred[:12]))

np.mean(apks) # 0.024427225394874404

0.023125142916277073