In [1]:
import pandas as pd
import numpy as np
# import utils file from previous lecture
import sys
sys.path.append('../lecture4')
from utils import DATA_PATH
from evaluation import apk

# This file builds on the code in https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03a_Basic_Model_Local_Validation.ipynb

In [2]:
# Read the three source tables from DATA_PATH. The tuple unpacking consumes
# the `map` left to right, so the files are read in the order listed.
transactions, customers, articles = map(
    pd.read_parquet,
    (
        f'{DATA_PATH}/transactions_train.parquet',
        f'{DATA_PATH}/customers.parquet',
        f'{DATA_PATH}/articles.parquet',
    ),
)

In [3]:
# The most recent week in the data is held out as the validation ("test") week.
test_week = transactions.week.max()

# Remove the held-out week, then keep only rows whose week is greater than
# (latest remaining week - 10).
# NOTE: `transactions` is overwritten here, so re-running this cell without
# reloading the data first would select a different `test_week`.
transactions = (
    transactions
    .loc[transactions.week != test_week]
    .loc[lambda df: df.week > df.week.max() - 10]
)

# Generating candidates

### Last purchase candidates

In [4]:
%%time

# For each customer, the distinct weeks that appear in their transactions
# (a Series indexed by customer_id whose values are arrays of weeks).
c2weeks = transactions['week'].groupby(transactions['customer_id']).unique()

CPU times: user 7.93 s, sys: 93.7 ms, total: 8.02 s
Wall time: 8 s


In [5]:
%%time

# customer_id -> {week: the next entry in that customer's week list}.
# The final entry of each customer's list has no successor, so it is mapped
# to `test_week` instead.
c2weeks2shifted_weeks = {}

for customer, active_weeks in c2weeks.items():
    # Pair every week with the one that follows it in the customer's list.
    following_week = dict(zip(active_weeks[:-1], active_weeks[1:]))
    following_week[active_weeks[-1]] = test_week
    c2weeks2shifted_weeks[customer] = following_week

CPU times: user 270 ms, sys: 24.8 ms, total: 295 ms
Wall time: 297 ms


In [6]:
# Work on an independent (deep) copy so that rewriting its `week` column
# later leaves `transactions` untouched.
candidates_last_purchase = transactions.copy(deep=True)

In [7]:
%%time

# For every transaction row, look up the shifted week recorded for that
# (customer, week) pair in `c2weeks2shifted_weeks`.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

# Each copied transaction becomes a candidate in its shifted week.
candidates_last_purchase['week'] = weeks

CPU times: user 8.96 s, sys: 72 ms, total: 9.03 s
Wall time: 9.03 s


### Bestsellers candidates

In [8]:
# Mean transaction price of each article within each week
# (a Series indexed by week and article_id).
mean_price = (
    transactions
    .groupby(by=['week', 'article_id'])['price']
    .mean()
)

In [9]:
# Weekly bestseller ranking: count rows per (week, article), dense-rank the
# counts within each week (rank 1 = highest count), keep the first 12 rows of
# each week and store the rank as int8.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [10]:
# Attach the mean price to each ranked bestseller, then move every row one
# week forward so that week W carries the bestsellers computed from week W-1.
bestsellers_previous_week = (
    pd.merge(sales, mean_price, on=['week', 'article_id'])
    .reset_index()
    .assign(week=lambda df: df['week'] + 1)
)
bestsellers_previous_week.head()

Unnamed: 0,week,article_id,bestseller_rank,price
0,95,806388001,1,0.013301
1,95,730683021,2,0.025643
2,95,610776002,3,0.008303
3,95,805308002,4,0.013609
4,95,866383006,5,0.024971


In [45]:
# One row per (week, customer): take the first transaction of each pair and
# discard the article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [46]:
# Pair every (week, customer) row with all bestseller rows that share its
# week; the default inner join drops weeks that have no bestseller rows.
candidates_bestsellers = unique_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [47]:
# One row per customer seen in `unique_transactions`, with the week
# overwritten to the held-out `test_week`.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
    .assign(week=test_week)
)

In [48]:
# Bestseller candidates for the held-out week: every customer row is paired
# with the bestseller rows whose week equals `test_week`.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [49]:
# Stack the training-week and test-week bestseller candidates, then drop the
# rank column from the combined frame.
# NOTE: this cell reads and rebinds `candidates_bestsellers`, so it is meant
# to run exactly once after the cell that creates that frame.
candidates_bestsellers = (
    pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
    .drop(columns='bestseller_rank')
)

# Combining transactions and candidates / negative examples

In [50]:
# Every real transaction is a positive example.
transactions = transactions.assign(purchased=1)

In [51]:
# Stack the positives (`transactions`, which carry purchased == 1) on top of
# both candidate sets. The candidate frames have no `purchased` column, so
# their rows come out of the concat as NaN and are labelled 0 (negatives).
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])

# Assign the filled column back explicitly. `data.purchased.fillna(0, inplace=True)`
# is an inplace call on a chained column access: it raises a FutureWarning in
# pandas 2.2 and does not modify `data` at all under Copy-on-Write / pandas 3.
data['purchased'] = data['purchased'].fillna(0)

# Share of positive rows before de-duplication.
data.purchased.mean()

0.14683737277719777

In [52]:
# Keep a single row per (customer, article, week). The default keep='first'
# retains the earliest row in concat order.
data = data.drop_duplicates(['customer_id', 'article_id', 'week'])

### Add bestseller information

In [53]:
# Left-join the bestseller rank on (week, article_id); rows without a match
# get NaN in `bestseller_rank`.
data = data.merge(
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left',
)

In [54]:
# Drop the earliest week present in `data`, then give every row without a
# bestseller rank the sentinel value 999.
#
# The fill is done through `assign`, which returns a new frame. The previous
# `data.bestseller_rank.fillna(999, inplace=True)` was an inplace call on a
# column of a filtered frame: it raises a FutureWarning in pandas 2.2 and has
# no effect under Copy-on-Write / pandas 3, which would leave NaN ranks.
#
# NOTE: this cell reads and rebinds `data`, so running it twice would drop a
# second week.
data = (
    data[data.week != data.week.min()]
    .assign(bestseller_rank=lambda df: df['bestseller_rank'].fillna(999))
)

In [55]:
# Enrich each row with the article attributes and then the customer
# attributes (both left joins, so no row of `data` is dropped).
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [56]:
# Order rows by week, then customer, so that each (week, customer) group is
# contiguous, and renumber the index from 0.
data = (
    data
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)

In [57]:
# Everything outside the held-out week is training data.
train = data.loc[data['week'] != test_week]

# The held-out week becomes the test set, de-duplicated per
# (customer, article, sales channel) and copied so it can be modified later.
test = (
    data.loc[data['week'] == test_week]
    .drop_duplicates(subset=['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [58]:
# Number of rows in each (week, customer) group, as a plain NumPy array.
# The groups come out in sorted (week, customer_id) order.
train_baskets = (
    train
    .groupby(['week', 'customer_id'])['article_id']
    .count()
    .to_numpy()
)

In [59]:
# Feature columns fed to the ranker, in the order the model sees them.
columns_to_use = [
    # article columns
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    # customer columns
    'FN',
    'Active',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    # candidate column
    'bestseller_rank',
]

In [60]:
%%time

# Feature matrix and label vector for training.
train_X = train.loc[:, columns_to_use]
train_y = train.loc[:, 'purchased']

# Feature matrix for the held-out week (no labels are taken here).
test_X = test.loc[:, columns_to_use]

CPU times: user 107 ms, sys: 141 ms, total: 248 ms
Wall time: 247 ms


In [61]:
# Peek at the first five training labels.
train_y.head(n=5)

0    1.0
1    1.0
2    0.0
3    0.0
4    0.0
Name: purchased, dtype: float64

# Model training

In [62]:
from lightgbm.sklearn import LGBMRanker

In [63]:
# LightGBM ranker configuration. All values are the ones used for the
# results shown in this notebook.
ranker = LGBMRanker(
    # learning task
    objective="lambdarank",
    metric="ndcg",
    # booster shape
    boosting_type="dart",
    n_estimators=100,
    num_leaves=200,
    # reporting: feature_importances_ are gain-based; verbose=10 produces the
    # [Debug]-level log lines seen during fitting
    importance_type='gain',
    verbose=10,
)

In [64]:
%%time

# Fit the ranker. `group` holds the row count of each (week, customer) group.
# NOTE(review): this assumes the rows of train_X are ordered so that each
# group is contiguous and in the same order as `train_baskets` — confirm.
ranker = ranker.fit(
    X=train_X,
    y=train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.844955
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.151118
[LightGBM] [Debug] init for col-wise cost 0.049665 seconds, init for row-wise cost 0.199860 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1149
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 16
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 18
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 17
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 19
[LightGBM] [Debug] Trained a tree with leaves = 200 and depth = 17
[LightGBM] [Debug] Trained a tree with leave

In [65]:
# Print each feature with its share of the total importance, most important
# first.
importances = ranker.feature_importances_
total_importance = importances.sum()

for col_idx in importances.argsort()[::-1]:
    print(columns_to_use[col_idx], importances[col_idx] / total_importance)

bestseller_rank 0.9376861493387726
article_id 0.02177341244641252
product_type_no 0.01025941808456561
department_no 0.007932840748007937
garment_group_no 0.004583792064670282
colour_group_code 0.0040037332457556534
graphical_appearance_no 0.002800727487572376
section_no 0.0024967487904410174
perceived_colour_value_id 0.0023000001772138087
perceived_colour_master_id 0.002241744761084165
age 0.0014257376163587592
index_code 0.0008015751875014689
index_group_no 0.000561076141218755
club_member_status 0.0005071431459506904
postal_code 0.00041600993534005054
Active 0.0001224149128857914
fashion_news_frequency 6.177883894921548e-05
FN 2.569707729928902e-05


# Calculate predictions

In [66]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 1.91 µs


# Evaluate results

In [67]:
# Ground-truth purchases for the validation week, read from DATA_PATH.
# NOTE(review): unpickling executes code stored in the file; only load
# pickles that come from a trusted source.
val_week_purchases_by_cust = pd.read_pickle(
    filepath_or_buffer=f'{DATA_PATH}/val_week_purchases_by_cust.pkl'
)

In [68]:
# Score each customer in the validation data with `apk` (imported from
# `evaluation`; presumably average precision at k — confirm). The prediction
# list is the customer's ranked candidates (empty if the customer has none)
# followed by last week's bestsellers, cut to the first 12 entries.
apks = [
    apk(gt, (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12])
    for c_id, gt in val_week_purchases_by_cust.items()
]

# NOTE(review): this cell previously carried the inline figure
# 0.023166716930571193, which differs from the displayed output — confirm
# which run it refers to.
np.mean(apks)

0.02376389969599154