In [1]:
import numpy as np
import pandas as pd
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.algorithms import KUNN
from utils import DATA_PATH, customer_hex_id_to_int
# This file builds on the code in https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
transactions = pd.read_parquet(f'{DATA_PATH}/transactions_train.parquet')
customers = pd.read_parquet(f'{DATA_PATH}/customers.parquet')
articles = pd.read_parquet(f'{DATA_PATH}/articles.parquet')

In [3]:
test_week = transactions.week.max() + 1
unused_transactions = transactions[(transactions.week > transactions.week.max() - 20) & (transactions.week <= transactions.week.max() - 10)][['article_id', 'customer_id', 'week']]
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Make new features

## Member status (combine `Active` and `FN`)

In [4]:
"""
A   FN      Result
-1  -1      -3
-1   1       1
 1  -1      -1    
 1   1       3
------------------
Result      Meaning             Mapping
-3          NotActive_NoFN      0
 1          NotActive_FN        1
-1          Active_NoFN         2
 3          Active_FN           3
"""
customers['member_status'] = (customers.Active + customers.FN * 2)\
    .replace({-3: 0, -1: 2, 1: 1, 3: 3})

## Time-weighted article popularity

In [5]:
last_date = transactions.t_dat.max()

# Engineered feature 1: item popularity, weighted by amount of weeks since last purchase
transactions['weeks_ago'] = (last_date - transactions.t_dat).dt.days // 7 + 1

# calculate the popularity of each article_id: sum of the number of times it was purchased each week, weighted by weeks_ago
popularity = transactions\
    .groupby(['article_id', 'weeks_ago']).size()\
    .reset_index(name='purchase_count')\
    .groupby('article_id')\
    .apply(lambda x: np.sum(x.purchase_count / x.weeks_ago))\
    .reset_index(name='article_popularity')

transactions = transactions.merge(popularity, how='inner', on='article_id')
transactions.drop(columns=['weeks_ago'], inplace=True)

# Generating candidates

### Last purchase candidates

In [6]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 8.06 s, sys: 123 ms, total: 8.18 s
Wall time: 8.19 s


In [7]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 242 ms, sys: 20.7 ms, total: 263 ms
Wall time: 262 ms


In [8]:
candidates_last_purchase = transactions.copy()

In [9]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 8.63 s, sys: 76.4 ms, total: 8.7 s
Wall time: 8.73 s


### Bestsellers candidates

In [10]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [11]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [12]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [13]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [14]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [15]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week
test_set_transactions.shape

(437365, 5)

In [16]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [17]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

### Item/user similarity

In [18]:
proc = DataFramePreprocessor(item_ix='article_id', user_ix='customer_id', timestamp_ix='week')
proc.add_filter(MinUsersPerItem(20, item_ix='article_id', user_ix='customer_id'))
proc.add_filter(MinItemsPerUser(20, item_ix='article_id', user_ix='customer_id'))

interaction_matrix = proc.process(transactions[["customer_id", "article_id", "week"]])

100%|██████████| 420382/420382 [00:00<00:00, 878425.49it/s]
100%|██████████| 420382/420382 [00:00<00:00, 765780.30it/s]


In [19]:
kunn = KUNN(Ku=10, Ki=10)
kunn.fit(interaction_matrix)

2022-11-13 17:48:47,934 - base - recpack - INFO - Fitting KUNN complete - Took 0.373s


In [20]:
predictions = kunn.predict(interaction_matrix)

In [21]:
def top_n_idx_sparse(matrix, n):
    """Return index of top n values in each row of a sparse matrix.
    source: https://stackoverflow.com/questions/49207275/finding-the-top-n-values-in-a-row-of-a-scipy-sparse-matrix
    """
    '''Return index of top n values in each row of a sparse matrix'''
    top_n_idx = []
    for le, ri in zip(matrix.indptr[:-1], matrix.indptr[1:]):
        n_row_pick = min(n, ri - le)
        top_n_idx.append(matrix.indices[le + np.argpartition(matrix.data[le:ri], -n_row_pick)[-n_row_pick:]].tolist())
    return top_n_idx

In [22]:
# get top 10 item indices for each user
pred_indices = top_n_idx_sparse(predictions, 10)

In [23]:
# # maps uid (user_id) value vack to customer_id
uid_cid_map = interaction_matrix._df[["uid", "customer_id"]].drop_duplicates().set_index("uid").to_dict()["customer_id"]
# maps iid (item_id) value vack to article_id
iid_aid_map = interaction_matrix._df[["iid", "article_id"]].drop_duplicates().set_index("iid").to_dict()["article_id"]

In [24]:
pred_customers = []
pred_articles = []
for i, row in enumerate(pred_indices):
    for item_id in row:
        pred_customers.append(uid_cid_map[i])
        pred_articles.append(iid_aid_map[item_id])

similarity_items = pd.DataFrame({"customer_id": pred_customers, "article_id": pred_articles})
similarity_items["week"] = test_week
similarity_items.head()

Unnamed: 0,customer_id,article_id,week
0,3286895746334673872,909924002,105
1,3286895746334673872,944506001,105
2,3286895746334673872,887681002,105
3,3286895746334673872,774043005,105
4,3286895746334673872,854826002,105


In [29]:
test_set_transactions.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_popularity
0,2020-07-15,272412481300040,1,105,252.978175
1,2020-07-15,251373156213565105,2,105,252.978175
2,2020-07-15,401724781178962911,1,105,252.978175
3,2020-07-15,664658955429565460,1,105,252.978175
4,2020-07-15,1535663517786086475,2,105,252.978175


In [31]:
similarity_candidates = pd.merge(
    similarity_items,
    test_set_transactions,
    on=["week", "customer_id"],
    how="left",
)
similarity_candidates.head()

Unnamed: 0,customer_id,article_id,week,t_dat,sales_channel_id,article_popularity
0,3286895746334673872,909924002,105,2020-07-15,2,252.978175
1,3286895746334673872,944506001,105,2020-07-15,2,252.978175
2,3286895746334673872,887681002,105,2020-07-15,2,252.978175
3,3286895746334673872,774043005,105,2020-07-15,2,252.978175
4,3286895746334673872,854826002,105,2020-07-15,2,252.978175


# Combining transactions and candidates / negative examples

In [36]:
transactions['purchased'] = 1

In [None]:
transactions['purchased'] = 1

In [37]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers, similarity_candidates])
data.purchased.fillna(0, inplace=True)

In [38]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [39]:
data.purchased.mean()

0.13518841083268948

### Add bestseller information

In [40]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [41]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [42]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [43]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [44]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [45]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [46]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'member_status', 'article_popularity',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [47]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 124 ms, sys: 275 ms, total: 399 ms
Wall time: 595 ms


# Model training

In [48]:
from lightgbm.sklearn import LGBMRanker

In [49]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [50]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.850065
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.183398
[LightGBM] [Debug] init for col-wise cost 0.133471 seconds, init for row-wise cost 0.218139 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1397
[LightGBM] [Info] Number of data points in the train set: 11383971, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: user 8.65 s, sys: 1.7 s, total: 10.4 s
Wall time: 4.8 s


In [51]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9816601479090191
article_popularity 0.012186790204185673
article_id 0.003454411979686151
product_type_no 0.000702214256630729
colour_group_code 0.0006936667475777567
section_no 0.0003966379234883029
perceived_colour_master_id 0.0003263479064362502
garment_group_no 0.00019612022128237137
perceived_colour_value_id 0.00019554148792473323
graphical_appearance_no 0.00018812136376886016
fashion_news_frequency 0.0
club_member_status 0.0
age 0.0
member_status 0.0
postal_code 0.0
index_code 0.0
department_no 0.0
index_group_no 0.0


# Calculate predictions

In [52]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 1e+03 ns, sys: 1e+03 ns, total: 2 µs
Wall time: 3.1 µs


# Create submission

In [53]:
sub = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')

In [54]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 2.06 s, sys: 71.1 ms, total: 2.13 s
Wall time: 2.13 s


In [55]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [56]:
sub_name = 'submission_lgbm_1'
sub.to_csv(f'{DATA_PATH}/subs/{sub_name}.csv.gz', index=False)