In [1]:
import numpy as np
import pandas as pd
from recpack.preprocessing.preprocessors import DataFramePreprocessor
from recpack.preprocessing.filters import MinItemsPerUser, MinUsersPerItem
from recpack.algorithms import KUNN
from utils import DATA_PATH, customer_hex_id_to_int
# This file builds on the code in https://github.com/radekosmulski/personalized_fashion_recs/blob/main/03c_Basic_Model_Submission.ipynb

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the three input tables from parquet files located under DATA_PATH.
articles = pd.read_parquet(f'{DATA_PATH}/articles.parquet')
customers = pd.read_parquet(f'{DATA_PATH}/customers.parquet')
transactions = pd.read_parquet(f'{DATA_PATH}/transactions_train.parquet')

In [3]:
# The week to predict is the one right after the last week present in the data.
last_observed_week = transactions['week'].max()
test_week = last_observed_week + 1

# Keep only the 10 most recent weeks of transactions.
is_recent = transactions['week'] > last_observed_week - 10
transactions = transactions[is_recent]

# Make new features

## Member status (combine `Active` and `FN`)

In [4]:
"""
A   FN      Result
-1  -1      -3
-1   1       1
 1  -1      -1    
 1   1       3
------------------
Result      Meaning             Mapping
-3          NotActive_NoFN      0
 1          NotActive_FN        1
-1          Active_NoFN         2
 3          Active_FN           3
"""
# Active + 2 * FN collapses the two flags into one of {-3, -1, 1, 3}; replace()
# then relabels those sums as 0..3 following the table in the string above.
combined_flags = customers['Active'] + customers['FN'] * 2
customers['member_status'] = combined_flags.replace({-3: 0, -1: 2, 1: 1, 3: 3})

# Generating candidates

### Last purchase candidates

In [5]:
%%time

# For every customer, collect the distinct weeks in which they have transactions.
c2weeks = (
    transactions
    .groupby('customer_id')['week']
    .unique()
)

CPU times: user 8.41 s, sys: 128 ms, total: 8.54 s
Wall time: 9.04 s


In [6]:
%%time

# customer id -> {purchase week -> the week that follows it in that customer's
# list of purchase weeks}. The last listed week maps to the test week.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    # Every week is paired with its successor in `weeks`; the final entry has
    # no successor, so test_week is appended to fill that slot.
    following_weeks = list(weeks[1:]) + [test_week]
    c2weeks2shifted_weeks[c_id] = dict(zip(weeks, following_weeks))

CPU times: user 267 ms, sys: 23.3 ms, total: 290 ms
Wall time: 290 ms


In [7]:
# Work on a full copy of the transactions: the `week` column of this frame is
# overwritten in a later cell, and `transactions` itself must stay unchanged.
candidates_last_purchase = transactions.copy(deep=True)

In [8]:
%%time

# Look up, for every transaction, the week its (customer, week) pair maps to in
# c2weeks2shifted_weeks, and use that as the candidate's week.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

CPU times: user 7 s, sys: 99.9 ms, total: 7.1 s
Wall time: 7.14 s


### Bestsellers candidates

In [9]:
# Average price of each article within each week.
mean_price = (
    transactions
    .groupby(['week', 'article_id'])['price']
    .mean()
)

In [10]:
# Per week: count purchases per article, dense-rank the counts in descending
# order (rank 1 = highest count) and keep the first 12 entries of each week.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [11]:
# Attach the weekly mean price to every ranked bestseller, then move `week`
# forward by one so each row is keyed by the week *after* the sales happened.
bestsellers_previous_week = pd.merge(
    sales,
    mean_price,
    on=['week', 'article_id'],
).reset_index()
bestsellers_previous_week['week'] += 1

In [12]:
# One row per (week, customer): the first transaction of each pair, without the
# article-specific columns.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [13]:
# Pair every (week, customer) row with all bestseller rows carrying the same week.
candidates_bestsellers = unique_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [14]:
# One row per customer, with `week` overwritten by the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week
test_set_transactions.shape

(437365, 4)

In [15]:
# Give every test-week customer row the bestseller rows keyed by the test week.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [16]:
# Stack the historical and test-week bestseller candidates; the rank column is
# removed from the result.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

### Time-weighted popularity candidates

In [17]:
# Number of transactions of each article in each week.
popularity = (
    transactions
    .groupby(['article_id', 'week'])
    .size()
    .reset_index(name='weekly_purchase_count')
)

popularity.head()

Unnamed: 0,article_id,week,weekly_purchase_count
0,108775015,95,2
1,108775015,96,1
2,108775044,95,3
3,108775044,96,8
4,108775044,97,6


In [18]:
weekly_popularity = []


def func(row):
    """Return the decayed popularity score for one row of `popularity`.

    The score is the row's `weekly_purchase_count` plus half of the most
    recently appended entry of the module-level `weekly_popularity` list,
    provided `popularity` holds an earlier week for the same article;
    otherwise it is just the row's own count.

    NOTE(review): the carried-over value is whatever was appended last, so
    this presumably relies on being called row by row with `popularity`
    ordered by article and then week -- confirm against the caller.
    """
    same_article = popularity.article_id == row.article_id
    earlier_week = popularity.week < row.week
    carried_over = weekly_popularity[-1] if (same_article & earlier_week).any() else 0
    return carried_over / 2.0 + float(row.weekly_purchase_count)

In [19]:
# Decayed purchase count per article: each row's score is its own weekly count
# plus half of the same article's score on its previous row.
#
# The earlier version called a helper that re-filtered the whole `popularity`
# frame for every row (quadratic in the number of rows). `popularity` comes out
# of a groupby on (article_id, week), which leaves it ordered by those keys, so
# "this article has an earlier week" is the same as "the previous row belongs
# to the same article" and a single linear pass is enough.
#
# The list is rebuilt from scratch so the cell can be re-run safely.
weekly_popularity = []
previous_article_id = None
running_score = 0.0
for article_id, purchase_count in zip(
    popularity['article_id'].tolist(),
    popularity['weekly_purchase_count'].tolist(),
):
    # Only carry the decayed score over while we stay on the same article.
    carried_over = running_score / 2.0 if article_id == previous_article_id else 0.0
    running_score = carried_over + float(purchase_count)
    previous_article_id = article_id
    weekly_popularity.append(running_score)
popularity['weekly_popularity'] = weekly_popularity
popularity.head(20)

Unnamed: 0,article_id,week,weekly_purchase_count,weekly_popularity
0,108775015,95,2,2.0
1,108775015,96,1,2.0
2,108775044,95,3,3.0
3,108775044,96,8,9.5
4,108775044,97,6,10.75
5,108775044,98,5,10.375
6,108775044,99,5,10.1875
7,108775044,100,1,6.09375
8,108775044,101,8,11.046875
9,108775044,102,4,9.523438


In [35]:
# The 20 rows with the highest `weekly_popularity` in each week, with the most
# recent week first.
popular_articles_per_week = (
    popularity
    .sort_values(['week', 'weekly_popularity'], ascending=False)
    .groupby('week')
    .head(20)
    .reset_index(drop=True)
)
popular_articles_per_week.head()

Unnamed: 0,article_id,week,weekly_purchase_count,weekly_popularity
0,924243001,104,852,1236.875
1,909370001,104,537,1190.5
2,751471001,104,526,1173.253906
3,918522001,104,609,1064.5
4,448509014,104,490,1004.210938


In [36]:
# Attach the weekly mean price to every popular article, then move `week`
# forward by one so each row is keyed by the week *after* it was measured.
popular_articles_previous_week = popular_articles_per_week.merge(
    mean_price,
    on=['week', 'article_id'],
).reset_index(drop=True)
popular_articles_previous_week['week'] += 1
popular_articles_previous_week.head()

Unnamed: 0,article_id,week,weekly_purchase_count,weekly_popularity,price
0,924243001,105,852,1236.875,0.041535
1,909370001,105,537,1190.5,0.03264
2,751471001,105,526,1173.253906,0.033423
3,918522001,105,609,1064.5,0.041435
4,448509014,105,490,1004.210938,0.04163


In [37]:
# One row per (week, customer): the first transaction of each pair, without the
# article-specific columns. (Same construction as in the bestseller section.)
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

unique_transactions.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
29030503,2020-07-15,272412481300040,1,95
29064059,2020-07-15,1456826891333599,1,95
29067103,2020-07-15,2133687643102426,2,95
29027487,2020-07-15,6010692573790711,1,95
29046403,2020-07-15,6171059100114610,2,95


In [38]:
# Pair every (week, customer) row with all popular-article rows carrying the
# same week.
candidates_most_popular = unique_transactions.merge(
    popular_articles_previous_week,
    on='week',
)
candidates_most_popular.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,weekly_purchase_count,weekly_popularity,price
0,2020-07-22,200292573348128,2,96,760084003,581,581.0,0.025094
1,2020-07-22,200292573348128,2,96,866731001,530,530.0,0.024919
2,2020-07-22,200292573348128,2,96,600886001,496,496.0,0.02298
3,2020-07-22,200292573348128,2,96,706016001,461,461.0,0.033197
4,2020-07-22,200292573348128,2,96,372860002,445,445.0,0.013193


In [39]:
# One row per customer, with `week` overwritten by the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week
test_set_transactions.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,272412481300040,1,105
1,2020-07-15,1456826891333599,1,105
2,2020-07-15,2133687643102426,2,105
3,2020-07-15,6010692573790711,1,105
4,2020-07-15,6171059100114610,2,105


In [40]:
# Give every test-week customer row the popular-article rows keyed by the test week.
candidates_most_popular_test_week = test_set_transactions.merge(
    popular_articles_previous_week,
    on='week',
)
candidates_most_popular_test_week.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,weekly_purchase_count,weekly_popularity,price
0,2020-07-15,272412481300040,1,105,924243001,852,1236.875,0.041535
1,2020-07-15,272412481300040,1,105,909370001,537,1190.5,0.03264
2,2020-07-15,272412481300040,1,105,751471001,526,1173.253906,0.033423
3,2020-07-15,272412481300040,1,105,918522001,609,1064.5,0.041435
4,2020-07-15,272412481300040,1,105,448509014,490,1004.210938,0.04163


In [41]:
# Stack the historical and test-week popularity candidates; the two popularity
# columns are removed from the result.
candidates_most_popular = pd.concat(
    [candidates_most_popular, candidates_most_popular_test_week]
).drop(columns=['weekly_purchase_count', 'weekly_popularity'])
candidates_most_popular.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.02298
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193


# Combining transactions and candidates / negative examples

In [42]:
# Label every real transaction as a positive example. The candidate frames
# were derived from `transactions` in earlier cells, before this column was added.
transactions['purchased'] = 1

In [44]:
# Stack the real purchases on top of all generated candidates. Rows that come
# from a frame without a `purchased` column get NaN there after the concat and
# are turned into negatives (0) below.
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers, candidates_most_popular])
# Assign the filled column back instead of calling fillna(..., inplace=True) on
# `data.purchased`: the in-place call on the attribute-accessed Series is a
# chained assignment, which pandas warns about and which no longer updates
# `data` once Copy-on-Write is active.
data['purchased'] = data['purchased'].fillna(0)
data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0


In [45]:
# Keep the first row of each (customer, article, week) triple. `transactions`
# is first in the concat, so a real purchase wins over a duplicate candidate.
data = data.drop_duplicates(subset=['customer_id', 'article_id', 'week'])

In [46]:
# Mean of the 0/1 `purchased` flag over all rows.
data['purchased'].mean()

0.08490357819934453

### Add bestseller information

In [47]:
# Left-join the bestseller rank onto every row by (week, article); rows without
# a match keep NaN in `bestseller_rank`.
bestseller_ranks = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(bestseller_ranks, on=['week', 'article_id'], how='left')

In [48]:
# Drop the earliest week in `data`.
# NOTE(review): presumably because that week has no previous-week bestseller
# information to join against -- confirm.
data = data[data.week != data.week.min()]
# Rows without a bestseller match get the sentinel rank 999. The column is
# rebuilt through assign() rather than fillna(..., inplace=True) on
# `data.bestseller_rank`: that was a chained in-place update on a filtered
# frame, which pandas warns about and which no longer modifies `data` once
# Copy-on-Write is active.
data = data.assign(bestseller_rank=data['bestseller_rank'].fillna(999))

In [49]:
# Left-join article attributes, then customer attributes, onto every row.
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

### Add item popularity information

In [50]:
# Left-join the popularity features by (week, article). Only articles present
# in `popular_articles_previous_week` get values; all other rows keep NaN.
popularity_features = popular_articles_previous_week[
    ['week', 'article_id', 'weekly_purchase_count', 'weekly_popularity']
]
data = data.merge(popularity_features, on=['week', 'article_id'], how='left')

In [51]:
# Order rows by week, then customer, and renumber the index from 0.
# The per-(week, customer) group sizes computed later are passed to the ranker
# as `group`, so rows of one group presumably need to be contiguous here.
data = (
    data
    .sort_values(['week', 'customer_id'])
    .reset_index(drop=True)
)

In [52]:
# Rows of the test week form the prediction set; everything else is training data.
is_test_week = data.week == test_week
train = data[~is_test_week]
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [53]:
# Number of training rows in each (week, customer) group, as a plain array.
train_baskets = (
    train
    .groupby(['week', 'customer_id'])['article_id']
    .count()
    .values
)

In [54]:
# Feature columns handed to the ranker, in the order the model receives them.
columns_to_use = [
    'article_id',
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
    'member_status',
    'weekly_purchase_count',
    'weekly_popularity',
    'club_member_status',
    'fashion_news_frequency',
    'age',
    'postal_code',
    'bestseller_rank',
]

In [55]:
%%time

# Feature matrix and label for training; feature matrix only for the test rows.
train_X, train_y = train[columns_to_use], train['purchased']
test_X = test[columns_to_use]

CPU times: user 194 ms, sys: 692 ms, total: 886 ms
Wall time: 1.35 s


# Model training

In [56]:
from lightgbm.sklearn import LGBMRanker

In [57]:
# Hyper-parameters for the LightGBM ranker, collected in one place.
ranker_params = dict(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10,
)
ranker = LGBMRanker(**ranker_params)

In [58]:
%%time

# Fit the ranker; `group` carries the size of each (week, customer) group.
ranker = ranker.fit(X=train_X, y=train_y, group=train_baskets)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.852975
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.173158
[LightGBM] [Debug] init for col-wise cost 0.185283 seconds, init for row-wise cost 0.421545 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1463
[LightGBM] [Info] Number of data points in the train set: 18461334, number of used features: 19
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
CPU times: user 15 s, sys: 3.73 s, total: 18.8 s
Wall time: 9.09 s


In [59]:
# Print every feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for i in importances.argsort()[::-1]:
    print(columns_to_use[i], importances[i] / total_importance)

weekly_purchase_count 0.8141454723418815
bestseller_rank 0.18510623006892368
age 0.00023706736807253026
weekly_popularity 0.0001845603132702196
article_id 0.00014887074154170323
garment_group_no 5.500367406866335e-05
colour_group_code 4.4040845175984095e-05
perceived_colour_value_id 3.4796984550075124e-05
postal_code 2.2552588490657477e-05
club_member_status 2.1405074024945085e-05
fashion_news_frequency 0.0
member_status 0.0
index_group_no 0.0
index_code 0.0
department_no 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
product_type_no 0.0
section_no 0.0


# Calculate predictions

In [60]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 1e+03 ns, sys: 1 µs, total: 2 µs
Wall time: 3.1 µs


# Create submission

In [61]:
# Load the sample submission; its `customer_id` column drives the prediction
# loop in the next cell.
sub = pd.read_csv(f'{DATA_PATH}/sample_submission.csv')

In [62]:
%%time
# For every submission customer: the ranked candidates (an empty list when the
# customer has none), followed by last week's bestsellers, cut to 12 items.
preds = [
    (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in customer_hex_id_to_int(sub.customer_id)
]

CPU times: user 2.16 s, sys: 80.9 ms, total: 2.24 s
Wall time: 2.26 s


In [63]:
# Turn each customer's list of article ids into one space-separated string,
# with a '0' put in front of every id.
# NOTE(review): the '0' prefix presumably restores the leading zero of the
# original string ids -- confirm all ids have the same number of digits.
preds = [
    ' '.join('0' + str(article_id) for article_id in customer_preds)
    for customer_preds in preds
]
sub.prediction = preds

In [64]:
import os

sub_name = 'submission_lgbm_article_popularity'
# to_csv does not create missing parent directories, so make sure the output
# folder exists before writing; otherwise the whole run ends in an OSError.
os.makedirs(f'{DATA_PATH}/subs', exist_ok=True)
# The .gz suffix makes pandas compress the file on write.
sub.to_csv(f'{DATA_PATH}/subs/{sub_name}.csv.gz', index=False)