In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [4]:
%%time

transactions = pd.read_parquet('../input/hm-lgbm-supporting-data/transactions_train.parquet')
customers = pd.read_parquet('../input/hm-lgbm-supporting-data/customers.parquet')
articles = pd.read_parquet('../input/hm-lgbm-supporting-data/articles.parquet')

CPU times: user 2.13 s, sys: 2.73 s, total: 4.85 s
Wall time: 6.74 s


In [5]:
from collections import defaultdict

val_week_purchases_by_cust = defaultdict(list)

val_week_purchases_by_cust.update(
    transactions[transactions.week == transactions.week.max()] \
        .groupby('customer_id')['article_id'] \
        .apply(list) \
        .to_dict()
)

pd.to_pickle(dict(val_week_purchases_by_cust), 'val_week_purchases_by_cust.pkl')

In [6]:
test_week = transactions.week.max()
transactions = transactions[transactions.week != transactions.week.max()]
transactions = transactions[transactions.week > transactions.week.max() - 10]

In [7]:
test_week

104

In [8]:
transactions

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
...,...,...,...,...,...,...
31536744,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536745,2020-09-15,18446630855572834764,568601045,0.050831,2,103
31536746,2020-09-15,18446630855572834764,898713001,0.067780,2,103
31536747,2020-09-15,18446630855572834764,898713001,0.067780,2,103


# Generating candidates

### Last purchase candidates

In [9]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 22.3 s, sys: 235 ms, total: 22.6 s
Wall time: 22.4 s


In [10]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
94,2020-07-08,2020-07-14
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15


In [11]:
c2weeks

customer_id
28847241659200          [94, 95, 96, 101, 102]
41318098387474                            [98]
116809474287335                     [101, 103]
200292573348128              [95, 96, 99, 102]
208119717816961                           [94]
                                 ...          
18446590778427270109                 [97, 102]
18446624797007271432                      [95]
18446630855572834764                     [103]
18446662237889060501                     [100]
18446705133201055310                     [102]
Name: week, Length: 439368, dtype: object

In [12]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 1.02 s, sys: 87.6 ms, total: 1.11 s
Wall time: 1.11 s


In [13]:
c2weeks2shifted_weeks[28847241659200]

{94: 95, 95: 96, 96: 101, 101: 102, 102: 104}

In [14]:
candidates_last_purchase = transactions.copy()

In [15]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: user 12.1 s, sys: 65.7 ms, total: 12.2 s
Wall time: 12.2 s


In [16]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
29319533,2020-07-22,272412481300040,885077001,0.008458,1,103
29410772,2020-07-24,272412481300040,850176003,0.029034,2,103
29410773,2020-07-24,272412481300040,875803001,0.064559,2,103
29410774,2020-07-24,272412481300040,892970003,0.020966,2,103
29410775,2020-07-24,272412481300040,854619003,0.020966,2,103


In [17]:
transactions[transactions['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95
29319533,2020-07-22,272412481300040,885077001,0.008458,1,96
29410772,2020-07-24,272412481300040,850176003,0.029034,2,96
29410773,2020-07-24,272412481300040,875803001,0.064559,2,96
29410774,2020-07-24,272412481300040,892970003,0.020966,2,96
29410775,2020-07-24,272412481300040,854619003,0.020966,2,96


### Bestsellers candidates

In [18]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [19]:
mean_price

week  article_id
94    108775044     0.008339
      110065002     0.005508
      111565001     0.005452
      111586001     0.013270
      111593001     0.011669
                      ...   
103   952267001     0.014631
      952938001     0.045746
      953450001     0.016769
      953763001     0.021932
      956217002     0.059203
Name: price, Length: 199492, dtype: float32

In [20]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [21]:
sales

week  article_id
94    806388001      1
      730683021      2
      610776002      3
      805308002      4
      866383006      5
                    ..
103   918292001      8
      762846027      9
      809238005     10
      673677002     11
      923758001     12
Name: bestseller_rank, Length: 120, dtype: int8

In [22]:
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

In [23]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [24]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
12,96,760084003,1,0.025094
13,96,866731001,2,0.024919
14,96,600886001,3,0.02298
15,96,706016001,4,0.033197
16,96,372860002,5,0.013193
17,96,610776002,6,0.008318
18,96,877278002,7,0.025036
19,96,547780003,8,0.024814
20,96,817354001,9,0.021913
21,96,827968001,10,0.016436


In [25]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [26]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
28777300,2020-07-08,857913002275398,1,94
28787123,2020-07-08,1658289241058394,1,94
28788562,2020-07-08,3828854365940846,1,94
28744235,2020-07-08,4195624216542755,1,94
28753719,2020-07-08,4233235614030232,2,94
...,...,...,...,...
31521960,2020-09-15,18439897732908966680,2,103
31531712,2020-09-15,18444276791873187543,2,103
31539937,2020-09-15,18444799607866739422,2,103
31543799,2020-09-15,18446250046654386343,1,103


In [27]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94
28744235,2020-07-08,4195624216542755,817417005,0.022864,1,94
28753719,2020-07-08,4233235614030232,876986001,0.030492,2,94
...,...,...,...,...,...,...
31521960,2020-09-15,18439897732908966680,794321007,0.061000,2,103
31531712,2020-09-15,18444276791873187543,867969008,0.033881,2,103
31539937,2020-09-15,18444799607866739422,909721003,0.042356,2,103
31543799,2020-09-15,18446250046654386343,869872006,0.033881,1,103


In [28]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [29]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [30]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-08,857913002275398,1,104
1,2020-07-08,1658289241058394,1,104
2,2020-07-08,3828854365940846,1,104
3,2020-07-08,4195624216542755,1,104
4,2020-07-08,4233235614030232,2,104
...,...,...,...,...
439363,2020-09-15,18431808737044686839,1,104
439364,2020-09-15,18436707407200418746,2,104
439365,2020-09-15,18439897732908966680,2,104
439366,2020-09-15,18446250046654386343,1,104


In [31]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [32]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [33]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-15,272412481300040,1,95,806388001,0.013301
1,2020-07-15,272412481300040,1,95,730683021,0.025643
2,2020-07-15,272412481300040,1,95,610776002,0.008303
3,2020-07-15,272412481300040,1,95,805308002,0.013609
4,2020-07-15,272412481300040,1,95,866383006,0.024971
...,...,...,...,...,...,...
5272411,2020-09-15,18446630855572834764,2,104,918292001,0.041424
5272412,2020-09-15,18446630855572834764,2,104,762846027,0.025104
5272413,2020-09-15,18446630855572834764,2,104,809238005,0.041656
5272414,2020-09-15,18446630855572834764,2,104,673677002,0.024925


# Combining transactions and candidates / negative examples

In [34]:
transactions['purchased'] = 1

In [35]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [36]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
28777300,2020-07-08,857913002275398,599580068,0.008458,1,94,1.0
28777301,2020-07-08,857913002275398,776237011,0.025407,1,94,1.0
28777302,2020-07-08,857913002275398,844294001,0.011847,1,94,1.0
28787123,2020-07-08,1658289241058394,877773001,0.007610,1,94,1.0
28788562,2020-07-08,3828854365940846,507883009,0.013542,1,94,1.0
...,...,...,...,...,...,...,...
5272411,2020-09-15,18446630855572834764,918292001,0.041424,2,104,0.0
5272412,2020-09-15,18446630855572834764,762846027,0.025104,2,104,0.0
5272413,2020-09-15,18446630855572834764,809238005,0.041656,2,104,0.0
5272414,2020-09-15,18446630855572834764,673677002,0.024925,2,104,0.0


In [37]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [38]:
data.shape

(18475436, 7)

In [39]:
data.purchased.mean()

0.1369278646522875

### Add bestseller information

In [40]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [41]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [42]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [43]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [44]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [45]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [46]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [47]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: user 338 ms, sys: 221 ms, total: 559 ms
Wall time: 556 ms


# Model training

In [48]:
from lightgbm.sklearn import LGBMRanker

In [49]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [50]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.844955
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.151118
[LightGBM] [Debug] init for col-wise cost 0.255483 seconds, init for row-wise cost 1.407858 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1149
[LightGBM] [Info] Number of data points in the train set: 11557594, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
CPU times: user 27.4 s, sys: 1.66 s, total: 29 s
Wall time: 9.46 s


In [51]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.999066481760161
article_id 0.00028298358632459234
age 0.0002444961714550688
garment_group_no 7.157281908016381e-05
club_member_status 6.724645263245181e-05
postal_code 6.510081422729875e-05
product_type_no 6.100419025057176e-05
colour_group_code 5.175429599815084e-05
department_no 3.6910397657910145e-05
Active 2.4511482560733915e-05
graphical_appearance_no 1.4670190618276406e-05
perceived_colour_value_id 1.3267839033840464e-05
fashion_news_frequency 0.0
FN 0.0
section_no 0.0
index_code 0.0
perceived_colour_master_id 0.0
index_group_no 0.0


# Calculate predictions

In [52]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 4 µs, sys: 0 ns, total: 4 µs
Wall time: 8.58 µs


# Evaluate predictions

In [53]:
apks = []

for c_id, gt in val_week_purchases_by_cust.items():
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    apks.append(apk(gt, pred[:12]))

np.mean(apks)

0.022921270696628283