Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [178]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists; 0.0 when
            `actual` is None or empty.

    """
    # Check emptiness with len() instead of truthiness: `not actual` raises
    # for NumPy arrays with more than one element. Doing it up front also
    # avoids scanning `predicted` when the result is known to be 0.0.
    if actual is None or len(actual) == 0:
        return 0.0

    if len(predicted) > k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    Each ground-truth list in `actual` is paired positionally with the
    prediction list at the same position in `predicted`; `apk` is evaluated
    on every pair and the scores are averaged.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    pair_scores = []
    for truth, ranked in zip(actual, predicted):
        pair_scores.append(apk(truth, ranked, k))
    return np.mean(pair_scores)

In [179]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hexadecimal customer-id strings to integers.

    Only the last 16 characters of each string are kept; each suffix is then
    parsed as a base-16 number by `hex_id_to_int`.
    """
    id_suffixes = series.str[-16:]
    return id_suffixes.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of a hexadecimal string as an integer.

    The parameter name shadows the built-in `str`; it is kept as-is so the
    function signature does not change for existing callers.
    """
    trailing_digits = str[-16:]
    return int(trailing_digits, 16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to the int32 dtype."""
    return series.astype(dtype='int32')

def article_id_int_to_str(series):
    """Render article ids as strings with a single '0' prepended."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    `fit` records, per column, the values that occur more than
    `min_examples` times; `transform` replaces each value with its position
    in that list (values outside the list get pandas' "missing" code, -1).

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count in the fitted
        column is strictly greater than this threshold.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of kept values per fitted column; populated by `fit`.
        self.categories = []

    def fit(self, X, y=None):
        """Learn the kept categories of every column of `X`.

        `y` is accepted (and ignored) so the transformer can be used through
        `fit_transform(X, y)` and inside pipelines.
        """
        # Rebuild the list from scratch: appending to the previous contents
        # would leave stale categories at the indices `transform` reads
        # whenever the estimator is fitted more than once.
        self.categories = [
            self._frequent_values(X.iloc[:, i]) for i in range(X.shape[1])
        ]
        return self

    def _frequent_values(self, column):
        """Return the values of `column` seen more than `min_examples` times."""
        counts = column.value_counts()
        return counts[counts > self.min_examples].index.tolist()

    def transform(self, X):
        """Return a new DataFrame holding the category codes of `X`.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it carries a fresh default index
        rather than the index of `X`.
        """
        data = {
            name: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes
            for i, name in enumerate(X.columns)
        }
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Return the mean average precision at 12 over paired lists.

    `list_of_preds` and `list_of_gts` are walked in lockstep; each pair is
    scored with `apk(gt, preds, k=12)` and the scores are averaged.
    """
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_pair_scores = [
        apk(gt, preds, k=12) for preds, gt in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_pair_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    The `prediction` column of the submission and of
    'data/validation_ground_truth.parquet' are both split on whitespace and
    compared row by row (pairing is positional, so both files must list
    customers in the same order). Returns the mean AP@12.

    When `skip_cust_with_no_purchases` is true, rows whose ground truth
    splits to an empty list are left out of the average.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    purchased_lists = ground_truth.prediction.str.split()

    scores = []
    for pred, gt in zip(predicted_lists, purchased_lists):
        if skip_cust_with_no_purchases and gt == []:
            continue
        scores.append(apk(gt, pred, k=12))
    return np.mean(scores)

In [180]:
import pandas as pd

In [181]:
%%time

transactions = pd.read_parquet('../input/warmup/transactions_train.parquet')
customers = pd.read_parquet('../input/warmup/customers.parquet')
articles = pd.read_parquet('../input/article-with-fabric/articles_with_first_5_fabric.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 2.29 s, sys: 2.63 s, total: 4.92 s
Wall time: 4.35 s


In [182]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,section_no,section_name,garment_group_no,garment_group_name,detail_desc,cotton,elastic,jersey,knit,metal
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,16,30,1002,2,8834,0,0,1,0,0
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,16,30,1002,2,8834,0,0,1,0,0
2,108775051,108775,44846,253,9,0,1010017,3,11,11,...,16,30,1002,2,8834,0,0,1,0,0
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,61,5,1017,4,8243,0,0,0,0,0
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,61,5,1017,4,8243,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,24043,302,14,7,1010014,6,9,0,...,26,16,1021,13,28026,1,1,0,1,0
105538,953763001,953763,24027,253,9,0,1010016,0,9,0,...,2,15,1005,0,28025,0,0,0,0,0
105539,956217002,956217,24040,265,1,2,1010016,0,9,0,...,18,12,1005,0,28024,0,0,1,0,0
105540,957375001,957375,24041,72,48,3,1010016,0,9,0,...,52,25,1019,1,28023,0,0,0,0,0


In [183]:
# The week to predict is the one right after the last week in the data.
latest_week = transactions.week.max()
test_week = latest_week + 1
# Keep only the two most recent weeks of transactions.
transactions = transactions[transactions.week > latest_week - 2]

# Generating candidates

### Last purchase candidates

In [184]:
%%time

# For every customer, the distinct weeks in which they have a transaction
# (in order of first appearance in `transactions`).
weeks_by_customer = transactions.groupby('customer_id')['week']
c2weeks = weeks_by_customer.unique()

CPU times: user 7.67 s, sys: 20.4 ms, total: 7.69 s
Wall time: 7.68 s


In [185]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [186]:
c2weeks

customer_id
116809474287335         [103]
272412481300040         [103]
690285180337957         [103]
1200402310946735        [103]
1219588721247131        [103]
                        ...  
18445641720816255142    [104]
18446250046654386343    [103]
18446420423308293068    [103]
18446630855572834764    [103]
18446737527580148316    [104]
Name: week, Length: 128333, dtype: object

In [187]:
%%time

# customer id -> {week with purchases: the customer's next week with
# purchases}; the customer's last week maps to `test_week`.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    next_week_of = dict(zip(weeks[:-1], weeks[1:]))
    next_week_of[weeks[-1]] = test_week
    c2weeks2shifted_weeks[c_id] = next_week_of

CPU times: user 353 ms, sys: 36.1 ms, total: 390 ms
Wall time: 389 ms


In [188]:
#c2weeks2shifted_weeks[28847241659200]

In [189]:
# Start the "last purchase" candidates from an independent copy of the
# transactions; its `week` column is overwritten further down.
candidates_last_purchase = transactions.copy()

In [190]:
%%time

# Relabel every purchase with the customer's next active week (or
# `test_week` for their latest one), turning past purchases into
# candidates for that following week.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

CPU times: user 2.34 s, sys: 3.88 ms, total: 2.35 s
Wall time: 2.35 s


In [191]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
31495014,2020-09-14,272412481300040,923460002,0.039932,2,105
31495015,2020-09-14,272412481300040,922381001,0.039932,2,105
31495016,2020-09-14,272412481300040,921906005,0.031949,2,105


In [192]:
transactions[transactions['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
31495014,2020-09-14,272412481300040,923460002,0.039932,2,103
31495015,2020-09-14,272412481300040,922381001,0.039932,2,103
31495016,2020-09-14,272412481300040,921906005,0.031949,2,103


### Bestsellers candidates

In [193]:
# Average selling price of each article within each week.
price_by_week_article = transactions.groupby(['week', 'article_id'])['price']
mean_price = price_by_week_article.mean()

In [194]:
mean_price

week  article_id
103   108775044     0.008458
      111565001     0.006977
      111586001     0.012098
      111593001     0.011912
      111609001     0.008569
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 36597, dtype: float32

In [195]:
# Weekly bestsellers: count purchases per (week, article), rank the counts
# within each week (dense ranking, highest count gets rank 1), then keep the
# first 12 rows of every week. The result is a Series named
# 'bestseller_rank' stored as int8.
# NOTE(review): head(12) relies on value_counts() listing articles by
# descending count inside each week — confirm for the pandas version in use.
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [196]:
sales

week  article_id
103   909370001      1
      865799006      2
      918522001      3
      924243001      4
      448509014      5
      751471001      6
      809238001      7
      918292001      8
      762846027      9
      809238005     10
      673677002     11
      923758001     12
104   924243001      1
      924243002      2
      918522001      3
      923758001      4
      866731001      5
      909370001      6
      751471001      7
      915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, dtype: int8

In [198]:
sales.loc[103]

article_id
909370001     1
865799006     2
918522001     3
924243001     4
448509014     5
751471001     6
809238001     7
918292001     8
762846027     9
809238005    10
673677002    11
923758001    12
Name: bestseller_rank, dtype: int8

In [199]:
# Attach the weekly mean price to each bestseller, then move the week label
# forward by one so that a row labelled week W describes the bestsellers of
# week W - 1.
bestsellers_previous_week = pd.merge(
    left=sales,
    right=mean_price,
    on=['week', 'article_id'],
).reset_index()
bestsellers_previous_week['week'] = bestsellers_previous_week['week'] + 1

In [200]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==102])

Unnamed: 0,week,article_id,bestseller_rank,price


In [201]:
# One row per (week, customer): the first transaction of each pair, without
# the article-specific columns.
first_row_per_customer_week = transactions.groupby(['week', 'customer_id']).head(1)
unique_transactions = (
    first_row_per_customer_week
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [202]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
31317806,2020-09-09,4920151714340210,1,103
31327797,2020-09-09,6611639188934298,2,103
31304107,2020-09-09,7009916629804102,2,103
31329529,2020-09-09,8383252499052781,1,103
31306970,2020-09-09,8853302310499171,2,103
...,...,...,...,...
31760188,2020-09-22,18435221511488011015,1,104
31782234,2020-09-22,18436859303155335645,1,104
31787251,2020-09-22,18437941771381362708,2,104
31776022,2020-09-22,18438270306572912089,1,104


In [203]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
31317806,2020-09-09,4920151714340210,564358058,0.033881,1,103
31327797,2020-09-09,6611639188934298,685814003,0.030492,2,103
31304107,2020-09-09,7009916629804102,933032002,0.116831,2,103
31329529,2020-09-09,8383252499052781,865929007,0.016932,1,103
31306970,2020-09-09,8853302310499171,915526001,0.030492,2,103
...,...,...,...,...,...,...
31760188,2020-09-22,18435221511488011015,573085055,0.033881,1,104
31782234,2020-09-22,18436859303155335645,801447001,0.030492,1,104
31787251,2020-09-22,18437941771381362708,907188001,0.050831,2,104
31776022,2020-09-22,18438270306572912089,751471043,0.033881,1,104


In [204]:
# Pair every (week, customer) row with the bestsellers whose shifted week
# label matches, i.e. the bestsellers of the preceding week.
candidates_bestsellers = unique_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [205]:
# One row per customer, relabelled as belonging to the week to predict.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

In [206]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-09-09,4920151714340210,1,105
1,2020-09-09,6611639188934298,2,105
2,2020-09-09,7009916629804102,2,105
3,2020-09-09,8383252499052781,1,105
4,2020-09-09,8853302310499171,2,105
...,...,...,...,...
128328,2020-09-22,18422784312842572958,1,105
128329,2020-09-22,18432411165497420051,1,105
128330,2020-09-22,18435221511488011015,1,105
128331,2020-09-22,18437941771381362708,2,105


In [207]:
# Bestseller candidates for the test week: every customer row is joined to
# the bestsellers labelled with `test_week`.
candidates_bestsellers_test_week = test_set_transactions.merge(
    bestsellers_previous_week,
    on='week',
)

In [208]:
# Stack training-week and test-week bestseller candidates; the rank column
# is dropped here and merged back in from bestsellers_previous_week later.
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

In [209]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-09-16,3177658828628418,1,104,909370001,0.032947
1,2020-09-16,3177658828628418,1,104,865799006,0.033340
2,2020-09-16,3177658828628418,1,104,918522001,0.041416
3,2020-09-16,3177658828628418,1,104,924243001,0.041549
4,2020-09-16,3177658828628418,1,104,448509014,0.041604
...,...,...,...,...,...,...
1539991,2020-09-22,18438270306572912089,1,105,915529003,0.033439
1539992,2020-09-22,18438270306572912089,1,105,915529005,0.033417
1539993,2020-09-22,18438270306572912089,1,105,448509014,0.041630
1539994,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [210]:
# Mark every real transaction as a positive example. The candidate frames
# built earlier do not have this column.
transactions['purchased'] = 1

In [211]:
# Real purchases first, then the generated candidates. Candidate rows have
# no `purchased` value after the concat, so they become negatives (0).
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data.purchased`: that form operates on a temporary Series and does not
# update `data` when pandas Copy-on-Write is active.
data['purchased'] = data['purchased'].fillna(0)

In [212]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
31317806,2020-09-09,4920151714340210,564358058,0.033881,1,103,1.0
31317807,2020-09-09,4920151714340210,568601044,0.050831,1,103,1.0
31317808,2020-09-09,4920151714340210,779781013,0.042356,1,103,1.0
31317809,2020-09-09,4920151714340210,843465004,0.050831,1,103,1.0
31317810,2020-09-09,4920151714340210,715828013,0.033881,1,103,1.0
...,...,...,...,...,...,...,...
1539991,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0
1539992,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0
1539993,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0
1539994,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0


In [213]:
# Keep one row per (customer, article, week). The first occurrence wins, and
# real purchases were concatenated ahead of the candidates.
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [214]:
data.purchased.mean()

0.13666592913850006

### Add bestseller information

In [215]:
# Left-join the bestseller rank onto every row; rows whose article is not a
# bestseller for that week label get a missing rank.
rank_lookup = bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']]
data = data.merge(
    rank_lookup,
    on=['week', 'article_id'],
    how='left',
)

In [216]:
# Drop the earliest week present in `data`.
data = data[data.week != data.week.min()]
# Articles without a bestseller rank get the sentinel 999. assign() returns
# a new frame, unlike fillna(inplace=True) on `data.bestseller_rank`, which
# acts on a temporary Series of a filtered frame and is not guaranteed to
# update `data` (it does not under pandas Copy-on-Write).
data = data.assign(bestseller_rank=data['bestseller_rank'].fillna(999))

In [217]:
# Add article attributes, then customer attributes, as feature columns.
data = data.merge(articles, on='article_id', how='left')
data = data.merge(customers, on='customer_id', how='left')

In [218]:
# Order rows by week and customer so that each (week, customer) group is
# contiguous, and renumber the index accordingly.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [219]:
# Rows labelled with the test week form the prediction set (deduplicated per
# customer / article / sales channel); everything else is training data.
is_test_week = data.week == test_week
train = data[~is_test_week]
test = (
    data[is_test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [220]:
# Number of rows in each (week, customer) group, passed to the ranker as its
# query-group sizes. This lines up with the row order only because `data`
# was sorted by the same two keys beforehand.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [221]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,section_no,section_name,garment_group_no,garment_group_name,detail_desc,cotton,elastic,jersey,knit,metal
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,16,30,1002,2,8834,0,0,1,0,0
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,16,30,1002,2,8834,0,0,1,0,0
2,108775051,108775,44846,253,9,0,1010017,3,11,11,...,16,30,1002,2,8834,0,0,1,0,0
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,61,5,1017,4,8243,0,0,0,0,0
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,61,5,1017,4,8243,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,24043,302,14,7,1010014,6,9,0,...,26,16,1021,13,28026,1,1,0,1,0
105538,953763001,953763,24027,253,9,0,1010016,0,9,0,...,2,15,1005,0,28025,0,0,0,0,0
105539,956217002,956217,24040,265,1,2,1010016,0,9,0,...,18,12,1005,0,28024,0,0,1,0,0
105540,957375001,957375,24041,72,48,3,1010016,0,9,0,...,52,25,1019,1,28023,0,0,0,0,0


In [222]:
# Feature columns fed to the ranker.
# NOTE(review): these are presumably article attributes, customer attributes
# and the merged bestseller_rank — confirm against the merged `data` columns.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active', 'cotton', 'elastic', 'jersey', 'knit', 'metal',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [223]:
%%time

# Training features and labels (purchased = 1, candidate = 0).
train_X = train[columns_to_use]
train_y = train['purchased']

# Features of the test-week candidates to be scored.
test_X = test[columns_to_use]

CPU times: user 84.9 ms, sys: 3.11 ms, total: 88 ms
Wall time: 86.6 ms


# Model training

In [224]:
from lightgbm.sklearn import LGBMRanker

In [225]:
# LambdaRank ranker with DART boosting and NDCG as the metric; feature
# importances are reported by gain.
# NOTE(review): n_estimators=1 trains a single boosting round — presumably
# kept minimal for a quick warm-up run; confirm before trusting the scores.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [226]:
%%time

# Fit the ranker; `group` gives the size of each consecutive query group
# (one group per week/customer pair) in the order the rows appear.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.811102
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.307019
[LightGBM] [Debug] init for col-wise cost 0.069303 seconds, init for row-wise cost 0.256819 seconds
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 1144
[LightGBM] [Info] Number of data points in the train set: 1076597, number of used features: 23
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
CPU times: user 4.27 s, sys: 86 ms, total: 4.36 s
Wall time: 1.33 s


In [227]:
# Print each feature with its share of the total importance, largest first.
importances = ranker.feature_importances_
total_importance = importances.sum()
for idx in importances.argsort()[::-1]:
    print(columns_to_use[idx], importances[idx] / total_importance)

bestseller_rank 0.9992792794004829
age 0.0002371779153809384
article_id 0.0001915818637345417
postal_code 7.660384842912943e-05
garment_group_no 6.117593430931383e-05
perceived_colour_value_id 5.4215518507862665e-05
Active 3.268939773768615e-05
fashion_news_frequency 2.943823918175323e-05
FN 2.8762620834597683e-05
club_member_status 9.075261401254192e-06
cotton 0.0
elastic 0.0
jersey 0.0
knit 0.0
index_group_no 0.0
index_code 0.0
department_no 0.0
perceived_colour_master_id 0.0
metal 0.0
colour_group_code 0.0
graphical_appearance_no 0.0
product_type_no 0.0
section_no 0.0


# Calculate predictions

In [228]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 5 µs, sys: 0 ns, total: 5 µs
Wall time: 38.9 µs


# Create submission

In [229]:
# Load the sample submission; its customer_id column drives the loop that
# assembles the predictions below.
sub = pd.read_csv('/kaggle/input/h-and-m-personalized-fashion-recommendations/sample_submission.csv')

In [230]:
%%time
# For each customer in the submission: their ranked candidates (if any),
# padded with last week's bestsellers, truncated to 12 articles.
preds = [
    (c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week)[:12]
    for c_id in customer_hex_id_to_int(sub.customer_id)
]

CPU times: user 6.65 s, sys: 483 ms, total: 7.13 s
Wall time: 7.13 s


In [231]:
preds

[[924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  915529003,
  915529005,
  448509014,
  762846027,
  714790020],
 [924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  915529003,
  915529005,
  448509014,
  762846027,
  714790020],
 [794321007,
  924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  448509014,
  762846027,
  714790020,
  915529003],
 [924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  915529003,
  915529005,
  448509014,
  762846027,
  714790020],
 [924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  915529003,
  915529005,
  448509014,
  762846027,
  714790020],
 [924243001,
  924243002,
  918522001,
  923758001,
  866731001,
  909370001,
  751471001,
  915529003,
  915529005,
  448509014,
  762846027,
  714790020],
 [719530003,
  924243001,
  924243002,
  918522001,
  9237

In [232]:
# Turn every list of integer article ids into one space-separated string,
# prefixing each id with '0'.
preds = [' '.join(f'0{p}' for p in ps) for ps in preds]
sub['prediction'] = preds

In [233]:
# Write the submission file without the DataFrame index column.
sub.to_csv('submission.csv', index=False)

In [234]:
pd.read_csv("/kaggle/working/submission.csv")

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0924243001 0924243002 0918522001 0923758001 08...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0924243001 0924243002 0918522001 0923758001 08...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0794321007 0924243001 0924243002 0918522001 09...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0924243001 0924243002 0918522001 0923758001 08...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0924243001 0924243002 0918522001 0923758001 08...
...,...,...
1371975,ffffbbf78b6eaac697a8a5dfbfd2bfa8113ee5b403e474...,0924243001 0924243002 0918522001 0923758001 08...
1371976,ffffcd5046a6143d29a04fb8c424ce494a76e5cdf4fab5...,0924243001 0924243002 0918522001 0923758001 08...
1371977,ffffcf35913a0bee60e8741cb2b4e78b8a98ee5ff2e6a1...,0924243001 0924243002 0918522001 0923758001 07...
1371978,ffffd7744cebcf3aca44ae7049d2a94b87074c3d4ffe38...,0924243001 0924243002 0918522001 0923758001 08...
