Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average precision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists. 0.0 is returned
            when `actual` is empty or when `k` is not positive.

    """
    # Use len() instead of truthiness so array-like inputs (e.g. numpy arrays)
    # do not raise "truth value is ambiguous"; also guards the division below
    # against a zero denominator when k == 0.
    if len(actual) == 0 or k <= 0:
        return 0.0

    # Only the first k predictions count.
    predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted):
        # A repeated prediction is only credited the first time it appears.
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    Pairs up the i-th entry of `actual` with the i-th entry of `predicted`,
    scores every pair with `apk`, and averages the scores.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    per_pair_scores = []
    for truth, ranked in zip(actual, predicted):
        per_pair_scores.append(apk(truth, ranked, k))
    return np.mean(per_pair_scores)

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    """Convert a Series of hex customer-id strings to integers.

    Only the last 16 characters of each string are kept before conversion
    with `hex_id_to_int`.
    """
    last_16_chars = series.str[-16:]
    return last_16_chars.apply(hex_id_to_int)

def hex_id_to_int(str):
    """Parse the last 16 characters of `str` as a base-16 integer."""
    # NOTE: the parameter name shadows the builtin `str`; it is kept because
    # it is part of the function's signature.
    tail = str[-16:]
    return int(tail, 16)

def article_id_str_to_int(series):
    """Cast a Series of article ids to 32-bit integers."""
    as_int32 = series.astype('int32')
    return as_int32

def article_id_int_to_str(series):
    """Render article ids as strings with a single leading '0' prepended."""
    as_text = series.astype('str')
    return '0' + as_text

class Categorize(BaseEstimator, TransformerMixin):
    """Encode every column of a DataFrame as integer category codes.

    `fit` learns, per column position, the list of values that occur strictly
    more than `min_examples` times. `transform` maps each column onto those
    lists with `pd.Categorical(...).codes`, so any value that was not kept at
    fit time is encoded as -1.

    Parameters
    ----------
    min_examples : int, optional
        A value is kept as a category only if its count is greater than this.
    """

    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        # One list of categories per column, filled in by fit().
        self.categories = []

    def fit(self, X, y=None):
        """Learn the categories of each column of `X`.

        `y` is accepted (and ignored) so the transformer can be called as
        `fit(X, y)` / `fit_transform(X, y)`.
        """
        # Build a fresh list so that calling fit() again replaces the learned
        # categories instead of appending to those from a previous fit.
        categories = []
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            categories.append(vc[vc > self.min_examples].index.tolist())
        self.categories = categories
        return self

    def transform(self, X):
        """Return a new DataFrame of integer codes, one column per column of `X`.

        Columns are matched to the fitted categories by position. The result
        is built from plain code arrays, so it carries a default index rather
        than the index of `X`.
        """
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    """Mean of `apk(..., k=12)` over paired prediction / ground-truth lists."""
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    per_customer_scores = [
        apk(truth, ranked, k=12)
        for ranked, truth in zip(list_of_preds, list_of_gts)
    ]
    return np.mean(per_customer_scores)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    """Score a submission CSV against the stored validation ground truth.

    The `prediction` column of the submission and of
    'data/validation_ground_truth.parquet' are split on whitespace and paired
    row by row; the result is the mean `apk` at k=12. Rows whose ground truth
    splits to an empty list are skipped when `skip_cust_with_no_purchases`
    is true.
    """
    submission = pd.read_csv(sub_csv)
    ground_truth = pd.read_parquet('data/validation_ground_truth.parquet')

    predicted_lists = submission.prediction.str.split()
    purchased_lists = ground_truth.prediction.str.split()

    scores = []
    for predicted, purchased in zip(predicted_lists, purchased_lists):
        if skip_cust_with_no_purchases and purchased == []:
            continue
        scores.append(apk(purchased, predicted, k=12))
    return np.mean(scores)

In [3]:
%%time
import os
# NOTE(review): machine-specific absolute path; every relative 'data/...' path
# below is resolved against this directory.
os.chdir("/Users/karol/Desktop/Antwerp/ai_project")

# Load the three input tables used by the rest of the notebook.
transactions = pd.read_parquet('data/transactions_train.parquet')
customers = pd.read_parquet('data/customers.parquet')
articles = pd.read_parquet('data/articles.parquet')

# Alternative (disabled): load down-sampled versions of the same tables.
# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: user 2.19 s, sys: 1.05 s, total: 3.24 s
Wall time: 606 ms


In [4]:
# The week to predict is the one right after the last week in the data.
test_week = transactions.week.max() + 1
# Keep only the 10 most recent weeks (week > max - 10).
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [5]:
%%time

# For every customer, the distinct weeks in which they made a purchase
# (a Series indexed by customer_id whose values are arrays of weeks).
c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: user 7.46 s, sys: 60.9 ms, total: 7.52 s
Wall time: 7.53 s


In [6]:
# Sanity check: earliest and latest transaction date within each week.
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [7]:
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

In [8]:
%%time

# For each customer, map every purchase week to that customer's NEXT purchase
# week; the last purchase week maps to `test_week`.
c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    # `unique()` yields weeks in order of appearance; sort so that "next week"
    # is chronological even if the transactions are not ordered by date.
    weeks = sorted(weeks)
    c2weeks2shifted_weeks[c_id] = dict(zip(weeks[:-1], weeks[1:]))
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: user 289 ms, sys: 17.8 ms, total: 307 ms
Wall time: 307 ms


In [9]:
c2weeks2shifted_weeks[28847241659200]

{95: 96, 96: 101, 101: 102, 102: 105}

In [10]:
# Start the "last purchase" candidates as an independent copy of the
# transactions; its `week` column is overwritten in the next cell.
candidates_last_purchase = transactions.copy()

In [11]:
%%time

# Re-date every past purchase to the customer's next purchase week (or the
# test week), so it becomes a candidate for that later week. The lookup walks
# `transactions`, which has the same row order as its copy
# `candidates_last_purchase`.
weeks = [
    c2weeks2shifted_weeks[c_id][week]
    for c_id, week in zip(transactions['customer_id'], transactions['week'])
]

candidates_last_purchase['week'] = weeks

CPU times: user 3.19 s, sys: 29.3 ms, total: 3.22 s
Wall time: 3.22 s


In [12]:
transactions['customer_id']

17          18439897732908966680
218          8486166240428812738
219          8486166240428812738
220          8486166240428812738
221          8486166240428812738
                    ...         
31788319    10759513741427764195
31788320    10759513741427764195
31788321     5804240836826735721
31788322     5804240836826735721
31788323     5804240836826735721
Name: customer_id, Length: 2762872, dtype: uint64

In [13]:
candidates_last_purchase[candidates_last_purchase['customer_id']==28847241659200]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week
7984616,2020-07-18,28847241659200,762846001,0.025407,1,3,0.033151,1.0,1,571.0,71185.5,0,96
7984617,2020-07-18,28847241659200,829308001,0.033881,1,3,0.033151,2.0,21,2153.5,71185.5,0,96
7984618,2020-07-26,28847241659200,887770001,0.016932,1,3,0.033151,1.0,21,259.5,71185.5,0,101
7984619,2020-08-31,28847241659200,760084003,0.025407,1,3,0.033151,1.0,1,31.0,228998.0,0,102
7984620,2020-09-03,28847241659200,925246001,0.128797,2,3,0.033151,2.0,11,980.5,167231.0,1,105


In [14]:
transactions[transactions['customer_id']==28847241659200]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week
7984616,2020-07-18,28847241659200,762846001,0.025407,1,3,0.033151,1.0,1,571.0,71185.5,0,95
7984617,2020-07-18,28847241659200,829308001,0.033881,1,3,0.033151,2.0,21,2153.5,71185.5,0,95
7984618,2020-07-26,28847241659200,887770001,0.016932,1,3,0.033151,1.0,21,259.5,71185.5,0,96
7984619,2020-08-31,28847241659200,760084003,0.025407,1,3,0.033151,1.0,1,31.0,228998.0,0,101
7984620,2020-09-03,28847241659200,925246001,0.128797,2,3,0.033151,2.0,11,980.5,167231.0,1,102


### Bestsellers candidates

In [15]:
# Average selling price of each article within each week.
mean_price = (
    transactions
    .groupby(['week', 'article_id'])['price']
    .mean()
)

In [16]:
mean_price

week  article_id
95    108775015     0.004729
      108775044     0.008458
      110065001     0.006085
      110065002     0.006085
      111565001     0.004288
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 196880, dtype: float64

In [17]:
# Weekly bestseller ranks: count sales per article within each week, dense-rank
# the counts (highest count -> rank 1) and keep the first 12 rows of each week.
sales = (
    transactions
    .groupby('week')['article_id'].value_counts()
    .groupby('week').rank(method='dense', ascending=False)
    .groupby('week').head(12)
    .rename('bestseller_rank')
    .astype('int8')
)

In [18]:
sales

week  article_id
95    760084003      1
      866731001      2
      600886001      3
      706016001      4
      372860002      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 120, dtype: int8

In [19]:
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

In [20]:
# Attach the weekly mean price to each bestseller row, then move every row one
# week forward so that week W holds the bestsellers observed in week W - 1.
bestsellers_previous_week = pd.merge(
    sales,
    mean_price,
    on=['week', 'article_id'],
).reset_index()
bestsellers_previous_week['week'] = bestsellers_previous_week['week'] + 1

In [21]:
# Inspect the rows labelled week 96 (after the +1 shift applied to `week`).
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
5,96,610776002,6,0.008318
6,96,877278002,7,0.025036
7,96,547780003,8,0.024814
8,96,817354001,9,0.021913
9,96,827968001,10,0.016436


In [22]:
# One row per (week, customer): the first transaction of each pair, with the
# article-specific columns removed.
unique_transactions = (
    transactions
    .groupby(['week', 'customer_id'])
    .head(1)
    .drop(columns=['article_id', 'price'])
    .copy()
)

In [23]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week
17,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103
218,2020-07-18,8486166240428812738,2,3,0.024806,0.0,11,4025.0,149473.0,1,95
219,2020-08-06,8486166240428812738,2,3,0.024806,1.0,1,5620.0,63973.0,0,98
221,2020-08-14,8486166240428812738,2,3,0.024806,1.0,1,3758.0,63973.0,0,99
223,2020-08-27,8486166240428812738,2,3,0.024806,1.0,1,17136.0,63973.0,0,101
...,...,...,...,...,...,...,...,...,...,...,...
31788307,2020-09-22,986321080496670988,2,3,0.031218,2.0,19,381.0,29751.5,0,104
31788314,2020-09-22,6181448445921565452,2,3,0.033881,2.0,18,1404.5,167231.0,0,104
31788315,2020-09-22,15002336201696251112,2,3,0.022441,1.0,20,543.0,66900.5,0,104
31788319,2020-09-22,10759513741427764195,2,3,0.050831,2.0,27,757.0,124566.0,0,104


In [24]:
# Inspection only (result is not stored): first transaction per
# (week, customer_id) with all columns kept, for comparison with
# `unique_transactions`.
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week
17,2020-09-15,18439897732908966680,794321007,0.061000,2,3,0.039154,2.0,21,732.0,167231.0,1,103
218,2020-07-18,8486166240428812738,579302004,0.015237,2,3,0.024806,0.0,11,4025.0,149473.0,1,95
219,2020-08-06,8486166240428812738,857812002,0.016932,2,3,0.024806,1.0,1,5620.0,63973.0,0,98
221,2020-08-14,8486166240428812738,861173003,0.030492,2,3,0.024806,1.0,1,3758.0,63973.0,0,99
223,2020-08-27,8486166240428812738,808462002,0.022017,2,3,0.024806,1.0,1,17136.0,63973.0,0,101
...,...,...,...,...,...,...,...,...,...,...,...,...,...
31788307,2020-09-22,986321080496670988,867969008,0.033881,2,3,0.031218,2.0,19,381.0,29751.5,0,104
31788314,2020-09-22,6181448445921565452,915611003,0.033881,2,3,0.033881,2.0,18,1404.5,167231.0,0,104
31788315,2020-09-22,15002336201696251112,756322001,0.025407,2,3,0.022441,1.0,20,543.0,66900.5,0,104
31788319,2020-09-22,10759513741427764195,903647001,0.050831,2,3,0.050831,2.0,27,757.0,124566.0,0,104


In [25]:
# Pair every (week, customer) row with the bestseller rows of the same week.
candidates_bestsellers = unique_transactions.merge(bestsellers_previous_week, on='week')
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week,article_id,bestseller_rank,price
0,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,915526001,1,0.033127
1,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471043,2,0.033082
2,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471001,3,0.033257
3,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,706016001,4,0.033229
4,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,919365008,5,0.041837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8141191,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,610776002,8,0.008284
8141192,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,759871002,9,0.006345
8141193,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,895002002,10,0.012359
8141194,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,751471001,11,0.033274


In [26]:
# One row per customer (their first row in `unique_transactions`), relabelled
# as belonging to the test week.
test_set_transactions = (
    unique_transactions
    .drop_duplicates('customer_id')
    .reset_index(drop=True)
)
test_set_transactions['week'] = test_week

In [27]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week
0,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,105
1,2020-07-18,8486166240428812738,2,3,0.024806,0.0,11,4025.0,149473.0,1,105
2,2020-08-12,17685132065567316621,2,3,0.019907,1.0,21,166.5,136411.0,1,105
3,2020-07-15,14353855670801342424,1,3,0.023900,1.0,11,3402.0,45738.5,0,105
4,2020-07-25,1122616650571133356,2,3,0.026219,0.0,1,2347.5,115579.5,1,105
...,...,...,...,...,...,...,...,...,...,...,...
437360,2020-09-22,986321080496670988,2,3,0.031218,2.0,19,381.0,29751.5,0,105
437361,2020-09-22,6181448445921565452,2,3,0.033881,2.0,18,1404.5,167231.0,0,105
437362,2020-09-22,15002336201696251112,2,3,0.022441,1.0,20,543.0,66900.5,0,105
437363,2020-09-22,10759513741427764195,2,3,0.050831,2.0,27,757.0,124566.0,0,105


In [28]:
bestsellers_previous_week

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.022980
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
...,...,...,...,...
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.041630
118,105,762846027,11,0.025005


In [29]:
bestsellers_previous_week[bestsellers_previous_week.week==105]

Unnamed: 0,week,article_id,bestseller_rank,price
108,105,924243001,1,0.041535
109,105,924243002,2,0.041877
110,105,918522001,3,0.041435
111,105,923758001,4,0.033462
112,105,866731001,5,0.025015
113,105,909370001,6,0.03264
114,105,751471001,7,0.033423
115,105,915529003,8,0.033439
116,105,915529005,9,0.033417
117,105,448509014,10,0.04163


In [30]:
# Bestseller candidates for the test week: every customer row joined with the
# bestseller rows carrying the same (test) week.
candidates_bestsellers_test_week = test_set_transactions.merge(bestsellers_previous_week, on='week')

In [31]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week,article_id,bestseller_rank,price
0,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,915526001,1,0.033127
1,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471043,2,0.033082
2,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471001,3,0.033257
3,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,706016001,4,0.033229
4,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,919365008,5,0.041837
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8141191,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,610776002,8,0.008284
8141192,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,759871002,9,0.006345
8141193,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,895002002,10,0.012359
8141194,2020-08-04,6792248941142017031,2,3,0.009806,0.0,16,396.5,6171.5,1,97,751471001,11,0.033274


In [32]:
# Stack the train-week and test-week bestseller candidates and drop the rank
# column (it is joined back onto the combined data later).
candidates_bestsellers = pd.concat(
    [candidates_bestsellers, candidates_bestsellers_test_week]
).drop(columns='bestseller_rank')

In [33]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week,article_id,price
0,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,915526001,0.033127
1,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471043,0.033082
2,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,751471001,0.033257
3,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,706016001,0.033229
4,2020-09-15,18439897732908966680,2,3,0.039154,2.0,21,732.0,167231.0,1,103,919365008,0.041837
...,...,...,...,...,...,...,...,...,...,...,...,...,...
5248375,2020-09-22,5804240836826735721,2,3,0.033881,2.0,13,630.0,90456.0,0,105,915529003,0.033439
5248376,2020-09-22,5804240836826735721,2,3,0.033881,2.0,13,630.0,90456.0,0,105,915529005,0.033417
5248377,2020-09-22,5804240836826735721,2,3,0.033881,2.0,13,630.0,90456.0,0,105,448509014,0.041630
5248378,2020-09-22,5804240836826735721,2,3,0.033881,2.0,13,630.0,90456.0,0,105,762846027,0.025005


# Combining transactions and candidates / negative examples

In [34]:
# Real transactions are the positive examples.
transactions['purchased'] = 1

In [35]:
# Positives first, then the two candidate sets. Candidates have no `purchased`
# value after the concat, so they become the negative examples (0).
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
# Assign the filled column back instead of calling fillna(inplace=True) on
# `data.purchased`: the chained inplace form does not update `data` under
# pandas Copy-on-Write.
data['purchased'] = data['purchased'].fillna(0)

In [36]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week,purchased
17,2020-09-15,18439897732908966680,794321007,0.061000,2,3,0.039154,2.0,21,732.0,167231.0,1,103,1.0
218,2020-07-18,8486166240428812738,579302004,0.015237,2,3,0.024806,0.0,11,4025.0,149473.0,1,95,1.0
219,2020-08-06,8486166240428812738,857812002,0.016932,2,3,0.024806,1.0,1,5620.0,63973.0,0,98,1.0
220,2020-08-06,8486166240428812738,872575001,0.015237,2,3,0.024806,0.0,21,9501.5,63973.0,0,98,1.0
221,2020-08-14,8486166240428812738,861173003,0.030492,2,3,0.024806,1.0,1,3758.0,63973.0,0,99,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5248375,2020-09-22,5804240836826735721,915529003,0.033439,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248376,2020-09-22,5804240836826735721,915529005,0.033417,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248377,2020-09-22,5804240836826735721,448509014,0.041630,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248378,2020-09-22,5804240836826735721,762846027,0.025005,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0


In [37]:
# Keep one row per (customer, article, week). The first occurrence is kept, and
# real transactions were concatenated first, so a purchased row wins over a
# candidate row for the same key.
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [38]:
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [39]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,quarter,avg_price,price_group,age_index_interaction,top_articles,top_customers,price_diff,week,purchased
17,2020-09-15,18439897732908966680,794321007,0.061000,2,3,0.039154,2.0,21,732.0,167231.0,1,103,1.0
218,2020-07-18,8486166240428812738,579302004,0.015237,2,3,0.024806,0.0,11,4025.0,149473.0,1,95,1.0
219,2020-08-06,8486166240428812738,857812002,0.016932,2,3,0.024806,1.0,1,5620.0,63973.0,0,98,1.0
220,2020-08-06,8486166240428812738,872575001,0.015237,2,3,0.024806,0.0,21,9501.5,63973.0,0,98,1.0
221,2020-08-14,8486166240428812738,861173003,0.030492,2,3,0.024806,1.0,1,3758.0,63973.0,0,99,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5248375,2020-09-22,5804240836826735721,915529003,0.033439,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248376,2020-09-22,5804240836826735721,915529005,0.033417,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248377,2020-09-22,5804240836826735721,448509014,0.041630,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0
5248378,2020-09-22,5804240836826735721,762846027,0.025005,2,3,0.033881,2.0,13,630.0,90456.0,0,105,0.0


In [40]:
# Left-join the bestseller rank onto every row; rows whose (week, article_id)
# is not among that week's bestsellers get a missing rank.
data = data.merge(
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left',
)

In [41]:
# Non-bestsellers get the sentinel rank 999. The column is assigned back
# (rather than fillna(inplace=True) on `data.bestseller_rank`) because the
# chained inplace form does not update `data` under pandas Copy-on-Write.
# Filling before filtering also avoids writing into a filtered slice.
data['bestseller_rank'] = data['bestseller_rank'].fillna(999)
# Drop the earliest week: no bestseller rows exist for it, because the
# bestseller table was shifted forward by one week.
data = data[data.week != data.week.min()]

In [42]:
# Enrich each row with article attributes, then with customer attributes.
data = (
    data
    .merge(articles, on='article_id', how='left')
    .merge(customers, on='customer_id', how='left')
)

In [43]:
# Order rows by week, then customer, and renumber the index from 0. The
# ranker's group sizes computed later rely on this ordering.
data = data.sort_values(['week', 'customer_id']).reset_index(drop=True)

In [44]:
# Everything before the test week is training data; test-week rows are the
# candidates to score, de-duplicated per customer/article/sales channel.
train = data.loc[data['week'] != test_week]
test = (
    data.loc[data['week'] == test_week]
    .drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])
    .copy()
)

In [45]:
# Number of rows in each (week, customer_id) group, as a plain array. It is
# passed to the ranker as `group`, so it depends on the rows having been sorted
# by week and customer_id beforehand.
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [46]:
# Feature list from the original baseline.
# NOTE(review): several names here (e.g. 'product_type_no', 'age') are not in
# `train.columns` as displayed further down, and `columns_to_use` is reassigned
# in the following cells, so this list appears to be superseded.
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [59]:
# My preprocessing and pruning
columns_to_use = ['article_id', 'product_type_name', 'graphical_appearance_name',
                    'perceived_colour_master_name', 'department_name', 'index_name',
                    'section_name', 'FN', 'Active','club_member_status', 'fashion_news_frequency', 
                    'age_group', 'postal_code', 'bestseller_rank', 'garment_group_name']

In [46]:
# My preprocessing and pruning
columns_to_use = ['article_id', 'product_type_name', 'graphical_appearance_name',
                    'perceived_colour_master_name', 'department_name', 'index_name',
                    'section_name', 'FN', 'Active','club_member_status', 'fashion_news_frequency', 
                    'age_group', 'postal_code', 'bestseller_rank', 'garment_group_name', 'price_diff']

In [47]:
train.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_id',
       'quarter', 'avg_price', 'price_group', 'age_index_interaction',
       'top_articles', 'top_customers', 'price_diff', 'week', 'purchased',
       'bestseller_rank', 'product_type_name', 'graphical_appearance_name',
       'perceived_colour_master_name', 'department_name', 'index_name',
       'section_name', 'garment_group_name', 'quarter_of_peak_sales', 'FN',
       'Active', 'club_member_status', 'fashion_news_frequency', 'postal_code',
       'product_type_cluster', 'index_name_cluster', 'garment_cluster',
       'age_group', 'preferred_sales_channel'],
      dtype='object')

In [48]:
%%time

# Feature matrix and binary target for training.
train_X = train[columns_to_use]
train_y = train['purchased']

# Same feature columns for the test-week candidates.
test_X = test[columns_to_use]

CPU times: user 156 ms, sys: 368 ms, total: 523 ms
Wall time: 577 ms


# Model training

In [49]:
train_X

Unnamed: 0,article_id,product_type_name,graphical_appearance_name,perceived_colour_master_name,department_name,index_name,section_name,FN,Active,club_member_status,fashion_news_frequency,age_group,postal_code,bestseller_rank,garment_group_name,price_diff
0,887770001,0,0,0,121,0,39,1.0,1.0,0,1,0,191806,999.0,17,0
1,762846001,29,0,1,121,0,32,1.0,1.0,0,1,0,191806,999.0,17,0
2,829308001,4,0,0,112,5,41,1.0,1.0,0,1,0,191806,999.0,3,0
3,760084003,7,0,0,63,7,25,1.0,1.0,0,1,0,191806,1.0,11,0
4,866731001,4,0,0,112,5,41,1.0,1.0,0,1,0,191806,2.0,3,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11381607,918292001,4,3,0,112,5,41,1.0,1.0,0,1,3,156851,8.0,3,0
11381608,762846027,29,0,8,121,0,32,1.0,1.0,0,1,3,156851,9.0,17,0
11381609,809238005,5,3,3,105,7,25,1.0,1.0,0,1,3,156851,10.0,9,0
11381610,673677002,5,0,0,41,0,32,1.0,1.0,0,1,3,156851,11.0,9,0


In [50]:
from lightgbm.sklearn import LGBMRanker

In [51]:
# LambdaRank model evaluated with NDCG, using DART boosting.
# NOTE(review): n_estimators=1 trains a single tree; confirm this is intended
# and not a leftover from a quick run.
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',  # feature_importances_ reports gain
    verbose=10  # produces the [Debug]/[Info] training log
)

In [52]:
%%time

# Fit the ranker; `group` gives the number of consecutive rows of train_X that
# belong to each (week, customer) query.
ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.831347
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.284654
[LightGBM] [Debug] init for col-wise cost 0.178910 seconds, init for row-wise cost 0.395661 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.298032 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 964
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 16
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
CPU times: user 11 s, sys: 1.07 s, total: 12.1 s
Wall time: 3.47 s


In [53]:
# Print each feature with its share of the total importance, largest first.
total_importance = ranker.feature_importances_.sum()
for i in ranker.feature_importances_.argsort()[::-1]:
    share = ranker.feature_importances_[i] / total_importance
    print(columns_to_use[i], share)

bestseller_rank 0.9989394189246399
product_type_name 0.00044584821648950063
article_id 0.00022911648615408925
garment_group_name 9.846339520022175e-05
age_group 6.377758902162047e-05
postal_code 6.36668486542048e-05
department_name 5.716391008049366e-05
section_name 3.826837708014933e-05
club_member_status 3.50077301904203e-05
Active 1.4756841290570678e-05
price_diff 1.45116811988875e-05
fashion_news_frequency 0.0
FN 0.0
index_name 0.0
perceived_colour_master_name 0.0
graphical_appearance_name 0.0


# Calculate predictions

In [54]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 6.2 µs


# Create submission

In [55]:
# NOTE(review): repeats the machine-specific chdir from the data-loading cell.
os.chdir("/Users/karol/Desktop/Antwerp/ai_project")
# Submission template; its customer_id column is iterated when building the
# predictions and its prediction column is overwritten afterwards.
sub = pd.read_csv('data/sample_submission.csv')

In [56]:
%%time
# For every customer in the submission template: ranked candidates first, then
# last week's bestsellers as padding, truncated to 12 articles.
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, []) + bestsellers_last_week
    preds.append(pred[:12])

CPU times: user 2.23 s, sys: 68.2 ms, total: 2.3 s
Wall time: 2.3 s


In [57]:
# Turn each list of integer article ids into one space-separated string,
# prefixing every id with '0'.
preds = [
    ' '.join('0' + str(article_id) for article_id in customer_articles)
    for customer_articles in preds
]
sub.prediction = preds

In [58]:
sub_name = 'submission/feature_engineering_price_diff'
# Make sure the output directory exists so the write does not fail at the very
# end of the pipeline ('.' covers a name without a directory part).
os.makedirs(os.path.dirname(sub_name) or '.', exist_ok=True)
# The '.gz' suffix makes pandas write a gzip-compressed CSV.
sub.to_csv(f'{sub_name}.csv.gz', index=False)