Radek posted about this [here](https://www.kaggle.com/competitions/h-and-m-personalized-fashion-recommendations/discussion/309220), and linked to a GitHub repo with the code.

I just transferred that code here to Kaggle notebooks, that's all.

In [1]:
import numpy as np

def apk(actual, predicted, k=10):
    """
    Computes the average precision at k.

    This function computes the average prescision at k between two lists of
    items.

    Parameters
    ----------
    actual : list
             A list of elements that are to be predicted (order doesn't matter)
    predicted : list
                A list of predicted elements (order does matter)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The average precision at k over the input lists

    """
    if len(predicted)>k:
        predicted = predicted[:k]

    score = 0.0
    num_hits = 0.0

    for i,p in enumerate(predicted):
        if p in actual and p not in predicted[:i]:
            num_hits += 1.0
            score += num_hits / (i+1.0)

    if not actual:
        return 0.0

    return score / min(len(actual), k)

def mapk(actual, predicted, k=10):
    """
    Computes the mean average precision at k.

    This function computes the mean average prescision at k between two lists
    of lists of items.

    Parameters
    ----------
    actual : list
             A list of lists of elements that are to be predicted 
             (order doesn't matter in the lists)
    predicted : list
                A list of lists of predicted elements
                (order matters in the lists)
    k : int, optional
        The maximum number of predicted elements

    Returns
    -------
    score : double
            The mean average precision at k over the input lists

    """
    return np.mean([apk(a,p,k) for a,p in zip(actual, predicted)])

In [2]:
from sklearn.base import BaseEstimator, TransformerMixin
import numpy as np

# https://www.kaggle.com/c/h-and-m-personalized-fashion-recommendations/discussion/308635
def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

class Categorize(BaseEstimator, TransformerMixin):
    def __init__(self, min_examples=0):
        self.min_examples = min_examples
        self.categories = []
        
    def fit(self, X):
        for i in range(X.shape[1]):
            vc = X.iloc[:, i].value_counts()
            self.categories.append(vc[vc > self.min_examples].index.tolist())
        return self

    def transform(self, X):
        data = {X.columns[i]: pd.Categorical(X.iloc[:, i], categories=self.categories[i]).codes for i in range(X.shape[1])}
        return pd.DataFrame(data=data)


def calculate_apk(list_of_preds, list_of_gts):
    # for fast validation this can be changed to operate on dicts of {'cust_id_int': [art_id_int, ...]}
    # using 'data/val_week_purchases_by_cust.pkl'
    apks = []
    for preds, gt in zip(list_of_preds, list_of_gts):
        apks.append(apk(gt, preds, k=12))
    return np.mean(apks)

def eval_sub(sub_csv, skip_cust_with_no_purchases=True):
    sub=pd.read_csv(sub_csv)
    validation_set=pd.read_parquet('data/validation_ground_truth.parquet')

    apks = []

    no_purchases_pattern = []
    for pred, gt in zip(sub.prediction.str.split(), validation_set.prediction.str.split()):
        if skip_cust_with_no_purchases and (gt == no_purchases_pattern): continue
        apks.append(apk(gt, pred, k=12))
    return np.mean(apks)

In [3]:
import pandas as pd

In [5]:
%%time

transactions = pd.read_parquet('data/parquet/transactions_train.parquet')
customers = pd.read_parquet('data/parquet/customers.parquet')
articles = pd.read_parquet('data/parquet/articles.parquet')

# sample = 0.05
# transactions = pd.read_parquet(f'data/transactions_train_sample_{sample}.parquet')
# customers = pd.read_parquet(f'data/customers_sample_{sample}.parquet')
# articles = pd.read_parquet(f'data/articles_train_sample_{sample}.parquet')

CPU times: total: 4.08 s
Wall time: 1.97 s


In [6]:
test_week = transactions.week.max() + 1
transactions = transactions[transactions.week > transactions.week.max() - 10]

# Generating candidates

### Last purchase candidates

In [7]:
%%time

c2weeks = transactions.groupby('customer_id')['week'].unique()

CPU times: total: 22.6 s
Wall time: 22.7 s


In [8]:
transactions.groupby('week')['t_dat'].agg(['min', 'max'])

Unnamed: 0_level_0,min,max
week,Unnamed: 1_level_1,Unnamed: 2_level_1
95,2020-07-15,2020-07-21
96,2020-07-22,2020-07-28
97,2020-07-29,2020-08-04
98,2020-08-05,2020-08-11
99,2020-08-12,2020-08-18
100,2020-08-19,2020-08-25
101,2020-08-26,2020-09-01
102,2020-09-02,2020-09-08
103,2020-09-09,2020-09-15
104,2020-09-16,2020-09-22


In [9]:
c2weeks

customer_id
28847241659200          [95, 96, 101, 102]
41318098387474                        [98]
116809474287335                 [101, 103]
200292573348128          [95, 96, 99, 102]
248294615847351                       [96]
                               ...        
18446624797007271432                  [95]
18446630855572834764                 [103]
18446662237889060501                 [100]
18446705133201055310                 [102]
18446737527580148316                 [104]
Name: week, Length: 437365, dtype: object

In [10]:
%%time

c2weeks2shifted_weeks = {}

for c_id, weeks in c2weeks.items():
    c2weeks2shifted_weeks[c_id] = {}
    for i in range(weeks.shape[0]-1):
        c2weeks2shifted_weeks[c_id][weeks[i]] = weeks[i+1]
    c2weeks2shifted_weeks[c_id][weeks[-1]] = test_week

CPU times: total: 812 ms
Wall time: 823 ms


In [11]:
c2weeks2shifted_weeks[28847241659200]

{95: 96, 96: 101, 101: 102, 102: 105}

In [12]:
candidates_last_purchase = transactions.copy()

In [13]:
%%time

weeks = []
for i, (c_id, week) in enumerate(zip(transactions['customer_id'], transactions['week'])):
    weeks.append(c2weeks2shifted_weeks[c_id][week])
    
candidates_last_purchase.week=weeks

CPU times: total: 24.8 s
Wall time: 25 s


In [14]:
candidates_last_purchase[candidates_last_purchase['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,96
29030504,2020-07-15,272412481300040,816592008,0.016932,1,96
29030505,2020-07-15,272412481300040,621381021,0.033881,1,96
29030506,2020-07-15,272412481300040,817477003,0.025407,1,96
29030507,2020-07-15,272412481300040,899088002,0.025407,1,96
29319533,2020-07-22,272412481300040,885077001,0.008458,1,103
29410772,2020-07-24,272412481300040,850176003,0.029034,2,103
29410773,2020-07-24,272412481300040,875803001,0.064559,2,103
29410774,2020-07-24,272412481300040,892970003,0.020966,2,103
29410775,2020-07-24,272412481300040,854619003,0.020966,2,103


In [15]:
transactions[transactions['customer_id']==272412481300040]

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95
29319533,2020-07-22,272412481300040,885077001,0.008458,1,96
29410772,2020-07-24,272412481300040,850176003,0.029034,2,96
29410773,2020-07-24,272412481300040,875803001,0.064559,2,96
29410774,2020-07-24,272412481300040,892970003,0.020966,2,96
29410775,2020-07-24,272412481300040,854619003,0.020966,2,96


### Bestsellers candidates

In [16]:
mean_price = transactions \
    .groupby(['week', 'article_id'])['price'].mean()

In [17]:
mean_price

week  article_id
95    108775015     0.004729
      108775044     0.008458
      110065001     0.006085
      110065002     0.006085
      111565001     0.004288
                      ...   
104   952267001     0.013732
      952938001     0.048651
      953450001     0.016932
      953763001     0.021885
      956217002     0.059068
Name: price, Length: 196880, dtype: float32

In [18]:
sales = transactions \
    .groupby('week')['article_id'].value_counts() \
    .groupby('week').rank(method='dense', ascending=False) \
    .groupby('week').head(12).rename('bestseller_rank').astype('int8')

In [19]:
sales

week  article_id
95    760084003      1
      866731001      2
      600886001      3
      706016001      4
      372860002      5
                    ..
104   915529003      8
      915529005      9
      448509014     10
      762846027     11
      714790020     12
Name: bestseller_rank, Length: 120, dtype: int8

In [20]:
sales.loc[95]

article_id
760084003     1
866731001     2
600886001     3
706016001     4
372860002     5
610776002     6
877278002     7
547780003     8
817354001     9
827968001    10
866731003    11
866383006    12
Name: bestseller_rank, dtype: int8

In [21]:
bestsellers_previous_week = pd.merge(sales, mean_price, on=['week', 'article_id']).reset_index()
bestsellers_previous_week.week += 1

In [22]:
bestsellers_previous_week.pipe(lambda df: df[df['week']==96])

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193
5,96,610776002,6,0.008318
6,96,877278002,7,0.025036
7,96,547780003,8,0.024814
8,96,817354001,9,0.021913
9,96,827968001,10,0.016436


In [23]:
unique_transactions = transactions \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [24]:
unique_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
29030503,2020-07-15,272412481300040,1,95
29064059,2020-07-15,1456826891333599,1,95
29067103,2020-07-15,2133687643102426,2,95
29027487,2020-07-15,6010692573790711,1,95
29046403,2020-07-15,6171059100114610,2,95
...,...,...,...,...
31760188,2020-09-22,18435221511488011015,1,104
31782234,2020-09-22,18436859303155335645,1,104
31787251,2020-09-22,18437941771381362708,2,104
31776022,2020-09-22,18438270306572912089,1,104


In [25]:
transactions.drop_duplicates(['week', 'customer_id'])

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95
29064059,2020-07-15,1456826891333599,888294001,0.013542,1,95
29067103,2020-07-15,2133687643102426,843642001,0.042356,2,95
29027487,2020-07-15,6010692573790711,857812010,0.039661,1,95
29046403,2020-07-15,6171059100114610,815447007,0.006763,2,95
...,...,...,...,...,...,...
31760188,2020-09-22,18435221511488011015,573085055,0.033881,1,104
31782234,2020-09-22,18436859303155335645,801447001,0.030492,1,104
31787251,2020-09-22,18437941771381362708,907188001,0.050831,2,104
31776022,2020-09-22,18438270306572912089,751471043,0.033881,1,104


In [26]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_previous_week,
    on='week',
)

In [27]:
test_set_transactions = unique_transactions.drop_duplicates('customer_id').reset_index(drop=True)
test_set_transactions.week = test_week

In [28]:
test_set_transactions

Unnamed: 0,t_dat,customer_id,sales_channel_id,week
0,2020-07-15,272412481300040,1,105
1,2020-07-15,1456826891333599,1,105
2,2020-07-15,2133687643102426,2,105
3,2020-07-15,6010692573790711,1,105
4,2020-07-15,6171059100114610,2,105
...,...,...,...,...
437360,2020-09-22,18410229429441241008,2,105
437361,2020-09-22,18417769707947924979,2,105
437362,2020-09-22,18418054986721795659,2,105
437363,2020-09-22,18421175435799911749,2,105


In [29]:
candidates_bestsellers_test_week = pd.merge(
    test_set_transactions,
    bestsellers_previous_week,
    on='week'
)

In [30]:
candidates_bestsellers = pd.concat([candidates_bestsellers, candidates_bestsellers_test_week])
candidates_bestsellers.drop(columns='bestseller_rank', inplace=True)

In [31]:
candidates_bestsellers

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.022980
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193
...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,1,105,915529003,0.033439
5248376,2020-09-22,18438270306572912089,1,105,915529005,0.033417
5248377,2020-09-22,18438270306572912089,1,105,448509014,0.041630
5248378,2020-09-22,18438270306572912089,1,105,762846027,0.025005


# Age Group Candidates

In [32]:
t_data = pd.merge(transactions, customers, on='customer_id', how='left')

In [33]:
t_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1,1,0,1,48,333369
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1,1,0,1,48,333369
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1,1,0,1,48,333369
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1,1,0,1,48,333369
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1,1,0,1,48,333369


In [34]:
t_data['age_group'] = pd.cut(t_data['age'], bins=[0, 25, 40, 60, 100], labels=[0, 1, 2, 3])

In [35]:
t_data.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,FN,Active,club_member_status,fashion_news_frequency,age,postal_code,age_group
0,2020-07-15,272412481300040,778064028,0.008458,1,95,1,1,0,1,48,333369,2
1,2020-07-15,272412481300040,816592008,0.016932,1,95,1,1,0,1,48,333369,2
2,2020-07-15,272412481300040,621381021,0.033881,1,95,1,1,0,1,48,333369,2
3,2020-07-15,272412481300040,817477003,0.025407,1,95,1,1,0,1,48,333369,2
4,2020-07-15,272412481300040,899088002,0.025407,1,95,1,1,0,1,48,333369,2


In [None]:
unique_t_data = t_data \
    .groupby(['week', 'customer_id']) \
    .head(1) \
    .drop(columns=['article_id', 'price']) \
    .copy()

In [36]:
sales_by_age_group = t_data \
    .groupby('age_group')['article_id'].value_counts() \
    .groupby('age_group').rank(method='dense', ascending=False) \
    .groupby('age_group').head(12).rename('bestseller_agegroup_rank').astype('int8')

  .groupby('age_group')['article_id'].value_counts() \
  .groupby('age_group').rank(method='dense', ascending=False) \
  .groupby('age_group').head(12).rename('bestseller_agegroup_rank').astype('int8')


In [37]:
sales_by_age_group

age_group  article_id
0          706016001      1
           448509014      2
           915526001      3
           751471001      4
           916468003      5
           918292001      6
           372860002      7
           706016003      8
           866731001      9
           760084003     10
           850917001     11
           759871002     12
1          918292001      1
           706016001      2
           866731001      3
           610776002      4
           730683050      5
           751471001      6
           827968001      7
           768912001      8
           717490064      9
           912204001     10
           706016003     11
           841383002     12
2          751471001      1
           751471043      2
           706016001      3
           783346001      4
           896152002      5
           863595006      6
           850917001      7
           372860002      8
           678942001      9
           915529003     10
           610776002     1

In [None]:
candidates_bestsellers = pd.merge(
    unique_transactions,
    bestsellers_agegroup_rank,
    on='age_group',
)

In [82]:
candidates_bestsellers.head()

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price
0,2020-07-22,200292573348128,2,96,760084003,0.025094
1,2020-07-22,200292573348128,2,96,866731001,0.024919
2,2020-07-22,200292573348128,2,96,600886001,0.02298
3,2020-07-22,200292573348128,2,96,706016001,0.033197
4,2020-07-22,200292573348128,2,96,372860002,0.013193


# Combining transactions and candidates / negative examples

In [38]:
transactions['purchased'] = 1

In [39]:
data = pd.concat([transactions, candidates_last_purchase, candidates_bestsellers])
data.purchased.fillna(0, inplace=True)

In [40]:
data

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,week,purchased
29030503,2020-07-15,272412481300040,778064028,0.008458,1,95,1.0
29030504,2020-07-15,272412481300040,816592008,0.016932,1,95,1.0
29030505,2020-07-15,272412481300040,621381021,0.033881,1,95,1.0
29030506,2020-07-15,272412481300040,817477003,0.025407,1,95,1.0
29030507,2020-07-15,272412481300040,899088002,0.025407,1,95,1.0
...,...,...,...,...,...,...,...
5248375,2020-09-22,18438270306572912089,915529003,0.033439,1,105,0.0
5248376,2020-09-22,18438270306572912089,915529005,0.033417,1,105,0.0
5248377,2020-09-22,18438270306572912089,448509014,0.041630,1,105,0.0
5248378,2020-09-22,18438270306572912089,762846027,0.025005,1,105,0.0


In [41]:
data.drop_duplicates(['customer_id', 'article_id', 'week'], inplace=True)

In [42]:
data.purchased.mean()

0.13607582749165664

### Add bestseller information

In [43]:
data = pd.merge(
    data,
    bestsellers_previous_week[['week', 'article_id', 'bestseller_rank']],
    on=['week', 'article_id'],
    how='left'
)

In [83]:
bestsellers_previous_week.head()

Unnamed: 0,week,article_id,bestseller_rank,price
0,96,760084003,1,0.025094
1,96,866731001,2,0.024919
2,96,600886001,3,0.02298
3,96,706016001,4,0.033197
4,96,372860002,5,0.013193


In [44]:
data = data[data.week != data.week.min()]
data.bestseller_rank.fillna(999, inplace=True)

In [45]:
agg_price_spent = transactions.groupby('customer_id')['price'].agg(['mean', 'max']).reset_index()
agg_price_spent.rename(columns={'mean': 'avg_price_spent', 'max': 'max_price_spent'}, inplace=True)

In [46]:
agg_price_spent

Unnamed: 0,customer_id,avg_price_spent,max_price_spent
0,28847241659200,0.046085,0.128797
1,41318098387474,0.033881,0.033881
2,116809474287335,0.029644,0.033881
3,200292573348128,0.029602,0.064305
4,248294615847351,0.022017,0.031458
...,...,...,...
437360,18446624797007271432,0.013119,0.016932
437361,18446630855572834764,0.061000,0.067780
437362,18446662237889060501,0.033881,0.033881
437363,18446705133201055310,0.050831,0.050831


In [47]:
transactions_with_age = pd.merge(transactions, customers[['customer_id', 'age']], on='customer_id', how='left')
avg_purchaser_age = transactions_with_age.groupby('article_id')['age'].mean().reset_index()
avg_purchaser_age.rename(columns={'age': 'avg_purchaser_age'}, inplace=True)

In [48]:
avg_purchaser_age

Unnamed: 0,article_id,avg_purchaser_age
0,108775015,40.333333
1,108775044,38.956522
2,110065001,36.500000
3,110065002,27.500000
4,110065011,27.500000
...,...,...
38326,952267001,34.243902
38327,952938001,36.111111
38328,953450001,31.764706
38329,953763001,35.457143


In [49]:
customers = pd.merge(customers, agg_price_spent, on='customer_id', how='left')
articles = pd.merge(articles, avg_purchaser_age, on='article_id', how='left')

In [50]:
articles

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc,avg_purchaser_age
0,108775015,108775,12855,253,9,0,1010016,0,9,0,...,0,0,1,0,16,30,1002,2,8834,40.333333
1,108775044,108775,12855,253,9,0,1010016,0,10,2,...,0,0,1,0,16,30,1002,2,8834,38.956522
2,108775051,108775,44846,253,9,0,1010017,3,11,11,...,0,0,1,0,16,30,1002,2,8834,
3,110065001,110065,8159,306,13,4,1010016,0,9,0,...,7,7,1,0,61,5,1017,4,8243,36.500000
4,110065002,110065,8159,306,13,4,1010016,0,10,2,...,7,7,1,0,61,5,1017,4,8243,27.500000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
105537,953450001,953450,24043,302,14,7,1010014,6,9,0,...,2,2,3,3,26,16,1021,13,28026,31.764706
105538,953763001,953763,24027,253,9,0,1010016,0,9,0,...,0,0,1,0,2,15,1005,0,28025,35.457143
105539,956217002,956217,24040,265,1,2,1010016,0,9,0,...,0,0,1,0,18,12,1005,0,28024,48.428571
105540,957375001,957375,24041,72,48,3,1010016,0,9,0,...,1,1,2,2,52,25,1019,1,28023,


In [51]:
articles.avg_purchaser_age.fillna(-1, inplace=True)

In [52]:
data = pd.merge(data, articles, on='article_id', how='left')
data = pd.merge(data, customers, on='customer_id', how='left')

In [53]:
articles['index_code'].drop_duplicates()

0       0
3       7
11      5
16      2
28      6
40      9
69      3
78      1
81      4
1367    8
Name: index_code, dtype: int8

In [54]:
pivot_table = pd.pivot_table(
    data,
    index='customer_id',
    columns='index_code',
    values='article_id',
    aggfunc='count',
    fill_value=0
)

pivot_table['total_purchases'] = pivot_table.sum(axis=1)

# Calculate the percentage of purchases in each category for each customer
pivot_table['percentage_women_purchases'] = ((pivot_table[0] + pivot_table[7] + pivot_table[6]) / pivot_table['total_purchases'])
pivot_table['percentage_children_purchases'] = ((pivot_table[5] + pivot_table[3] + pivot_table[4] + pivot_table[8]) / pivot_table['total_purchases'])
pivot_table['percentage_men_purchases'] = (pivot_table[2] / pivot_table['total_purchases'])

In [55]:
pivot_table.reset_index(inplace=True)
pivot_table

index_code,customer_id,0,1,2,3,4,5,6,7,8,9,total_purchases,percentage_women_purchases,percentage_children_purchases,percentage_men_purchases
0,28847241659200,35,11,0,0,0,0,0,3,0,7,56,0.678571,0.0,0.000000
1,41318098387474,18,5,0,0,0,0,0,2,0,1,26,0.769231,0.0,0.000000
2,116809474287335,30,13,0,0,0,0,0,0,0,4,47,0.638298,0.0,0.000000
3,200292573348128,39,13,1,0,0,0,0,17,0,4,74,0.756757,0.0,0.013514
4,248294615847351,13,5,8,0,0,0,0,3,0,3,32,0.500000,0.0,0.250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437360,18446624797007271432,11,2,0,0,0,0,2,0,0,1,16,0.812500,0.0,0.000000
437361,18446630855572834764,23,5,0,0,0,0,0,0,0,2,30,0.766667,0.0,0.000000
437362,18446662237889060501,18,7,0,0,0,0,0,1,0,2,28,0.678571,0.0,0.000000
437363,18446705133201055310,20,4,0,0,0,0,0,0,0,2,26,0.769231,0.0,0.000000


In [56]:
data = pd.merge(data, pivot_table[['customer_id', 'total_purchases', 'percentage_women_purchases', 'percentage_children_purchases', 'percentage_men_purchases']], on='customer_id', how='left')

In [57]:
data.sort_values(['week', 'customer_id'], inplace=True)
data.reset_index(drop=True, inplace=True)

In [58]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id']).copy()

In [59]:
train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [60]:
columns_to_use = ['article_id', 'product_type_no', 'graphical_appearance_no', 'colour_group_code', 'perceived_colour_value_id',
'perceived_colour_master_id', 'department_no', 'index_code',
'index_group_no', 'section_no', 'garment_group_no', 'FN', 'Active',
'club_member_status', 'fashion_news_frequency', 'age', 'postal_code', 'bestseller_rank']

In [61]:
%%time

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

CPU times: total: 703 ms
Wall time: 693 ms


# Model training

In [62]:
from lightgbm.sklearn import LGBMRanker

In [63]:
ranker = LGBMRanker(
    objective="lambdarank",
    metric="ndcg",
    boosting_type="dart",
    n_estimators=1,
    importance_type='gain',
    verbose=10
)

In [64]:
%%time

ranker = ranker.fit(
    train_X,
    train_y,
    group=train_baskets,
)

[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.848850
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.153113
[LightGBM] [Debug] init for col-wise cost 0.317892 seconds, init for row-wise cost 0.712228 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.503419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Dense Multi-Val Bin
[LightGBM] [Info] Total Bins 1079
[LightGBM] [Info] Number of data points in the train set: 11381612, number of used features: 18
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 12
CPU times: total: 29.3 s
Wall time: 6.6 s


In [65]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

bestseller_rank 0.9989805711820897
age 0.00024136039423249762
article_id 0.0001716082873112681
garment_group_no 0.000144767354190598
department_no 9.631753928857237e-05
product_type_no 9.014783466245737e-05
section_no 7.07609526662605e-05
postal_code 6.79219757232404e-05
club_member_status 6.519780365736126e-05
colour_group_code 5.3587542243445946e-05
perceived_colour_value_id 1.7759133934558557e-05
fashion_news_frequency 0.0
Active 0.0
FN 0.0
index_code 0.0
perceived_colour_master_id 0.0
graphical_appearance_no 0.0
index_group_no 0.0


# Calculate predictions

In [66]:
%time

test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

bestsellers_last_week = \
    bestsellers_previous_week[bestsellers_previous_week.week == bestsellers_previous_week.week.max()]['article_id'].tolist()

CPU times: total: 0 ns
Wall time: 0 ns


# Create submission

In [68]:
sub = pd.read_csv('data/original/sample_submission.csv')

In [69]:
%%time
preds = []
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    preds.append(pred[:12])

CPU times: total: 4.42 s
Wall time: 4.45 s


In [70]:
preds = [' '.join(['0' + str(p) for p in ps]) for ps in preds]
sub.prediction = preds

In [71]:
sub_name = 'baseline_model'
sub.to_csv(f'{sub_name}.csv.gz', index=False)

# Evaluate candidate generation methods

In [72]:
%%time
final_preds = {}
for c_id in customer_hex_id_to_int(sub.customer_id):
    pred = c_id2predicted_article_ids.get(c_id, [])
    pred = pred + bestsellers_last_week
    final_preds[c_id] = pred[:12]

CPU times: total: 5.05 s
Wall time: 5.12 s


In [73]:
def precision_at_K(true_candidates, predicted_candidates, K):
    relevant_candidates = set(true_candidates)
    top_K_predicted = predicted_candidates[:K]
    relevant_predicted = [c for c in top_K_predicted if c in relevant_candidates]
    return len(relevant_predicted) / K

In [74]:
def precision(true_candidates, predicted_candidates, K):
    relevant_candidates = set(true_candidates)
    top_K_predicted = predicted_candidates
    relevant_predicted = [c for c in top_K_predicted if c in relevant_candidates]
    return len(relevant_predicted) / K

In [75]:
grouped_candidates = candidates_bestsellers.groupby('customer_id')['article_id'].agg(list).reset_index()

In [76]:
grouped_candidates.head()

Unnamed: 0,customer_id,article_id
0,28847241659200,"[760084003, 866731001, 600886001, 706016001, 3..."
1,41318098387474,"[759871002, 464297007, 933838002, 861558002, 9..."
2,116809474287335,"[916468003, 896152003, 896152002, 751471001, 7..."
3,200292573348128,"[760084003, 866731001, 600886001, 706016001, 3..."
4,248294615847351,"[760084003, 866731001, 600886001, 706016001, 3..."


In [77]:
candidates_bestsellers[candidates_bestsellers['customer_id'] == 11246327431398957306]

Unnamed: 0,t_dat,customer_id,sales_channel_id,week,article_id,price


In [78]:
grouped_candidates[grouped_candidates['customer_id'] == 11246327431398957306]

Unnamed: 0,customer_id,article_id


In [79]:
final_preds[11246327431398957306]

[924243001,
 924243002,
 918522001,
 923758001,
 866731001,
 909370001,
 751471001,
 915529003,
 915529005,
 448509014,
 762846027,
 714790020]

In [80]:
grouped_dict = dict(zip(grouped_candidates['customer_id'], grouped_candidates['article_id']))
for key, item in grouped_dict.items():
    print(key, item)
    break

28847241659200 [760084003, 866731001, 600886001, 706016001, 372860002, 610776002, 877278002, 547780003, 817354001, 827968001, 866731003, 866383006, 916468003, 896152003, 896152002, 751471001, 706016001, 918292001, 921906003, 751471043, 706016003, 918292004, 915526002, 920610001, 898694001, 933706001, 751471001, 915526001, 915529003, 706016001, 918292001, 751471043, 915526002, 915529001, 862970001, 863595006, 924243001, 924243002, 918522001, 923758001, 866731001, 909370001, 751471001, 915529003, 915529005, 448509014, 762846027, 714790020]


In [81]:
from tqdm import tqdm
from collections import defaultdict

# Calculate P@12 for each user and each candidate generation method
results = {}

for method, generated_candidates in [('Recently Bought', candidates_last_purchase), ('Bestsellers', candidates_bestsellers), ('Bestsellers test week', candidates_bestsellers_test_week)]:
    grouped_candidates = generated_candidates.groupby('customer_id')['article_id'].agg(list).reset_index()
    grouped_dict = defaultdict(list, zip(grouped_candidates['customer_id'], grouped_candidates['article_id']))
    method_results = []
    for user, top_12 in tqdm(final_preds.items()):
        precision_at_12 = precision(top_12, grouped_dict[user], K=12)
        method_results.append(precision_at_12)
    results[method] = method_results

# Calculate the average P@12 for each method
average_precision = {method: sum(method_results) / len(method_results) for method, method_results in results.items()}

# Print the results
for method, avg_precision in average_precision.items():
    print(f"Method: {method}, Average P@12: {avg_precision}")

100%|██████████| 1371980/1371980 [00:04<00:00, 334936.23it/s]
100%|██████████| 1371980/1371980 [00:05<00:00, 234857.48it/s]
100%|██████████| 1371980/1371980 [00:04<00:00, 334724.61it/s]

Method: Recently Bought, Average P@12: 0.09223439846057346
Method: Bestsellers, Average P@12: 0.29828362415395177
Method: Bestsellers test week, Average P@12: 0.23812616316080873



